# Machine Learning NLP notebook for identifying ML methods papers in life science journals
## 1. 

In [10]:
# 1. Preprocess DOME data & get insights from the literature data for use with developing ML lit triage model 

# 1A. DOME abstract and title

# 1. Import Necessary Libraries 
# # Ensures that all required NLP libraries (NLTK, SpaCy, Scikit-learn, etc.) are available.  
import pandas as pd
import numpy as np
import re
import nltk
import sklearn
import os
from os import listdir
from os.path import isfile, join
# import spacy - dependency issues, avoid for now
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#nltk.download('punkt')
#nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from gensim.models import Word2Vec

# Download required datasets
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
import scipy.sparse
import joblib

# 2. Load Data
# Load the DOME abstract and title data
# Read in names of PMC files in title and abstract folder + put into variable 
# Directory holding the PMC title + abstract text files (one file per paper).
title_abstract_dir = './DOME_Registry_PMC_Title_Abstract'
# os.listdir returns entries in arbitrary order, so sort them: document order
# (and therefore the row order of every matrix built from these documents) is
# then the same on every run. Entries that are not regular files (for example
# sub-directories) are skipped because open() cannot read them.
title_abstract_names = sorted(
    name
    for name in os.listdir(title_abstract_dir)
    if os.path.isfile(os.path.join(title_abstract_dir, name))
)
# Read in the text from each file and put into a new list
title_abstract_names_list = []
for file in title_abstract_names:
    # Explicit encoding so decoding does not depend on the platform default.
    with open(os.path.join(title_abstract_dir, file), 'r', encoding='utf-8') as f:
        title_abstract_names_list.append(f.read())
        
# 2. Text Cleaning & Normalization  
# 2.1 Lowercasing to enable case-insensitive matching.  
def lowercasing(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lower-case every loaded document, keeping the original document order.
clean1_title_abstract = [lowercasing(doc) for doc in title_abstract_names_list]

# print(lower_title_abstract[4])

# 2.2 Remove Special Characters & Numbers --> do not do for full text
def remove_special_chars_numbers(text):
    """Replace everything except ASCII letters and whitespace with a space.

    Each removed character (digit, punctuation, symbol, non-ASCII letter)
    becomes exactly one space, so the result has the same length as ``text``.
    """
    non_letter_pattern = re.compile(r'[^A-Za-z\s]')
    return non_letter_pattern.sub(' ', text)

# Strip digits and special characters from every lower-cased document.
clean2_title_abstract = [remove_special_chars_numbers(doc) for doc in clean1_title_abstract]
    
#print(clean2_title_abstract[0])

# 2.3 Remove double white spaces
def remove_extra_whitespace(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    words = text.split()
    return ' '.join(words)

# Normalise whitespace in every document.
clean3_title_abstract = [remove_extra_whitespace(doc) for doc in clean2_title_abstract]

#print(clean3_title_abstract[0])

# 3. Tokenization & Basic Cleaning  
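# 3.1 Tokenization - Splits text into individual words (sub-word tokenization could also be tried).
# NOTE: word_tokenize keeps punctuation as separate tokens; none is left here only because step 2.2 already replaced it with spaces.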
# Tokenise each cleaned document into a list of word tokens.
clean4_title_abstract = [word_tokenize(doc) for doc in clean3_title_abstract]

#print(clean4_title_abstract[180])

# #3.2 R (Step 1.3) → Ensures proper word separation. - ignore for the moment, address if tokenisation issues 

# 3.3 Removing Stopwords - Eliminates commonly occurring but uninformative words.  
# Lazily-populated cache of the NLTK English stop-word list, so the corpus is
# loaded once rather than once per document.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: Iterable of word tokens (one document).
        stop_words: Optional container of words to drop; anything supporting
            ``in`` works. When omitted, the NLTK English stop-word list is
            used (loaded on first use and cached for later calls).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [word for word in text if word not in stop_words]

# Drop stop words from every tokenised document.
clean5_title_abstract = [remove_stopwords(tokens) for tokens in clean4_title_abstract]

# print(clean5_title_abstract[180])

# 4. Lemmatization (& Stemming - where would be preferred)  
# Lemmatization → Converts words to their root form.  
def lemmatize(text):
    """Lemmatize each token of ``text`` with NLTK's WordNetLemmatizer.

    Tokens are passed without a part-of-speech tag, so the lemmatizer's
    default POS is used for every word. Returns a new list of lemmas in the
    same order as the input tokens.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, text))

# Lemmatize every stop-word-filtered document.
clean6_title_abstract = [lemmatize(tokens) for tokens in clean5_title_abstract]

#print(clean5_title_abstract[180])
#print(clean6_title_abstract[180])

# 5. Feature Extraction (Reorganized for clarity)  
# 5.1 Part-of-Speech (POS) Tagging & Counts  


# 5.2 Named Entity Recognition (NER)  

# 5.3 Term Frequency - Inverse Document Frequency - (TF-IDF) (Step 1.6.3)  
# Try on doc basis and also on full corpus basis

# 5.3.1 Term frequency → Measures how frequently a term occurs in a document.
def compute_term_frequency(documents):
    """Build the raw term-count matrix for a corpus.

    Args:
        documents: Iterable of document strings.

    Returns:
        A ``(tf_matrix, terms)`` tuple: the matrix returned by
        ``CountVectorizer.fit_transform`` and the matching feature names.
    """
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(documents)
    vocabulary = count_vectorizer.get_feature_names_out()
    return counts, vocabulary

# 5.3.2 IDF → Measures how important a term is within a corpus.
def compute_inverse_document_frequency(tf_matrix):
    """Return the per-term IDF weights learned from a term-count matrix.

    Args:
        tf_matrix: Term-count matrix, e.g. from ``compute_term_frequency``.

    Returns:
        The ``idf_`` vector of a ``TfidfTransformer`` fitted on ``tf_matrix``
        (one weight per column of the matrix).
    """
    fitted_transformer = TfidfTransformer(norm=None, use_idf=True).fit(tf_matrix)
    return fitted_transformer.idf_

# 5.3.3 TF-IDF → Combines the above two metrics to determine the importance of a term in a document relative to a corpus.
def compute_tf_idf(documents):
    """Build the TF-IDF matrix for a corpus.

    Args:
        documents: Iterable of document strings.

    Returns:
        A ``(tf_idf_matrix, terms)`` tuple: the matrix returned by
        ``TfidfVectorizer.fit_transform`` (default settings) and the matching
        feature names.
    """
    tfidf_vectorizer = TfidfVectorizer()
    weights = tfidf_vectorizer.fit_transform(documents)
    vocabulary = tfidf_vectorizer.get_feature_names_out()
    return weights, vocabulary

# Example usage:
# The vectorizers take one string per document, so re-join each token list.
documents = [" ".join(tokens) for tokens in clean6_title_abstract]
tf_matrix, terms = compute_term_frequency(documents)
idf = compute_inverse_document_frequency(tf_matrix)
tf_idf_matrix, tf_idf_terms = compute_tf_idf(documents)

# Report the dimensions of each feature representation.
for label, shape in (
    ("TF Matrix Shape:", tf_matrix.shape),
    ("IDF Shape:", idf.shape),
    ("TF-IDF Matrix Shape:", tf_idf_matrix.shape),
):
    print(label, shape)

# Save the matrices and terms
output_dir = './DOME_Registry_PMC_Title_Abstract_Analysis'
os.makedirs(output_dir, exist_ok=True)

# The matrices are sparse; densify them so each term becomes a CSV column.
tf_matrix_dense = tf_matrix.toarray()
tf_idf_matrix_dense = tf_idf_matrix.toarray()

# Term-count matrix: one row per document, one column per term.
tf_frame = pd.DataFrame(tf_matrix_dense, columns=terms)
tf_frame.to_csv(os.path.join(output_dir, 'tf_matrix.csv'), index=False)

# IDF table: one row per term.
idf_frame = pd.DataFrame({'term': terms, 'idf': idf})
idf_frame.to_csv(os.path.join(output_dir, 'idf.csv'), index=False)

# TF-IDF matrix: one row per document, one column per term.
tf_idf_frame = pd.DataFrame(tf_idf_matrix_dense, columns=tf_idf_terms)
tf_idf_frame.to_csv(os.path.join(output_dir, 'tf_idf_matrix.csv'), index=False)

# 5.4 Bag of Words (BoW)
def compute_bag_of_words(documents):
    """Build the bag-of-words (term-count) matrix for a corpus.

    Args:
        documents: Iterable of document strings.

    Returns:
        A ``(bow_matrix, terms)`` tuple: the matrix returned by
        ``CountVectorizer.fit_transform`` (default settings) and the matching
        feature names.
    """
    bow_vectorizer = CountVectorizer()
    counts = bow_vectorizer.fit_transform(documents)
    vocabulary = bow_vectorizer.get_feature_names_out()
    return counts, vocabulary

# BOW usage:
bow_matrix, bow_terms = compute_bag_of_words(documents)
print("BoW Matrix Shape:", bow_matrix.shape)

# Save BoW matrix as CSV (densified so each term becomes a column).
bow_matrix_dense = bow_matrix.toarray()
bow_frame = pd.DataFrame(bow_matrix_dense, columns=bow_terms)
bow_csv_path = os.path.join(output_dir, 'bow_matrix.csv')
bow_frame.to_csv(bow_csv_path, index=False)

# 5.5 Word Embeddings (Word2Vec, GloVe, BERT, etc.)
def compute_word2vec_gensim(documents, *, vector_size=100, window=5, min_count=1, workers=4):
    """Train a gensim Word2Vec model on tokenised documents.

    The defaults reproduce the previously hard-coded settings, so existing
    calls behave the same as before.

    Args:
        documents: Passed to Word2Vec as ``sentences``: an iterable of token
            lists (one list of string tokens per document), not raw strings.
        vector_size: Dimensionality of the word vectors.
        window: Context window size, forwarded to Word2Vec.
        min_count: Minimum corpus frequency for a word to be kept.
        workers: Number of training threads.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # NOTE(review): per the gensim documentation, training with more than one
    # worker thread is not exactly reproducible between runs - confirm whether
    # reproducible embeddings are needed here (workers=1).
    return Word2Vec(
        sentences=documents,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

# Example usage:
# Word2Vec is trained on the token lists (not the re-joined strings).
word2vec_model = compute_word2vec_gensim(clean6_title_abstract)
embeddings = word2vec_model.wv
print("Word2Vec Model Vocabulary Size:", len(embeddings))

# Display the vector for a sample word
sample_word = 'machine'
if sample_word not in embeddings:
    print(f"'{sample_word}' not in vocabulary")
else:
    print(f"Vector for '{sample_word}':", embeddings[sample_word])

# 6. Vectorization (Final Step)  
# ✔ TF-IDF OR Embeddings (Step 1.9) → Converts text into a numerical representation suitable for ML models.  
# (Vectorization is technically part of feature extraction, so this step can be merged with 1.6 if preferred.)

# Try title only and then abstract only


# 1B. DOME full text 


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gavinfarrell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/gavinfarrell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/gavinfarrell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/gavinfarrell/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gavinfarrell/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


TF Matrix Shape: (186, 4813)
IDF Shape: (4813,)
TF-IDF Matrix Shape: (186, 4813)
BoW Matrix Shape: (186, 4813)
Word2Vec Model Vocabulary Size: 4830
Vector for 'machine': [-2.07052842e-01  2.29632020e-01  1.69503182e-01  6.81412891e-02
  3.91420424e-02 -5.49060941e-01  1.50891185e-01  5.50810993e-01
 -2.24079937e-01 -2.26501480e-01 -1.44010678e-01 -4.49575156e-01
 -5.75649589e-02  4.28074747e-02 -4.90287691e-02 -1.35430828e-01
  6.46141395e-02 -3.73235047e-01 -2.56669745e-02 -5.13415635e-01
  7.68738016e-02  1.74924389e-01  2.45191067e-01 -1.32091582e-01
 -1.20081730e-01 -4.64878753e-02 -2.56311327e-01 -1.98453560e-01
 -2.62828141e-01 -2.60042660e-02  4.18170393e-01  1.05281964e-01
  5.78797534e-02 -1.71213642e-01 -6.27136528e-02  3.11336517e-01
  1.07119381e-01 -2.95320153e-01 -1.71882644e-01 -5.02573669e-01
  1.09168537e-01 -2.94993192e-01 -2.13022664e-01 -7.99899325e-02
  2.79208392e-01 -6.98935539e-02 -2.55676031e-01 -1.65186450e-02
  1.37909442e-01  2.14212939e-01  2.10443392e-01 -

In [None]:
# Identify using an ML ontology, plus other relevant ML words not within it - e.g. model types etc.


In [None]:
# 2. Determine if wider corpus of ML papers needed - automatically find some papers and then also preprocess
# could determine using text word mining
# random papers from lit suggest or negative search of terms - e.g. no model/ML etc
#  

In [None]:
# 3. Preprocess all ML papers

In [None]:
# 4. Download all papers mentioning machine learning and AI from EPMC 

In [None]:
# 5. Deploy ML model to predict if a paper is about ML or not

In [None]:
# 6. Analyse top papers and journals insights from the literature data