## Config

In [1]:
import json
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
# Read the project configuration and keep the data directory it points to
with open('../config/config.json', 'r') as config_file:
    config = json.load(config_file)

file_path = config["data_loc"]

 ## Datasets

### Inference batch

In [3]:
# Locate the unlabeled test set inside the configured data directory
file_name = "test_unlabeled.tsv"
final_path = os.path.join(file_path, file_name)

# The file is tab-separated, so pass the tab delimiter explicitly
inference_batch = pd.read_csv(final_path, delimiter='\t')
print(f"The inference batch has {inference_batch.shape[0]} observations and {inference_batch.shape[1]} columns.")
inference_batch.head()

The inference batch has 1097 observations and 4 columns.


Unnamed: 0,PMID,Title,Abstract,Label
0,34902587,Detection of porcine circovirus type 3 DNA in ...,Porcine circovirus type 3 (PCV3) is regularly ...,0
1,35451025,Imputation of non-genotyped F1 dams to improve...,This study investigated using imputed genotype...,0
2,34859764,Proposed multidimensional pain outcome methodo...,Castration of male piglets in the United State...,0
3,35143972,Nanostructured lipid carriers loaded with an a...,Alopecia is a condition associated with differ...,0
4,34872491,Genome-wide expression of the residual lung re...,BACKGROUND: Acute or chronic irreversible resp...,0


### Training Corpus

In [4]:
# Locate the labeled QTL corpus inside the configured data directory
file_name = "QTL_text.json"
final_path = os.path.join(file_path, file_name)

# Parse the JSON corpus and discard the 'Journal' column in one chained step
df = pd.read_json(final_path).drop(columns=['Journal'])
print(f"Shape of the original dataset: {df.shape}", "\n")
df.head()

Shape of the original dataset: (11278, 4) 



Unnamed: 0,PMID,Title,Abstract,Category
0,17179536,Variance component analysis of quantitative tr...,"In a previous study, QTL for carcass compositi...",1
1,17177700,"Single nucleotide polymorphism identification,...",Pituitary adenylate cyclase-activating polypep...,0
2,17129674,Genetic resistance to Sarcocystis miescheriana...,Clinical and parasitological traits of Sarcocy...,0
3,17121599,Results of a whole-genome quantitative trait l...,A whole-genome quantitative trait locus (QTL) ...,1
4,17057239,Unexpected high polymorphism at the FABP4 gene...,Fatty acid bing protein 4 (FABP4) plays a key ...,0


### Pre-Processing

In [5]:
import spacy
nlp_spacy = spacy.load("en_core_web_sm")

import nltk
# Fetch every NLTK resource the preprocessing in this notebook relies on.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — verify against the installed version.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stop-word list
nltk.download('wordnet')    # corpus required by WordNetLemmatizer.lemmatize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
import re

import gensim
from gensim.models.phrases import Phraser, Phrases

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer, f1_score, recall_score, precision_score


# English stop words, held in a set for fast membership checks
stop_words = {*stopwords.words('english')}

# Single WordNet lemmatizer instance shared by the preprocessing helper
lemmatizer = WordNetLemmatizer()

# NLP pre-processing steps; per the original author, these were the ones that
# stood out most during the exploration phase.
def nlp_preprocessing(abstract_tokenized, remove_numbers=False):
    """Clean and tokenize a Series of raw text documents.

    Each document is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize``, filtered against ``stop_words``, lemmatized as a
    noun, and finally purged of empty or single-character tokens.

    Parameters
    ----------
    abstract_tokenized : pandas.Series
        Raw text documents (despite the name, the input is not tokenized yet).
    remove_numbers : bool, default False
        When True, runs of digits are deleted before tokenization.

    Returns
    -------
    pandas.Series
        One list of cleaned tokens per document, aligned with the input index.
        Missing or otherwise non-string entries yield an empty list instead of
        raising.
    """
    # Build the punctuation-removal table once rather than once per document
    punctuation_table = str.maketrans("", "", string.punctuation)

    def clean_document(doc):
        # NaN/None (e.g. a record without an abstract) has no text to process
        if not isinstance(doc, str):
            return []
        text = doc.lower().translate(punctuation_table)
        if remove_numbers:
            text = re.sub(r"\d+", "", text)
        tokens = [token for token in word_tokenize(text) if token not in stop_words]
        tokens = [lemmatizer.lemmatize(token, pos=wordnet.NOUN) for token in tokens]
        # Drop blank tokens and single characters left over from the cleaning
        return [token.strip() for token in tokens if token.strip() and len(token) > 1]

    return abstract_tokenized.apply(clean_document)

# Collapse each token list back into one space-delimited string
def tokens_to_text(tokens_series):
    """Return a Series in which every token list is joined with single spaces."""
    return tokens_series.apply(" ".join)

# Key-phrase extraction
def gensim_key_phrase_extractor(list_of_tokens, min_count=2, threshold=15):
    """Fit a gensim bigram model on the documents and apply it back to them.

    Parameters
    ----------
    list_of_tokens : iterable of list of str
        Tokenized documents (a list, a pandas Series of lists, or a generator).
    min_count : int, default 2
        Forwarded to ``Phrases``.
    threshold : float, default 15
        Forwarded to ``Phrases``.

    Returns
    -------
    list
        One transformed document per input document, in the same order.
    """
    # The corpus is traversed twice (fit, then transform). Materializing it
    # first keeps one-shot iterables such as generators from coming back
    # empty on the second pass.
    documents = list(list_of_tokens)
    bigram = Phraser(Phrases(documents, min_count=min_count, threshold=threshold))
    return [bigram[doc] for doc in documents]

# Cross-validation helper used to score one (corpus, pipeline) combination
def run_experiment(X_text, y, pipeline, cv=5):
    """Cross-validate ``pipeline`` on ``X_text``/``y`` and return the mean macro-F1 across folds."""
    return cross_val_score(
        estimator=pipeline,
        X=X_text,
        y=y,
        scoring='f1_macro',
        cv=cv,
    ).mean()

# Evaluate Classification Model
def evaluate_classification_model(y_true, y_pred):
    """Summarize macro-averaged recall, precision and F1 in a one-row DataFrame.

    Each metric is rounded to 5 decimal places. The columns are
    ``recall_score``, ``precision_score`` and ``f1_score``, in that order.
    """
    metric_functions = (
        ("recall_score", recall_score),
        ("precision_score", precision_score),
        ("f1_score", f1_score),
    )

    # Dict insertion order fixes the column order of the resulting frame
    summary_row = {
        column: round(metric(y_true, y_pred, average='macro'), 5)
        for column, metric in metric_functions
    }

    return pd.DataFrame([summary_row])


[nltk_data] Downloading package punkt to /Users/gabrielvictorgomesferr
[nltk_data]     eira/opt/anaconda3/envs/nlp_env/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/gabrielvictorgomes
[nltk_data]     ferreira/opt/anaconda3/envs/nlp_env/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Tokenize titles and abstracts into new columns (assign returns a new frame,
# leaving `df` untouched), then derive the bigram columns from those tokens
df_processed = (
    df
    .assign(
        title_nltk_token=lambda frame: nlp_preprocessing(frame['Title']),
        abstract_nltk_token=lambda frame: nlp_preprocessing(frame['Abstract']),
    )
    .assign(
        title_bigram=lambda frame: gensim_key_phrase_extractor(frame['title_nltk_token']),
        abstract_bigram=lambda frame: gensim_key_phrase_extractor(frame['abstract_nltk_token']),
    )
)

df_processed.head()

Unnamed: 0,PMID,Title,Abstract,Category,title_nltk_token,abstract_nltk_token,title_bigram,abstract_bigram
0,17179536,Variance component analysis of quantitative tr...,"In a previous study, QTL for carcass compositi...",1,"[variance, component, analysis, quantitative, ...","[previous, study, qtl, carcass, composition, m...","[variance_component, analysis, quantitative_tr...","[previous_study, qtl, carcass_composition, mea..."
1,17177700,"Single nucleotide polymorphism identification,...",Pituitary adenylate cyclase-activating polypep...,0,"[single, nucleotide, polymorphism, identificat...","[pituitary, adenylate, cyclaseactivating, poly...","[single_nucleotide, polymorphism, identificati...","[pituitary, adenylate, cyclaseactivating, poly..."
2,17129674,Genetic resistance to Sarcocystis miescheriana...,Clinical and parasitological traits of Sarcocy...,0,"[genetic, resistance, sarcocystis, miescherian...","[clinical, parasitological, trait, sarcocystis...","[genetic, resistance, sarcocystis, miescherian...","[clinical, parasitological, trait, sarcocystis..."
3,17121599,Results of a whole-genome quantitative trait l...,A whole-genome quantitative trait locus (QTL) ...,1,"[result, wholegenome, quantitative, trait, loc...","[wholegenome, quantitative, trait, locus, qtl,...","[result, wholegenome, quantitative_trait, locu...","[wholegenome, quantitative_trait, locus_qtl, s..."
4,17057239,Unexpected high polymorphism at the FABP4 gene...,Fatty acid bing protein 4 (FABP4) plays a key ...,0,"[unexpected, high, polymorphism, fabp4, gene, ...","[fatty, acid, bing, protein, fabp4, play, key,...","[unexpected, high, polymorphism, fabp4_gene, u...","[fatty_acid, bing, protein_fabp4, play_key, ro..."


### Train-Test Split

In [7]:
# Separate the target column from the predictor columns
y = df_processed['Category']
X = df_processed.drop(columns='Category')

# Hold out 20% of the rows for validation, stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Experiments

In [28]:
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

# Min-max scaler intended for normalizing an overall score.
# NOTE(review): `scaler` is not referenced by any other visible cell — confirm it is still needed.
scaler = MinMaxScaler()

# Set different corpus representations
def set_experiment_features(df):
    """Build the candidate text corpora for the experiments.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``Title`` and ``Abstract`` columns.

    Returns
    -------
    dict
        Maps a corpus name to a Series of documents aligned with ``df``'s
        index. Only "title + abstract raw" is currently enabled.
    """
    # A missing title or abstract would turn the whole concatenation into NaN,
    # which the downstream vectorizer cannot consume; treat it as empty text.
    title_raw = df['Title'].fillna("")
    abstract_raw = df['Abstract'].fillna("")

    experiments = {
        # Alternative representations, currently disabled:
        # "title": tokens_to_text(df['title_bigram']),
        # "title raw": df['Title'],
        # "abstract": tokens_to_text(df['abstract_bigram']),
        # "abstract raw": df['Abstract'],
        # "title + abstract": tokens_to_text(df['title_bigram']) + " " + tokens_to_text(df['abstract_bigram']),
        "title + abstract raw": title_raw + " " + abstract_raw,
    }
    return experiments

# Set different models to experiment
def set_experiment_models():
    """Return the candidate pipelines keyed by a human-readable model name."""
    # Disabled alternative, kept for reference:
    # "Logistic Regression": Pipeline([
    #     ('tfidf', TfidfVectorizer(stop_words='english')),
    #     ('scale', MaxAbsScaler()),
    #     ('clf', LogisticRegression(solver='liblinear', random_state=42))
    # ]),
    linear_svm_steps = [
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('scale', MaxAbsScaler()),
        ('clf', LinearSVC(random_state=42)),
    ]

    return {"Linear SVM": Pipeline(linear_svm_steps)}

In [29]:
experiments_train = set_experiment_features(X_train)
model_pipelines = set_experiment_models()

# One record per (corpus, model) pair, appended as each cross-validation finishes
results = []

for corpus_name in experiments_train:
    for model_name in model_pipelines:
        score = run_experiment(experiments_train[corpus_name], y_train, model_pipelines[model_name])
        results.append({"Corpus": corpus_name, "Model": model_name, "CV_F1_macro": score})
        print(f"Corpus: {corpus_name}, Model: {model_name}, CV F1 Macro: {score:.3f}")

# Rank the combinations, best macro-F1 first
results_df = (
    pd.DataFrame(results)
    .sort_values('CV_F1_macro', ascending=False)
    .reset_index(drop=True)
)
results_df.head()

Corpus: title + abstract raw, Model: Linear SVM, CV F1 Macro: 0.862


Unnamed: 0,Corpus,Model,CV_F1_macro
0,title + abstract raw,Linear SVM,0.861863


#### Validation Set

In [32]:
# Row 0 of results_df holds the best (corpus, model) pair from cross-validation
best_method = results_df.loc[0, 'Corpus']
best_model = results_df.loc[0, 'Model']
best_pipeline = model_pipelines[best_model]

# Build the same corpus representation for the held-out validation rows
experiments_val = set_experiment_features(X_val)
X_train_corpus = experiments_train[best_method]
X_val_corpus = experiments_val[best_method]

# Fit on the training split, then predict hard labels for the validation split.
# NOTE: thresholding on predict_proba is not available for this pipeline as-is;
# LinearSVC exposes decision_function rather than predict_proba.
best_pipeline.fit(X_train_corpus, y_train)
y_pred = best_pipeline.predict(X_val_corpus)
print(pd.Series(y_pred).value_counts())

# Score the predictions and tag the row with the model and corpus names
evaluation_df = evaluate_classification_model(y_val, y_pred)
evaluation_df.insert(0, 'Model', best_model)
evaluation_df.insert(1, 'Corpus', best_method)

print("Validation Performance:")
evaluation_df.head()

0    2072
1     184
Name: count, dtype: int64
Validation Performance:


Unnamed: 0,Model,Corpus,recall_score,precision_score,f1_score
0,Linear SVM,title + abstract raw,0.86486,0.8953,0.87931


In [11]:
output_path = 'experiment_outputs'
os.makedirs(output_path, exist_ok=True)

exp_results_path = os.path.join(output_path, 'experiment_results.csv')

# On the very first run the history file does not exist yet; start from an
# empty frame instead of failing with FileNotFoundError.
if os.path.exists(exp_results_path):
    experiments_history = pd.read_csv(exp_results_path)
else:
    experiments_history = pd.DataFrame()
experiments_history.head()

Unnamed: 0,Model,Corpus,recall_score,precision_score,f1_score
0,Linear SVM,title abstract raw,0.86486,0.8953,0.87931
1,Logistic Regression,title abstract raw,0.83198,0.90423,0.86369


In [12]:
# Append the latest evaluation to the history, best F1 first. Exact duplicate
# rows are dropped so that re-running this cell (or the whole notebook) does
# not keep piling identical entries into the exported CSV.
experiments_history = (
    pd.concat([experiments_history, evaluation_df], axis=0)
    .drop_duplicates()
    .sort_values('f1_score', ascending=False)
    .reset_index(drop=True)
)
experiments_history

Unnamed: 0,Model,Corpus,recall_score,precision_score,f1_score
0,Linear SVM,title abstract raw,0.86486,0.8953,0.87931
1,Linear SVM,title + abstract raw,0.86486,0.8953,0.87931
2,Logistic Regression,title abstract raw,0.83198,0.90423,0.86369


## Export Experiment Results

In [13]:
# Persist the updated history (without the index) to the same CSV it was loaded from
experiments_history.to_csv(exp_results_path, index=False)

#### Inference Batch

In [31]:
# Only the last expression of a cell is rendered, so print the shape explicitly
print(f"Inference batch shape: {inference_batch.shape}")
inference_batch.head()

Unnamed: 0,PMID,Title,Abstract,Label,title_nltk_token,abstract_nltk_token,title_bigram,abstract_bigram
0,34902587,Detection of porcine circovirus type 3 DNA in ...,Porcine circovirus type 3 (PCV3) is regularly ...,0,"[detection, porcine, circovirus, type, dna, se...","[porcine, circovirus, type, pcv3, regularly, r...","[detection, porcine_circovirus, type, dna, ser...","[porcine_circovirus, type_pcv3, regularly, rep..."
1,35451025,Imputation of non-genotyped F1 dams to improve...,This study investigated using imputed genotype...,0,"[imputation, nongenotyped, f1, dam, improve, g...","[study, investigated, using, imputed, genotype...","[imputation, nongenotyped, f1, dam, improve, g...","[study, investigated, using, imputed_genotype,..."
2,34859764,Proposed multidimensional pain outcome methodo...,Castration of male piglets in the United State...,0,"[proposed, multidimensional, pain, outcome, me...","[castration, male, piglet, united, state, cond...","[proposed, multidimensional, pain, outcome, me...","[castration, male, piglet, united_state, condu..."
3,35143972,Nanostructured lipid carriers loaded with an a...,Alopecia is a condition associated with differ...,0,"[nanostructured, lipid, carrier, loaded, assoc...","[alopecia, condition, associated, different, e...","[nanostructured, lipid, carrier, loaded, assoc...","[alopecia, condition, associated, different, e..."
4,34872491,Genome-wide expression of the residual lung re...,BACKGROUND: Acute or chronic irreversible resp...,0,"[genomewide, expression, residual, lung, react...","[background, acute, chronic, irreversible, res...","[genomewide, expression, residual, lung, react...","[background, acute_chronic, irreversible, resp..."


In [29]:
# Tokenize the raw text columns with the same cleaning used for the training corpus
for token_column, raw_column in (('title_nltk_token', 'Title'), ('abstract_nltk_token', 'Abstract')):
    inference_batch[token_column] = nlp_preprocessing(inference_batch[raw_column])

# Extracting Bigrams
# NOTE(review): gensim_key_phrase_extractor fits a fresh Phrases model on the
# documents it receives, so these bigrams are learned from the inference batch
# itself rather than reused from the training corpus — confirm this is intended.
for bigram_column, token_column in (('title_bigram', 'title_nltk_token'), ('abstract_bigram', 'abstract_nltk_token')):
    inference_batch[bigram_column] = gensim_key_phrase_extractor(inference_batch[token_column])

# Create different corpus representations:
experiments_inference_batch = set_experiment_features(inference_batch)

In [32]:
# Look up the winning corpus representation and pull its inference-batch version
best_method = results_df.loc[0, 'Corpus']
print(best_method)

inference_corpurs = experiments_inference_batch[best_method]
inference_corpurs

title + abstract raw


0       Detection of porcine circovirus type 3 DNA in ...
1       Imputation of non-genotyped F1 dams to improve...
2       Proposed multidimensional pain outcome methodo...
3       Nanostructured lipid carriers loaded with an a...
4       Genome-wide expression of the residual lung re...
                              ...                        
1092    A Review on Human Orf: A Neglected Viral Zoono...
1093    Exploration of Genetic Variants within the Goa...
1094    Genome-Wide Identification of Reference Genes ...
1095    Hepatic Lipid Accumulation and Dysregulation A...
1096    Identification of differentially expressed gen...
Length: 1097, dtype: object

inference_batch

In [39]:
best_model = results_df['Model'][0]
best_pipeline = model_pipelines[best_model]

# Predict labels for the unlabeled inference batch. The pipeline is not fitted
# here: it must already have been fitted by the validation cell earlier in the
# notebook.
y_pred = best_pipeline.predict(inference_corpurs)

# Sanity check: exactly one prediction per inference row
assert len(y_pred) == len(inference_batch)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [40]:
# Replace the 'Label' column loaded from the file with the model's predictions,
# then show how many rows fall into each predicted class
inference_batch['Label'] = y_pred
inference_batch['Label'].value_counts()

Label
0    1056
1      41
Name: count, dtype: int64

In [41]:
# Preview the two columns that make up the submission file
inference_batch[['PMID', 'Label']]

Unnamed: 0,PMID,Label
0,34902587,0
1,35451025,0
2,34859764,0
3,35143972,0
4,34872491,0
...,...,...
1092,34267574,0
1093,34359218,0
1094,34827869,0
1095,35327163,0


In [42]:
# Write the PMID/Label pairs to solution_0.csv (relative path, index omitted)
inference_batch[['PMID', 'Label']].to_csv("solution_0.csv", index=False)