# **Classification Notebook (NovaSBE X GregoryAI)**

![Classification pipeline diagram](../images/classify_pipeline_diagram.png)

## 1. Import libraries

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

# Add the parent directory of code_utils to the Python path
sys.path.append(os.path.abspath(os.path.join('..')))

# Suppress warnings
warnings.filterwarnings("ignore")

from code_utils.text_utils import *  # Import everything from text_utils.py
from code_utils.model_utils.LSTM_algorithm_utils import *  
from code_utils.model_utils.BERT_algorithm_utils import *  
from code_utils.model_utils.LGBM_algorithm_utils import *  
from code_utils.model_utils.classify_model_choose import *
from code_utils.download_utils import * 




## 2. Download articles

In [2]:
# Path of the previously downloaded articles CSV. It is assembled with
# os.path.join so the separator is right on every OS; the former hard-coded
# '..\\data\\...' string only resolved to a nested path on Windows.
old_articles_path = os.path.join('..', 'data', 'articles_08-06-2024_14h13m04s.csv')
url = 'https://gregory-ms.com/developers/articles.zip'

# Kept as the cell's last expression so its return value is still displayed.
download_and_extract_zip(url, old_articles_path, 'max')

DataFrame saved to data\2024-06-09\train_articles.csv
DataFrame saved to data\2024-06-09\inference_articles.csv


(                                                        title  \
 article_id                                                      
 1            COVID-19, HHV6 and MOG antibody: A perfect storm   
 2           Is Migraine Associated to Brain Anatomical Alt...   
 3           Patient Satisfaction With the Quality of Couns...   
 4           Rare Case of Spinal Neurosarcoidosis with Conc...   
 5           Evaluation of Urinary Tract Infection followin...   
 ...                                                       ...   
 25079       In Vitro and Ex Vivo Methodologies for T-Cell ...   
 25080       Correction to "Comparative adherence trajector...   
 25081       Comparing face-to-face and videoconference ass...   
 25232       The development of a core outcome set for stud...   
 25233       TDP-43-regulated cryptic RNAs accumulate in Al...   
 
                                                       summary  \
 article_id                                                      
 1      

## 3. Load, clean and pre-process articles

In [3]:
# Pick the dated folder under ../data whose articles should be classified.
dataset_path = os.path.join('../data/2024-06-09', 'inference_articles.csv')

# Read the raw CSV; when its leading column is anything other than article_id,
# discard that column.
# NOTE(review): articles_df is not handed to load_and_format_dataset below, which
# receives only dataset_path -- confirm whether this clean-up is still needed.
articles_df = pd.read_csv(dataset_path)
leading_column = articles_df.columns[0]
if leading_column != 'article_id':
    articles_df = articles_df.drop(columns=leading_column)

# load_and_format_dataset is given the same path plus text_cleaning_pd_series.
articles_clean_df = load_and_format_dataset(dataset_path, text_cleaning_pd_series)

articles_clean_df.head()

Unnamed: 0_level_0,text_processed,relevant
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1277953,tolllike receptor 10 downregulated serum patie...,unlabeled
229393,consequences delayed diagnosis treatment perso...,unlabeled
229394,mini review photoacoustic clinical imaging non...,unlabeled
229395,novel pyrazolopyridine inhibitors monoacylglyc...,1
229396,study compare efficacy costeffectiveness vario...,unlabeled


In [4]:
# Count how many articles carry each value of the 'relevant' column.
articles_clean_df['relevant'].value_counts()

relevant
unlabeled    14792
0             1114
1              734
Name: count, dtype: int64

**Filter to keep only the unlabeled articles**

In [5]:
# Boolean mask selecting the rows whose 'relevant' value is 'unlabeled'.
unlabeled_mask = articles_clean_df['relevant'] == 'unlabeled'
unlabeled_articles = articles_clean_df[unlabeled_mask]

# Index labels of the selected rows and the text column taken from them.
article_ids = unlabeled_articles.index
unlabeled_texts = unlabeled_articles['text_processed']

## 4. Classify Articles (predict Labels for Unlabeled Articles)

In [6]:
# Saved artifacts handed to predict_with_model: a classifier file and a vectorizer file.
model_path = '../models/lgbm_classifier.joblib'
vectorizer_path = '../models/tfidf_vectorizer.joblib'

# List of available models
models_available = ['LSTM_Classifier', 'BERT_Classifier', 'LGBM_TFIDF_Classifier']

# Choose the model. Index 2 is 'LGBM_TFIDF_Classifier'; the two paths above are
# named for LGBM / TF-IDF artifacts, so presumably they must change together
# with this index -- verify against predict_with_model.
model_name = models_available[2]  # This can be set dynamically by the user

# Predict labels for the unlabeled articles
# NOTE(review): the output saved with this cell ends in
# "NotFittedError: idf vector is not fitted". Presumably the TF-IDF vectorizer
# file was stored without being fitted -- confirm and regenerate it before
# relying on the cells that consume predicted_labels.
predicted_labels = predict_with_model(model_name, model_path, vectorizer_path, unlabeled_texts)


Model loaded: vectorizer from ../models/tfidf_vectorizer.joblib, classifier from ../models/lgbm_classifier.joblib


NotFittedError: idf vector is not fitted

## 5. Store the results

In [None]:
# Pair each unlabeled article's id with its predicted label; the prediction
# column is named after the chosen model. The chained reset_index replaces the
# frame's index with a fresh 0..n-1 range, leaving article_id as an ordinary column.
prediction_column = f'{model_name}_pred'
results_df = pd.DataFrame(
    {'article_id': article_ids, prediction_column: predicted_labels}
).reset_index(drop=True)

In [None]:
# Attach the predictions to the full articles table. The left join keeps every
# row of the articles CSV; rows with no matching prediction get NaN in the new column.
# NOTE(review): this reads from ../articles_downloads, while the download cell
# points at the data folder for the same file name -- confirm which is current.
df = pd.read_csv('../articles_downloads/articles_08-06-2024_14h13m04s.csv')

new_articles_df = df.merge(results_df, on='article_id', how='left')

new_articles_df.head()

Unnamed: 0.1,Unnamed: 0,article_id,title,summary,link,published_date,discovery_date,source,publisher,container_title,authors,relevant,doi,access,takeaways,categories,LGBM_TFIDF_Classifier_pred
0,0,1,"COVID-19, HHV6 and MOG antibody: A perfect storm",J Neuroimmunol. 2021 Feb 12;353:577521. doi: 1...,https://pubmed.ncbi.nlm.nih.gov/33607505/?fc=2...,2021-04-14,2021-02-23,8.0,Elsevier BV,Journal of Neuroimmunology,"Ali Fadhil, Ankita Prasad, Anthony Zampino, Fa...",False,10.1016/j.jneuroim.2021.577521,open,First case of HHV6 reactivation in central ne...,,
1,1,2,Is Migraine Associated to Brain Anatomical Alt...,Brain Topogr. 2021 Feb 19. doi: 10.1007/s10548...,https://pubmed.ncbi.nlm.nih.gov/33606142/?fc=2...,2021-01-05,2021-02-23,8.0,Springer Science and Business Media LLC,Brain Topography,"Anne Caclin, Aurélie Bidet-Caulet, David Meuni...",False,10.1007/s10548-021-00824-6,open,Growing number of studies investigate brain a...,,
2,2,3,Patient Satisfaction With the Quality of Couns...,J Neurosci Nurs. 2021 Feb 17. doi: 10.1097/JNN...,https://pubmed.ncbi.nlm.nih.gov/33605649/?fc=2...,2021-03-31,2021-02-23,8.0,Ovid Technologies (Wolters Kluwer Health),Journal of Neuroscience Nursing,"Daniela Händler-Schuster, Diana Zanolari, Gabr...",False,10.1097/JNN.0000000000000578,restricted,The challenges in dealing with multiple scler...,,
3,3,4,Rare Case of Spinal Neurosarcoidosis with Conc...,Case Rep Neurol Med. 2021 Jan 28;2021:5952724....,https://pubmed.ncbi.nlm.nih.gov/33604089/?fc=2...,2021-01-28,2021-02-23,8.0,Hindawi Limited,Case Reports in Neurological Medicine,"Achraf Makki, Maria Khoueiry, Nesreen Jaafar, ...",,10.1155/2021/5952724,open,Spinal neurosarcoidosis is a rare disease tha...,,0.0
4,4,5,Evaluation of Urinary Tract Infection followin...,Can J Infect Dis Med Microbiol. 2021 Jan 31;20...,https://pubmed.ncbi.nlm.nih.gov/33603936/?fc=2...,2021-01-31,2021-02-23,8.0,Hindawi Limited,Canadian Journal of Infectious Diseases and Me...,"Aliyeh Bazi, Monireh Ghazaeian, Narjes Hendoie...",False,10.1155/2021/6616763,open,Double-blind randomized clinical trial was co...,,


In [None]:
# Export the new DataFrame to a CSV file in the folder articles_classification,
# named articles_predictions_{model_name}_{current_date}_{current_time}.csv

def get_current_date_time():
    """Return the current local date and time as two strings.

    Returns:
        tuple[str, str]: the date formatted with '%Y-%m-%d' and the time
        formatted with '%H-%M-%S', both taken from one datetime.now() reading.
    """
    now = datetime.now()
    return now.strftime('%Y-%m-%d'), now.strftime('%H-%M-%S')

# Call the helper once: two separate calls read the clock twice, so the date and
# the time could come from different instants (e.g. either side of midnight).
current_date, current_time = get_current_date_time()

output_filename = f'articles_predictions_{model_name}_{current_date}_{current_time}.csv'

output_dir = '../articles_classification'
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, output_filename)

new_articles_df.to_csv(output_path, index=False)