In [1]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

In [2]:
train_df = pd.read_csv('../data/review_references_title_abstracts_sample_train.csv')
test_df = pd.read_csv('../data/review_references_title_abstracts_sample_test.csv')

In [3]:
TARGET_COL_NAME = 'review'

In [4]:
tfidf_transformer = TfidfVectorizer(
    ngram_range=(1, 2),
    analyzer='word',
    lowercase=True,
    max_features=50000,
    stop_words='english'
)

In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

In [7]:
INPUT_COL_NAMES = 'title_abstract'


logreg = LogisticRegression(
    C=1,
    random_state=42,
    solver='sag',
    n_jobs=-1,
    max_iter=500
)

model = Pipeline([
    ('tfidf', tfidf_transformer), 
    ('logit', logreg)
])


acc_scores, prec_scores, cv_f1_scores, recall_scores = [], [], [], []
skf_split_generator = skf.split(X=train_df[INPUT_COL_NAMES], y=train_df[TARGET_COL_NAME])

for fold_id, (train_idx, val_idx) in tqdm(enumerate(skf_split_generator)):
    curr_train_df = train_df.iloc[train_idx]
    curr_val_df = train_df.iloc[val_idx]
    
    model.fit(X=curr_train_df[INPUT_COL_NAMES], y=curr_train_df[TARGET_COL_NAME])
    
    # making predictions for the current validation set
    curr_preds = model.predict(X=curr_val_df[INPUT_COL_NAMES])
    curr_f1 = f1_score(y_true=curr_val_df[TARGET_COL_NAME], y_pred=curr_preds)
    curr_acc = accuracy_score(y_true=curr_val_df[TARGET_COL_NAME], y_pred=curr_preds)
    curr_prec = precision_score(y_true=curr_val_df[TARGET_COL_NAME], y_pred=curr_preds)
    curr_recall = recall_score(y_true=curr_val_df[TARGET_COL_NAME], y_pred=curr_preds)
    cv_f1_scores.append(curr_f1)
    acc_scores.append(curr_acc)
    prec_scores.append(curr_prec)
    recall_scores.append(curr_recall)
    print(f"F1-score for fold {fold_id} is {curr_f1:.3}. Accuracy is {curr_acc:.3}. Precision is {curr_prec:.3}. Recall is {curr_recall:.3}.")

print(f'Average cross-validation F1-score is {np.mean(cv_f1_scores):.3} +/- {np.std(cv_f1_scores):.3}.')
print(f'Average cross-validation ACC is {np.mean(acc_scores):.3} +/- {np.std(acc_scores):.3}.')
print(f'Average cross-validation Prec is {np.mean(prec_scores):.3} +/- {np.std(prec_scores):.3}.')
print(f'Average cross-validation Recall is {np.mean(recall_scores):.3} +/- {np.std(recall_scores):.3}.')

0it [00:00, ?it/s]

F1-score for fold 0 is 0.535. Accuracy is 0.934. Precision is 0.796. Recall is 0.403.
F1-score for fold 1 is 0.531. Accuracy is 0.934. Precision is 0.786. Recall is 0.401.
F1-score for fold 2 is 0.548. Accuracy is 0.936. Precision is 0.801. Recall is 0.416.
F1-score for fold 3 is 0.511. Accuracy is 0.932. Precision is 0.794. Recall is 0.377.
F1-score for fold 4 is 0.532. Accuracy is 0.935. Precision is 0.81. Recall is 0.397.
Average cross-validation F1-score is 0.531 +/- 0.0118.
Average cross-validation ACC is 0.934 +/- 0.00107.
Average cross-validation Prec is 0.797 +/- 0.00795.
Average cross-validation Recall is 0.399 +/- 0.0127.


In [10]:
top_words = np.argsort(model['logit'].coef_[0])[-25:]
for word in top_words:
    print(model['tfidf'].get_feature_names_out()[word], model['logit'].coef_[0][word])

functions 2.9517344231308966
summarize 2.984254751189903
aspects 2.984634518749673
evidence 3.0006540352419746
immune 3.005570438511807
disease 3.0300333831990662
diseases 3.1441516855159923
systematic review 3.220770778216716
approaches 3.2387669412087177
advances 3.2497165379878896
current 3.3215531085071923
discussed 3.327408713503976
strategies 3.339450938883755
include 3.4938949571571474
understanding 3.572797879458418
reviewed 3.6724603829502254
meta analysis 3.835393556796889
meta 3.901249451748954
discuss 4.255185871290517
overview 4.509952398215733
mechanisms 4.653234908245823
literature 4.8777822000295386
recent 6.327817179756542
studies 7.686248215812307
review 19.163369291319626


In [8]:
top_words = np.argsort(model['logit'].coef_[0])[:25]
for word in top_words:
    print(model['tfidf'].get_feature_names_out()[word], model['logit'].coef_[0][word])

study -7.819562866174619
results -4.196277749601822
using -4.057720331064499
showed -3.350426224821404
report -2.998919486271501
significantly -2.9925083205652823
expression -2.974439968361307
mice -2.9523398167549786
method -2.7361959453876588
year -2.72302414128119
observed -2.6169859836090588
demonstrate -2.5486911189558907
model -2.4242500830964677
induced -2.374080912986398
treated -2.355052096509867
effect -2.275316545333808
did -2.2237861120419935
10 -2.16670860000283
rats -2.1518167833534143
rat -2.1429376726933156
participants -2.142503115704531
levels -1.964023719882493
investigated -1.9289137075962708
al -1.908257957636246
previously -1.8930140933282456


In [9]:
curr_preds = model.predict(X=test_df[INPUT_COL_NAMES])
curr_f1 = f1_score(y_true=test_df[TARGET_COL_NAME], y_pred=curr_preds)
curr_acc = accuracy_score(y_true=test_df[TARGET_COL_NAME], y_pred=curr_preds)
curr_prec = precision_score(y_true=test_df[TARGET_COL_NAME], y_pred=curr_preds)
curr_recall = recall_score(y_true=test_df[TARGET_COL_NAME], y_pred=curr_preds)
print(f"F1-score for is {curr_f1:.3}. Accuracy is {curr_acc:.3}. Precision is {curr_prec:.3}. Recall is {curr_recall:.3}.")

F1-score for is 0.589. Accuracy is 0.941. Precision is 0.853. Recall is 0.45.
