In [4]:
import pandas as PD
import numpy as NP
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC

import preprof      #my functions

In [5]:
# Load the pre-processed paper table from disk.
csv = '/Users/cdw/Desktop/pkpd_script/MNCA_ANALYSIS/ready_processed_2.csv'
papers = PD.read_csv(csv)

# Keep only the rows whose category is one of the two classes of interest.
relevant = ['Non-compartmental', 'Modelling']
is_relevant = papers['category'].isin(relevant)
relevant_papers = papers[is_relevant]
relevant_papers.shape

(682, 3)

In [6]:
# Count the papers per class on the filtered frame itself, so the values
# being compared and the frame being counted share one index (no reliance
# on implicit alignment with the unfiltered `papers` frame).
category_counts = relevant_papers['category'].value_counts()
num_nca = int(category_counts.get('Non-compartmental', 0))
num_mod = int(category_counts.get('Modelling', 0))
print('#NCA: {}'.format(num_nca))
print('#Modelling: {}'.format(num_mod))

#NCA: 239
#Modelling: 443


In [7]:
rd_seed = 61097

# Each 'words' entry is one string of tokens joined by the '!$!' delimiter;
# split it back into a list of tokens per paper.
x = [joined.split('!$!') for joined in relevant_papers['words']]
y = relevant_papers['category']

def fake_tokeniser(text):
    """Return the input unchanged.

    Passed to TfidfVectorizer as both `tokenizer` and `preprocessor` so that
    documents which are already lists of tokens go through untouched.
    """
    return text

# The documents in `x` are already token lists, so the vectoriser's own
# preprocessing and tokenising steps are replaced by the pass-through function.
tf_idf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake_tokeniser,
    preprocessor=fake_tokeniser,
    lowercase=False,
    token_pattern=None,
)

# NOTE(review): the vectoriser is fitted on every document before the
# train/test split, so the vocabulary and IDF weights also see the test
# papers. Consider fitting on the training split only.
tfidf_sparse = tf_idf.fit_transform(x)
# Default integer column labels (0..n_features-1) are kept on purpose:
# later cells select features by these integer positions.
tfidf = PD.DataFrame(tfidf_sparse.toarray())

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older installations. Either way the
# result is a plain list of feature names.
if hasattr(tf_idf, 'get_feature_names_out'):
    tfidf_names = tf_idf.get_feature_names_out().tolist()
else:
    tfidf_names = tf_idf.get_feature_names()

# Stratified 85/15 split so both classes keep their proportions.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf,
    y,
    test_size=0.15,
    random_state=rd_seed,
    stratify=y,
)


In [8]:
# Linear SVM fitted on the training matrix; its coefficients are read in the
# next cell to score the features.
model = LinearSVC(random_state=61097, max_iter=10000)
model.fit(x_train, y_train)

In [9]:
# Absolute value of each coefficient in the first row of the fitted model.
impts = NP.abs(model.coef_[0])
my_dt = PD.DataFrame(impts)
# Flatten the single column back into a plain Python list of floats.
importances = my_dt[0].tolist()

In [10]:
tr_ind = []

def find(s, c1):
    """Return the list of positions in `s` whose element equals `c1`."""
    positions = []
    for idx, item in enumerate(s):
        if item == c1:
            positions.append(idx)
    return positions


# Keep the index of every feature whose importance reaches the cut-off.
# One pass over the list is enough: any index sharing a value that passes
# the cut-off passes it as well. Comprehension variables stay local, so the
# notebook-level `x` (the token lists) is not overwritten.
# Indices come out unique and in ascending order.
importance_cutoff = 0.33
tr_ind = [
    idx
    for idx, importance in enumerate(importances)
    if importance >= importance_cutoff
]

In [11]:
# Number of features that reached the importance cut-off.
n_selected = len(tr_ind)
print(n_selected)

320


In [12]:
# Restrict both splits to the selected features.
# The TF-IDF frame was built without column names, so its columns are
# labelled 0..n_features-1 and label-based selection picks the same columns
# as positional selection would. Unlike .iloc, it still works when this cell
# is executed again on the already-reduced frames.
x_train = x_train.loc[:, tr_ind]
x_test = x_test.loc[:, tr_ind]

print(x_train.shape, x_test.shape)

(579, 320) (103, 320)


In [13]:
# Baseline: LinearSVC with its default C, trained on the reduced feature set.
original_model = LinearSVC(random_state=61097, max_iter=10000)
original_model.fit(x_train, y_train)
original_pred = original_model.predict(x_test)

# Score on the held-out split; F1 treats 'Non-compartmental' as the positive class.
original_acc = accuracy_score(y_test, original_pred)
original_f1 = f1_score(y_test, original_pred, pos_label='Non-compartmental')

print('Before hyperparam tuning...')
acc_shown = round(original_acc, 5)
f1_shown = round(original_f1, 5)
print(f'acc: {acc_shown}    f1:{f1_shown}')

Before hyperparam tuning...
acc: 0.87379    f1:0.82192


<br><br><br><br>

# hyperparameter tuning: linear svc tfidf

In [14]:
# Estimator handed to the grid search in the next cell.
model_ = LinearSVC(random_state=61097, max_iter=10000)
model_.fit(x_train, y_train)

# Candidate values for the regularisation parameter C.
Cs = [0.001, 0.01, 0.1, 1, 10]
param_grid = dict(C=Cs)

In [15]:
# 5-fold cross-validated search over the C grid, on the training split only.
grid_search = GridSearchCV(estimator=model_, param_grid=param_grid, cv=5)
# fit() returns the search object itself, so both names refer to it.
grid_result = grid_search.fit(x_train, y_train)

best_score_pct = round(grid_result.best_score_ * 100, 2)
print('Best Score:', best_score_pct)
print('Best Params:', grid_result.best_params_)

Best Score: 92.4
Best Params: {'C': 10}


In [16]:
# Refit with the parameters the grid search selected instead of a value
# copied by hand, so this evaluation stays in step with the search when the
# grid or the data change.
testmod = LinearSVC(max_iter=10000, random_state=61097, **grid_result.best_params_)
testmod.fit(x_train, y_train)

# Evaluate the tuned model on the held-out split.
y_pred = testmod.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, pos_label='Non-compartmental'))

0.8640776699029126
0.8157894736842106


<br><br><br><br>

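### Tuning `C` did not improve the model

The tuned model (C=10) scores slightly lower on the test set than the default one (accuracy 0.864 vs 0.874, F1 0.816 vs 0.822), so we will stick to the default model.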

<br><br><br><br>

In [17]:
# Final figures: the baseline (untuned) model's test accuracy and F1.
final_acc = round(original_acc, 5)
final_f1 = round(original_f1, 5)
print('so in the end...')
print(f'acc: {final_acc}    f1:{final_f1}')

so in the end...
acc: 0.87379    f1:0.82192
