In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import re
from sklearn.feature_extraction.text import CountVectorizer
import collections
# from google.colab import drive
from random import choice

# **Part 1 : data preprocessing**

In [6]:
df = pd.read_csv('./Dataset/medical_dataset/train.csv')

In [7]:
def Data_processing (data):
    for i in range(len(data)):
        data.iloc[i, 1] = re.sub(r'[^\w\s]', ' ', data.iloc[i, 1])
#         data.iloc[i, 1] = re.sub(r'[0-9]', '', data.iloc[i, 1])
        data.iloc[i, 1] = data.iloc[i, 1].lower()
    return data

In [8]:
data = Data_processing(df)

In [10]:
def word_frequency (Data):
    Word_frequency = []
    for i in range(len(Data)):
        frequency = Data.iloc[i, 1].split()
        Word_frequency += frequency
    top_frequency = Word_frequency
    occurrences = collections.Counter(top_frequency)
    W,f = zip(*occurrences.most_common(10000))
    return W,f

In [11]:
word, occurance = word_frequency(data)

In [12]:
Vocabulary = list(zip(word,range(1,10001),occurance))
pd.DataFrame(Vocabulary).to_csv('./Dataset/medical_dataset/medical_text-vocab.txt',sep='\t',header=None,index=False)

In [13]:
def dataset (dfram,word):
    for i in range(len(dfram)):
        frequency = dfram.iloc[i, 1].split()
        for j in range(len(frequency)):
            try:
                frequency[j] = word.index(frequency[j]) + 1
            except ValueError:
                frequency[j] = ''
        dfram.iloc[i, 1] = ' '.join([str(k) for k in frequency])
        dfram.iloc[i,1] = re.sub(r'[^a-zA-Z0-9]', ' ', dfram.iloc[i,1])
    return dfram      

In [15]:
Train = dataset(data,word)

In [17]:
Train.to_csv('./Dataset/medical_dataset/medical_text-train.txt',sep='\t',index=False)

In [18]:
df_ = pd.read_csv('./Dataset/medical_dataset/medical_text-train.txt',sep='\t')

In [20]:
df_V = pd.read_csv('./Dataset/medical_dataset/valid.csv')
df_T = pd.read_csv('./Dataset/medical_dataset/test.csv')

In [21]:
# preprocessing Test and Validation data
data_V = Data_processing(df_V)
data_T = Data_processing(df_T)

In [22]:
valid = dataset(data_V,word)
test = dataset(data_T,word)

In [23]:
valid.to_csv('./Dataset/medical_dataset/medical_text-valid.txt',sep='\t',index=False)
test.to_csv('./Dataset/medical_dataset/medical_text-test.txt',sep='\t',index=False)

In [7]:
def bag_of_word(data_text):
  mat = np.zeros((len(data_text),10000))
  for i in range(len(data_text)):
    freq = data_text.iloc[i, 1].split()
    for j in range(len(freq)):
      mat[i][int(freq[j])-1] = 1
  return mat

In [25]:
df_text_train = pd.read_csv('./Dataset/medical_dataset/medical_text-train.txt',sep='\t')
df_text_val = pd.read_csv('./Dataset/medical_dataset/medical_text-valid.txt',sep='\t')
df_text_test = pd.read_csv('./Dataset/medical_dataset/medical_text-test.txt',sep='\t')

# **Part 2: binary bag-of-words (BBoW)**

In [29]:
trainnig_data = bag_of_word(df_text_train)
validation_data = bag_of_word(df_text_val)
test_data = bag_of_word(df_text_test)

In [30]:
trainnig_data.shape

(4000, 10000)

# **2.a**

In [31]:
def random_class(review):
  class_array = np.zeros(review.shape)
  sequence = [1,2,3,4]
  for i in range(len(class_array)):
    class_array[i] =  choice(sequence)
  return class_array

In [32]:
print("Random classifier F1-score : ", f1_score(list(df_text_test.iloc[:,0]),list(map(int, random_class(df_text_test.iloc[:,0]))), average='weighted' ))

Random classifier F1-score :  0.25428007921921786


In [33]:
def majority_class(review):
  class_array = np.zeros(review.shape)
  major = collections.Counter(review)
  array = np.full(len(review), major.most_common(1)[0][0])
  return array

In [34]:
print("Majority classifier F1-score : ", f1_score(list(df_text_test.iloc[:,0]),list(majority_class(df_text_test.iloc[:,0])), average='weighted' ))

Majority classifier F1-score :  0.22466475644699144


# **2.b**

In [35]:
All_data_X = np.vstack((trainnig_data,validation_data))
All_data_X.shape

(4499, 10000)

In [36]:
split_index = [-1 for _ in range(trainnig_data.shape[0])] + [0 for _ in range(validation_data.shape[0])]

In [37]:
All_data_Y = np.concatenate((df_.iloc[:,0].values, df_V.iloc[:,0].values))

In [38]:
def grid_search (model, parameter, x_train, y_train, ps ):
  grid = GridSearchCV(model, parameter, cv= ps, scoring = 'f1_macro',return_train_score=True)
  grid.fit(x_train, y_train)
  return grid

In [39]:
parameter_NB = {
    'alpha': np.logspace(-4,0,20)
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(BernoulliNB(), parameter_NB, All_data_X, All_data_Y, ps )

In [40]:
np.logspace(-4,1,20)

array([1.00000000e-04, 1.83298071e-04, 3.35981829e-04, 6.15848211e-04,
       1.12883789e-03, 2.06913808e-03, 3.79269019e-03, 6.95192796e-03,
       1.27427499e-02, 2.33572147e-02, 4.28133240e-02, 7.84759970e-02,
       1.43844989e-01, 2.63665090e-01, 4.83293024e-01, 8.85866790e-01,
       1.62377674e+00, 2.97635144e+00, 5.45559478e+00, 1.00000000e+01])

In [42]:
print('Best Parameter for Bernoulli naive bayes : ', bnb_bbof.best_params_) 

Best Parameter for Bernoulli naive bayes :  {'alpha': 1.0}


In [43]:
y_pred = bnb_bbof.predict(trainnig_data)
y_pred_val = bnb_bbof.predict(validation_data)
y_pred_test = bnb_bbof.predict(test_data)

In [44]:
print("Training f1-score")
print("Bernoulli naive bayes training f1-score for best alpha : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Bernoulli naive bayes training f1-score for best alpha :  0.5317516944078564


In [45]:
print("Validation f1-score")
print("Bernoulli naive bayes validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for alpha = ", bnb_bbof.best_params_)

Validation f1-score
Bernoulli naive bayes validation f1-score :  0.5005554812213722 for alpha =  {'alpha': 1.0}


In [46]:
print("Test f1-score")
print("Bernoulli naive bayes test f1-score for best alpha : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Bernoulli naive bayes test f1-score for best alpha :  0.46048206194712504


In [47]:
parameter_NB = {
    'C': np.logspace(-4,0,20),
    'multi_class': ['ovr', 'multinomial'],
    'solver': ['newton-cg', 'lbfgs']
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(LogisticRegression(penalty='l2',max_iter=1000, dual=False), parameter_NB, All_data_X, All_data_Y, ps )

In [48]:
print('Best Parameter for Logistic regression : ', bnb_bbof.best_params_) 

Best Parameter for Logistic regression :  {'C': 1.0, 'multi_class': 'ovr', 'solver': 'newton-cg'}


In [49]:
y_pred = bnb_bbof.predict(trainnig_data)
y_pred_val = bnb_bbof.predict(validation_data)
y_pred_test = bnb_bbof.predict(test_data)

In [50]:
print("Training f1-score")
print("Logistic regression training f1-score for best parameter : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Logistic regression training f1-score for best parameter :  0.8982482873187748


In [51]:
print("Validation f1-score")
print("Logistic regression validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for parameter = ", bnb_bbof.best_params_)

Validation f1-score
Logistic regression validation f1-score :  0.8951881976476632 for parameter =  {'C': 1.0, 'multi_class': 'ovr', 'solver': 'newton-cg'}


In [52]:
print("Test f1-score")
print("Logistic regresssion test f1-score for best parameter : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Logistic regresssion test f1-score for best parameter :  0.7453697420582956


In [53]:
parameter_NB = {
    'C': np.logspace(-4,0,20),
    'multi_class': ['ovr', 'crammer_singer']
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(LinearSVC(penalty='l2', loss='hinge', max_iter=100000), parameter_NB, All_data_X, All_data_Y, ps )



In [54]:
print('Best Parameter for support vector machine : ', bnb_bbof.best_params_) 

Best Parameter for support vector machine :  {'C': 0.3792690190732246, 'multi_class': 'ovr'}


In [55]:
y_pred = bnb_bbof.predict(trainnig_data)
y_pred_val = bnb_bbof.predict(validation_data)
y_pred_test = bnb_bbof.predict(test_data)

In [56]:
print("Training f1-score")
print("Support vector machine training f1-score for best parameter : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Support vector machine training f1-score for best parameter :  0.8955751762310216


In [57]:
print("Validation f1-score")
print("Support vector machine validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for parameter = ", bnb_bbof.best_params_)

Validation f1-score
Support vector machine validation f1-score :  0.8969544976286412 for parameter =  {'C': 0.3792690190732246, 'multi_class': 'ovr'}


In [58]:
print("Test f1-score")
print("Support vector machine test f1-score for best parameter : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Support vector machine test f1-score for best parameter :  0.7876321083484212


In [59]:
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(trainnig_data, df_.iloc[:,0].values)
ccp_alphas = path.ccp_alphas

In [60]:
parameter_NB = {
    'criterion' : ["gini", "entropy"],
    'ccp_alpha':ccp_alphas
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(DecisionTreeClassifier(), parameter_NB, All_data_X, All_data_Y, ps )

ValueError: ccp_alpha must be greater than or equal to 0



In [61]:
print('Best Parameter for decision tree : ', bnb_bbof.best_params_) 

Best Parameter for decision tree :  {'ccp_alpha': 0.0010301801801801801, 'criterion': 'gini'}


In [62]:
y_pred = bnb_bbof.predict(trainnig_data)
y_pred_val = bnb_bbof.predict(validation_data)
y_pred_test = bnb_bbof.predict(test_data)

In [63]:
print("Training f1-score")
print("Decission tree training f1-score for best parameter : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Decission tree training f1-score for best parameter :  0.8384904413311942


In [64]:
print("Validation f1-score")
print("Decission tree validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for parameter = ", bnb_bbof.best_params_)

Validation f1-score
Decission tree validation f1-score :  0.8180510230908237 for parameter =  {'ccp_alpha': 0.0010301801801801801, 'criterion': 'gini'}


In [65]:
print("Test f1-score")
print("Decision tree test f1-score for best parameter : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Decision tree test f1-score for best parameter :  0.8197983870967742


# **Part 3 : frequency bag-of-words (FBoW)**

In [66]:
def frequency_bag_of_word(data_text):
  matf = np.zeros((len(data_text),10000))
  for i in range(len(data_text)):
    freqf = data_text.iloc[i, 1].split()
    for j in range(len(freqf)):
      matf[i][int(freqf[j])-1] = matf[i][int(freqf[j])-1] + 1/len(freqf)
  return matf

In [67]:
All_data_Y = np.concatenate((df_.iloc[:,0].values, df_V.iloc[:,0].values))

In [68]:
trainnig_data_f = frequency_bag_of_word(df_text_train)
validation_data_f = frequency_bag_of_word(df_text_val)
test_data_f = frequency_bag_of_word(df_text_test)

In [69]:
np.sum(trainnig_data_f, axis=1)   

array([1., 1., 1., ..., 1., 1., 1.])

In [70]:
split_index = [-1 for _ in range(trainnig_data_f.shape[0])] + [0 for _ in range(validation_data_f.shape[0])]

In [71]:
All_data_X = np.vstack((trainnig_data_f,validation_data_f))
All_data_X.shape

(4499, 10000)

In [72]:
parameter_NB = {
    'var_smoothing': np.logspace(-4,0,20)
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(GaussianNB(), parameter_NB, All_data_X, All_data_Y, ps )

In [73]:
print('Best Parameter for Gaussian naive bayes : ', bnb_bbof.best_params_) 

Best Parameter for Gaussian naive bayes :  {'var_smoothing': 0.0011288378916846883}


In [74]:
y_pred = bnb_bbof.predict(trainnig_data_f)
y_pred_val = bnb_bbof.predict(validation_data_f)
y_pred_test = bnb_bbof.predict(test_data_f)

In [75]:
print("Training f1-score")
print("Gaussian naive bayes training f1-score for best alpha : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Gaussian naive bayes training f1-score for best alpha :  0.5758868567991946


In [76]:
print("Validation f1-score")
print("Gaussian naive bayes validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for smoothing = ", bnb_bbof.best_params_)

Validation f1-score
Gaussian naive bayes validation f1-score :  0.54820646001428 for smoothing =  {'var_smoothing': 0.0011288378916846883}


In [77]:
print("Test f1-score")
print("Gaussian naive bayes test f1-score for best alpha : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Gaussian naive bayes test f1-score for best alpha :  0.45365751346354793


In [78]:
parameter_NB = {
    'C': np.logspace(-4,0,20),
    'multi_class': ['ovr', 'multinomial'],
    'solver': ['newton-cg', 'lbfgs']
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(LogisticRegression(penalty='l2',max_iter=1000, dual=False), parameter_NB, All_data_X, All_data_Y, ps )

In [79]:
print('Best Parameter for Logistic regression : ', bnb_bbof.best_params_) 

Best Parameter for Logistic regression :  {'C': 1.0, 'multi_class': 'multinomial', 'solver': 'newton-cg'}


In [80]:
y_pred = bnb_bbof.predict(trainnig_data_f)
y_pred_val = bnb_bbof.predict(validation_data_f)
y_pred_test = bnb_bbof.predict(test_data_f)

In [81]:
print("Training f1-score")
print("Logistic regression training f1-score for best parameter : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Logistic regression training f1-score for best parameter :  0.33226822271198364


In [82]:
print("Validation f1-score")
print("Logistic regression validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for parameter = ", bnb_bbof.best_params_)

Validation f1-score
Logistic regression validation f1-score :  0.3315731724462538 for parameter =  {'C': 1.0, 'multi_class': 'multinomial', 'solver': 'newton-cg'}


In [83]:
print("Test f1-score")
print("Logistic regresssion test f1-score for best parameter : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Logistic regresssion test f1-score for best parameter :  0.33142365453263906


In [84]:
parameter_NB = {
    'C': np.logspace(-4,0,20),
    'multi_class': ['ovr', 'crammer_singer']
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(LinearSVC(penalty='l2', loss='hinge', max_iter=100000), parameter_NB, All_data_X, All_data_Y, ps )



In [85]:
print('Best Parameter for support vector machine : ', bnb_bbof.best_params_) 

Best Parameter for support vector machine :  {'C': 0.00026366508987303583, 'multi_class': 'crammer_singer'}


In [86]:
y_pred = bnb_bbof.predict(trainnig_data_f)
y_pred_val = bnb_bbof.predict(validation_data_f)
y_pred_test = bnb_bbof.predict(test_data_f)

In [87]:
print("Training f1-score")
print("Support vector machine training f1-score for best parameter : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Support vector machine training f1-score for best parameter :  0.43502660424123496


In [88]:
print("Validation f1-score")
print("Support vector machine validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for parameter = ", bnb_bbof.best_params_)

Validation f1-score
Support vector machine validation f1-score :  0.42892538137912545 for parameter =  {'C': 0.00026366508987303583, 'multi_class': 'crammer_singer'}


In [89]:
print("Test f1-score")
print("Support vector machine test f1-score for best parameter : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Support vector machine test f1-score for best parameter :  0.3651041219590659


In [90]:
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(trainnig_data_f, df_.iloc[:,0].values)
ccp_alphas = path.ccp_alphas

In [91]:
parameter_NB = {
    'criterion' : ["gini", "entropy"],
    'ccp_alpha':ccp_alphas
}
ps = PredefinedSplit(test_fold=split_index)
bnb_bbof = grid_search(DecisionTreeClassifier(), parameter_NB, All_data_X, All_data_Y, ps )

ValueError: ccp_alpha must be greater than or equal to 0



In [92]:
print('Best Parameter for decision tree : ', bnb_bbof.best_params_) 

Best Parameter for decision tree :  {'ccp_alpha': 0.0036966786811697405, 'criterion': 'entropy'}


In [93]:
y_pred = bnb_bbof.predict(trainnig_data_f)
y_pred_val = bnb_bbof.predict(validation_data_f)
y_pred_test = bnb_bbof.predict(test_data_f)

In [94]:
print("Training f1-score")
print("Decission tree training f1-score for best parameter : ", f1_score(df_.iloc[:,0].values,y_pred, average='macro'))

Training f1-score
Decission tree training f1-score for best parameter :  0.8163593919317851


In [95]:
print("Validation f1-score")
print("Decission tree validation f1-score : ", f1_score(df_V.iloc[:,0].values,y_pred_val, average='macro'), "for parameter = ", bnb_bbof.best_params_)

Validation f1-score
Decission tree validation f1-score :  0.8219494384333732 for parameter =  {'ccp_alpha': 0.0036966786811697405, 'criterion': 'entropy'}


In [96]:
print("Test f1-score")
print("Decision tree test f1-score for best parameter : ", f1_score(df_T.iloc[:,0].values,y_pred_test, average='macro'))

Test f1-score
Decision tree test f1-score for best parameter :  0.770379286325175
