<a href="https://colab.research.google.com/github/christinezuzart/DeepLearning/blob/master/DrugReviewConditionULMFitFineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive in order to read the train and test CSV files

In [0]:
# Mount Google Drive at /content/gdrive. Later cells read the CSV inputs from,
# and write exported artifacts to, paths under this mount point.
from google.colab import drive 
drive.mount('/content/gdrive')

Populate train and test data in dataframes

In [0]:
import pandas as pd
import io

# Read the raw train and test splits from the mounted Drive folder in one pass;
# the generator yields one DataFrame per path, unpacked in the same order.
dfTrain, dfTest = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/My Drive/Data/drugsComTrain_raw.csv',
        '/content/gdrive/My Drive/Data/drugsComTest_raw.csv',
    )
)


Create train and test dataframes with only two columns - labels and text

In [0]:
# Pull the review text and the condition label out of each raw split as arrays.
train_texts, train_labels = dfTrain['review'].values, dfTrain['condition'].values
test_texts, test_labels = dfTest['review'].values, dfTest['condition'].values

# Two-column frames with the label column first and the text column second.
# NOTE(review): this ordering presumably matches the column positions fastai's
# from_df expects by default (labels at 0, text at 1) - confirm.
col_names = ['labels', 'text']
df_train = pd.DataFrame(dict(labels=train_labels, text=train_texts), columns=col_names)
df_test = pd.DataFrame(dict(labels=test_labels, text=test_texts), columns=col_names)

print("Train shape :", df_train.shape)
print("Test shape :", df_test.shape)

Unescape HTML entities in the review text, then remove examples with a blank condition

In [0]:
import html

# Decode HTML entities in the review text (html.unescape turns sequences such
# as "&amp;" back into "&").
# This cell runs before the rows with missing values are dropped, and
# html.unescape raises TypeError on a non-string (NaN) value. Series.map with
# na_action='ignore' leaves missing reviews untouched so the later dropna can
# remove them instead of this cell crashing.
df_train['text'] = df_train['text'].map(html.unescape, na_action='ignore')
df_test['text'] = df_test['text'].map(html.unescape, na_action='ignore')


In [0]:
# Discard every row that has a missing value in any column (label or text),
# then report how many examples remain in each split.
df_train = df_train.dropna(axis='index', how='any')
df_test = df_test.dropna(axis='index', how='any')

print("Train shape :", df_train.shape)
print("Test shape :", df_test.shape)

Filter df_test to remove examples whose condition is not present in df_train, and store the result in df_test_new

In [0]:
# Distinct condition labels seen in each split.
unique_train_labels_list = df_train.labels.unique()
unique_test_labels_list = df_test.labels.unique()

# Keep only the test rows whose label also occurs in the training split.
# A single vectorized mask replaces the former row-by-row DataFrame.append
# loop, which copied the whole frame on every iteration (quadratic time) and
# relies on an API that was removed in pandas 2.0.
label_seen_in_train = df_test['labels'].isin(unique_train_labels_list)
df_test_new = (
    df_test.loc[label_seen_in_train, ['labels', 'text']]
    .reset_index(drop=True)  # fresh 0..n-1 index, as the append loop produced
)

print("Test shape :", df_test_new.shape)

Check the number of examples corresponding to a particular label in train and test dataframes

In [0]:
# Show the per-label example counts for both splits.
# Only the last expression of a cell is rendered automatically, so the train
# counts were previously computed and silently discarded; print both explicitly.
print("Train label counts:")
print(df_train['labels'].value_counts())
print()
print("Test label counts:")
print(df_test_new['labels'].value_counts())

Create a language model specific data bunch

In [0]:
from fastai.text import *

# Language model data
# Build the language-model DataBunch from the two DataFrames: df_train is the
# training split and df_test_new (test rows restricted to training labels) is
# the validation split. './' is passed as the working path.
# NOTE(review): which column is treated as text is left to from_df's defaults;
# presumably it relies on the ['labels', 'text'] column order - confirm.
data_lm = TextLMDataBunch.from_df('./', train_df=df_train, valid_df=df_test_new)

Check how the data is encoded by fast.ai

In [0]:
# Display a sample batch from the language-model data to inspect how the
# reviews were tokenized/encoded.
data_lm.show_batch()

Create language model learner

In [0]:
# Language model
# Create a learner over data_lm using the AWD_LSTM architecture with
# drop_mult=0.1; this instance is the one used for the learning-rate search.
# NOTE(review): presumably starts from fastai's pretrained weights by default - confirm.
lang_learner = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.1)

Search through a range of learning rates to find the optimum one for our dataset

In [0]:
# Run the learning-rate finder over the range 1e-8 .. 1e2, then plot what the
# recorder captured so a learning rate can be chosen by eye.
lang_learner.lr_find(start_lr=1e-8, end_lr=1e2)
lang_learner.recorder.plot()

Fine tune the language model

In [0]:
# Re-create the language-model learner, this time with drop_mult=0.5, and
# train it for one one-cycle epoch at lr=1e-3.
# NOTE(review): the learning-rate search earlier in the notebook ran on a
# learner built with drop_mult=0.1, so its curve was measured on a different
# configuration than the one trained here - confirm this is intentional.
lang_learner = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
lang_learner.fit_one_cycle(1, 1e-3, moms=(0.8, 0.7))

# Unfreeze the model and continue training for two more epochs at the same lr.
lang_learner.unfreeze() 
lang_learner.fit_one_cycle(2, 1e-3, moms=(0.8, 0.7))

Save the language model

In [0]:
# Save the fine-tuned encoder under the name 'fine_enc'; the classifier cell
# later loads it via load_encoder('fine_enc').
lang_learner.save_encoder('fine_enc')
# Export the whole language-model learner to Google Drive.
# Note: `path` is a notebook-level name that later export cells reassign.
lmmodel_save_name = 'lmexport.pkl'
path = F"/content/gdrive/My Drive/{lmmodel_save_name}" 
print(path)
lang_learner.export(path)


Create the classifier data bunch

In [0]:
# Classifier model data
# Build the classification DataBunch from the same train/validation frames,
# passing the language model's training vocabulary so the classifier is built
# with the same vocab as the fine-tuned encoder. Batch size is 32.
data_clas = TextClasDataBunch.from_df('./', train_df=df_train, valid_df=df_test_new, vocab=data_lm.train_ds.vocab, bs=32)
# Persist the processed DataBunch under the name 'tmp_clas'.
data_clas.save('tmp_clas')

In [0]:
# Classifier
# Create an AWD_LSTM text classifier over data_clas with drop_mult=0.7, load
# the encoder saved as 'fine_enc' by the language-model step, and freeze the
# model before the first training stage.
# NOTE(review): freeze() presumably leaves only the final layer group
# trainable - confirm against the fastai docs.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.7)
learn.load_encoder('fine_enc')
learn.freeze()

Search through a range of learning rates to find the optimum one for our dataset

In [0]:
# Run the learning-rate finder with its default range on the frozen classifier
# and plot what the recorder captured.
learn.lr_find()
learn.recorder.plot()

Fine tune the classifier

In [0]:
# Stage 1: train the frozen classifier for one one-cycle epoch at lr=2e-2.
learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))

In [0]:
# Checkpoint the stage-1 weights as 'first' and reload that checkpoint before
# continuing.
learn.save('first')
learn.load('first')

# Stage 2: freeze_to(-2), then one epoch with a sliced learning rate whose
# upper bound is peak_lr and whose lower bound is peak_lr / 2.6**4.
# NOTE(review): the 2.6 factor looks like the ULMFiT discriminative
# learning-rate heuristic - confirm.
learn.freeze_to(-2)
peak_lr = 1e-2
learn.fit_one_cycle(1, slice(peak_lr / (2.6 ** 4), peak_lr), moms=(0.8, 0.7))

In [0]:
# Checkpoint the stage-2 weights as 'second' and reload that checkpoint before
# continuing.
learn.save('second')
learn.load('second')

# Stage 3: freeze_to(-3), then one epoch with a sliced learning rate whose
# upper bound is peak_lr and whose lower bound is peak_lr / 2.6**4.
learn.freeze_to(-3)
peak_lr = 5e-3
learn.fit_one_cycle(1, slice(peak_lr / (2.6 ** 4), peak_lr), moms=(0.8, 0.7))

In [0]:
# Checkpoint the stage-3 weights as 'third' and reload that checkpoint before
# continuing.
learn.save('third')
learn.load('third')

# Final stage: unfreeze the whole model and train two epochs with a sliced
# learning rate from peak_lr / 2.6**4 up to peak_lr.
learn.unfreeze()
peak_lr = 1e-3
learn.fit_one_cycle(2, slice(peak_lr / (2.6 ** 4), peak_lr), moms=(0.8, 0.7))

Export the classifier trained model

In [0]:
# Export the trained classifier learner to Google Drive.
# Note: this reassigns the notebook-level `path` used by the earlier
# language-model export cell.
model_save_name = 'export2.pkl'
path = F"/content/gdrive/My Drive/{model_save_name}" 
print(path)
learn.export(path)

# Also save the weights as 'trained_model2'; return_path=True makes the call
# return the saved file's path, which is displayed as the cell output.
learn.save("trained_model2", return_path=True)

Get Predictions

In [0]:
# get predictions
# get_preds() is called with its defaults and returns the per-class scores
# (preds) and the true class indices (targets).
# NOTE(review): presumably this evaluates the validation split, i.e.
# df_test_new - confirm.
preds, targets = learn.get_preds()
# Index of the highest-scoring class for each example.
predictions = np.argmax(preds, axis=1)

print(f"softmax predictions: {preds}")
print(f"indexed predictions:{predictions}")
print(f"targets:{targets}")

# Cross-tabulate predicted vs. actual class indices. This must be the final
# expression of the cell: previously it sat in the middle, so the table was
# computed and then silently thrown away. Axis names make the table readable.
pd.crosstab(predictions, targets, rownames=['predicted'], colnames=['actual'])

Print class names

In [0]:
# Class names known to the classifier's data, taken from learn.data.classes.
# NOTE(review): presumably the integer targets/predictions index into this
# list - confirm.
class_names = learn.data.classes
print(class_names)

Confusion Matrix

In [0]:
from sklearn import metrics

# Confusion matrix with rows = true class index and columns = predicted index.
# Pass the full label set explicitly: without `labels`, scikit-learn keeps only
# the classes that occur in targets or predictions, so any class missing from
# both would be dropped and row/column i would no longer correspond to
# learn.data.classes[i].
# NOTE(review): assumes targets/predictions are integer indices into
# learn.data.classes - confirm.
confusion_matrix = metrics.confusion_matrix(
    targets,
    predictions,
    labels=list(range(len(learn.data.classes))),
)
print(confusion_matrix)


Save the confusion matrix to Google Drive and verify predictions

In [0]:
# Write the confusion matrix to a CSV file on Google Drive.
# The matrix is wrapped in a DataFrame without explicit labels, so the CSV
# carries integer row/column headers rather than class names.
filename = 'Confusion2.csv'
path = F"/content/gdrive/My Drive/{filename}" 
pd.DataFrame(confusion_matrix).to_csv(path)

In [0]:
# Display five example rows of model results for a quick visual sanity check.
learn.show_results(rows=5)

In [0]:
# Accuracy from the confusion matrix: entries on the main diagonal are the
# correctly classified examples; the sum of all entries is the number of
# evaluated examples.
diagonal = confusion_matrix.diagonal()
true_positives = diagonal.sum()
total_test_data = confusion_matrix.sum()
accuracy = true_positives / total_test_data * 100

print("Test Accuracy Percentage:", accuracy)