#### Dataset Link: https://archive.ics.uci.edu/dataset/462/drug+review+dataset+drugs+com

### Importing Libraries

In [3]:
import pandas as pd # for data preprocessing
import itertools # for confusion matrix
import string
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import xgboost
import joblib
import os
%matplotlib inline

# If you want to show all the rows of pandas dataframe
# pd.set_option('display.max_rows', None)

ModuleNotFoundError: No module named 'xgboost'

In [10]:
data1 = pd.read_csv('./drug review dataset drugs.com/drugsComTrain_raw.tsv',sep='\t')
data1.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


#### Variable Information
1. drugName (categorical): name of drug
2. condition (categorical): name of condition
3. review (text): patient review
4. rating (numerical): 10 star patient rating
5. date (date): date of review entry
6. usefulCount (numerical): number of users who found review useful

In [11]:
data2 = pd.read_csv('./drug review dataset drugs.com/drugsComTest_raw.tsv',sep='\t')
data2.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [None]:
data1.info(), data2.info()

In [None]:
data1.shape,data2.shape

In [None]:
data = pd.concat([data1,data2],axis=0)
data.duplicated().sum()

In [None]:
data=shuffle(data,random_state=0)
data.shape

In [None]:
data.head()

In [None]:
data.to_csv("./drug review dataset drugs.com/DrugsComPatient_raw.csv",index=False)

In [None]:
main_data = pd.read_csv('./drug review dataset drugs.com/DrugsComPatient_raw.csv')
main_data.head()

In [None]:
main_data['condition'].value_counts()

In [None]:
x = main_data[['condition','review']]
x

## Exploratory Data Analysis

In [None]:
x['condition'].unique().shape

In [None]:
x['condition'].value_counts()>=1000

In [None]:
condition_counts = x["condition"].value_counts()
condition_counts

In [None]:
condition_counts.head(5).index

In [None]:
valid_condition = condition_counts[condition_counts>=4000].index
len(valid_condition),valid_condition

In [None]:
x = x[x['condition'].isin(valid_condition)]
x

In [None]:
x['condition'].value_counts()

In [None]:
# Review counts of the five most frequent conditions (value_counts sorts descending)
top5_counts = x['condition'].value_counts().head(5)
cond_5_ind = top5_counts.index
cond_5_val = top5_counts.values

sns.set_style(style="ticks")
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    cond_5_val,
    labels=cond_5_ind,
    autopct='%.2f%%',
    startangle=120,
    colors=sns.color_palette("deep6"),
)
ax.set_title('Proportion Top 5 Conditions', fontsize=20)
fig.tight_layout()
plt.show()

In [None]:
x.head()

In [None]:
# Segregating dataframe for analyzing individual condition
from wordcloud import WordCloud
import PIL.Image  # import the submodule explicitly: a bare `import PIL` does not guarantee PIL.Image is loaded
from IPython.display import Image

# Load the mask once and close the file handle as soon as the pixels are copied out.
with PIL.Image.open('./Med1.jpg') as mask_file:
    img_mask = np.array(mask_file)
# display(img_mask)

# One word cloud per retained condition, built from all of that condition's reviews.
for condition in valid_condition:
    condition_reviews = x.loc[x['condition'] == condition, 'review']
    wc = WordCloud(max_words=200,
                   colormap='BuPu_r', mask=img_mask, background_color='black').generate(' '.join(condition_reviews))
    plt.figure(figsize=(10,8))
    plt.axis('off')
    plt.imshow(wc)
    plt.title(f"Word Cloud for {condition}",fontsize=20)

## Data Preprocessing

In [None]:
x[x['review'].str.contains('#')].reset_index(drop=True).loc[0,"review"]

In [None]:
if '#' in x[x['review'].str.contains('#')].reset_index(drop=True).loc[0,"review"]:
    print("Yes")

In [None]:
# Remove literal double-quote characters from every (string) column.
# `x` is a filtered slice of an earlier frame, so build a new frame with
# .assign instead of writing through .loc (avoids SettingWithCopyWarning and
# keeps the cell safe to re-run). regex=False makes the literal match explicit.
x = x.assign(**{col: x[col].str.replace('"', '', regex=False) for col in x.columns})
x[x['review'].str.contains('#')].reset_index(drop=True).loc[0,"review"]

In [None]:
# To set the width of the column to maximum
# pd.set_option('display.max_colwidth', -1)

In [None]:
x.head()

### Stopwords

What are stopwords?

Stopwords are the most common words in any natural language. For the purpose of analyzing text data and building NLP models, these stopwords might not add much value to the meaning of the document.

For example, English stopwords include 'a', 'the', 'is', 'an', 'in', 'on', 'at', 'to', and 'of'.

In [None]:
from nltk.corpus import stopwords as nltk_stopwords

# Keep the corpus reader under its own alias; `stopwords` is the plain list of
# English stopwords that the cleaning function looks words up in.
stopwords = nltk_stopwords.words('english')
len(stopwords),stopwords

### Lemmatization

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

In [None]:
from IPython.display import Image, display
img=Image(filename='./stem_vs_lemma.png')
display(img)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

lemmatizer = WordNetLemmatizer()

def get_tag(tag):
    """Translate a POS tag into the one-letter POS code passed to the lemmatizer.

    Tags starting with 'J' map to 'a'; tags starting with 'V', 'N' or 'R' map
    to their lower-cased first letter; anything else (including an empty
    string) maps to None.
    """
    first_letter_to_code = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    # tag[:1] is '' for an empty tag, which is simply absent from the table.
    return first_letter_to_code.get(tag[:1])
    
def lemmatize_word(words):
    """Lemmatize a list of tokens using their part-of-speech tags.

    Each token is tagged with `pos_tag`; when `get_tag` yields a POS code the
    token is lemmatized with that code, otherwise it is kept unchanged.
    Returns a list with one entry per input token, in order.
    """
    lemmas = []
    for token, treebank_tag in pos_tag(words):
        wordnet_pos = get_tag(treebank_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos) if wordnet_pos else token)
    return lemmas

In [None]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
import string

punct = string.punctuation

In [None]:
def clean_words(raw_review):
    """Normalise one raw review into a space-separated string of lemmatized tokens.

    Parameters
    ----------
    raw_review : str
        Review text. Any non-string value (for example a missing review read
        as NaN) yields an empty string instead of raising.

    Returns
    -------
    str
        Lower-cased, stopword-free, lemmatized tokens joined by single spaces.
    """
    if not isinstance(raw_review, str):
        return ""
    # 1. Extract the text content with BeautifulSoup's html.parser
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. Replace every non-letter with a space. After this step the text holds
    #    only ASCII letters and spaces, so no separate punctuation filter is needed.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Lower-case and tokenize
    tokens = word_tokenize(review_text.lower())
    # 4. Drop stopwords (set membership instead of scanning the list per token)
    stop_set = set(stopwords)
    tokens = [token for token in tokens if token not in stop_set]
    # 5. Lemmatize and join back into one string
    return " ".join(lemmatize_word(tokens))

In [None]:
x.head()

In [None]:
# Add the cleaned text as a new column. `x` comes from boolean filtering, so
# create a new frame with .assign rather than writing a column through .loc on
# a slice (which triggers SettingWithCopyWarning).
x = x.assign(review_clean=x['review'].apply(clean_words))
x.head()

In [None]:
# review_text = "haha     jjjj    ksdkdk lsls"
    
# words = review_text.lower().split()
# words

## Creating Features and Target Variable

In [None]:
x

In [None]:
x_feat=x['review_clean']
y=x['condition']
x_feat.shape,y.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Always fit on the original condition names. Fitting on `y` itself would, on a
# second run of this cell, encode the already-encoded integers and overwrite
# the saved encoder with integer classes.
y = le.fit_transform(x['condition'])
print(y, len(le.classes_),le.classes_,le.inverse_transform(y))
joblib.dump(le,'./label_encoder.pkl')

In [None]:
pip install joblib

In [None]:
import joblib

In [None]:
# Stratified 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_feat, y, test_size=0.2, random_state=0, stratify=y
)

# Persist the label splits; makedirs(exist_ok=True) replaces the
# check-then-mkdir pair with one idempotent call.
os.makedirs("./Data", exist_ok=True)
joblib.dump(y_train, './Data/label_train.pkl')
joblib.dump(y_test, './Data/label_test.pkl')
x_train.shape,x_test.shape,y_train.shape,y_test.shape

In [None]:
import os


## Bag of Words -> CountVectorizer

In [None]:
count_vectorizer = CountVectorizer(stop_words='english',analyzer='word')
len(count_vectorizer.get_stop_words()),count_vectorizer.get_stop_words()

In [None]:
# Learn the vocabulary on the training reviews only, then reuse it for the test reviews.
count_train = count_vectorizer.fit_transform(x_train)
count_test = count_vectorizer.transform(x_test)

# Persist the count matrices; makedirs(exist_ok=True) is idempotent and avoids
# the check-then-create race of os.path.exists + os.mkdir.
os.makedirs("./Data", exist_ok=True)
joblib.dump(count_train, './Data/count_train.pkl')
joblib.dump(count_test, './Data/count_test.pkl')

# Persist the fitted vectorizer itself.
os.makedirs("./Bag of Words", exist_ok=True)
joblib.dump(count_vectorizer, "./Bag of Words/count_vectorizer.pkl")

In [None]:
print(count_train)

## Modeling

In [None]:
# Output directory for every fitted model; a single idempotent call.
os.makedirs("./Models", exist_ok=True)

### Naive Bayes

In [None]:
# alpha is the smoothing parameter, a constant that is added to the frequency of each word
# to prevent zero probabilities
mnb = MultinomialNB(alpha=0.1).fit(count_train, y_train)
y_pred = mnb.predict(count_test)

joblib.dump(mnb, './Models/mnb_countvec_model.pkl')

# Raw predictions next to the ground truth, then the summary metrics.
print(y_pred)
print(y_test)
print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion-matrix heatmap labelled with the original condition names.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix Naive Bayes", fontsize=25)
ax.set_xlabel("Predicted Value", fontsize=20)
ax.set_ylabel("True Value", fontsize=20)
fig.tight_layout()

### K-Nearest Neighbors

In [None]:
# K-nearest neighbours on the bag-of-words counts.
knn = KNeighborsClassifier(n_neighbors=15, weights='distance', leaf_size=30, p=2)
knn.fit(count_train,y_train)
y_pred = knn.predict(count_test)

# This model is trained on CountVectorizer features, so it is saved with the
# same '_countvec_' suffix as the other count-vector models (it was 'knn_w2v_model.pkl').
joblib.dump(knn, './Models/knn_countvec_model.pkl')

print(y_pred)
print(y_test)
print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")
print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion-matrix heatmap labelled with the original condition names.
cm =confusion_matrix(y_test,y_pred)
plt.figure(figsize=(10,8))
sns.heatmap(cm,annot=True,fmt='d',cmap='coolwarm',xticklabels=le.classes_,yticklabels=le.classes_)
plt.title("Confusion Matrix K-Nearest Neighbors",fontsize=25)
plt.xlabel("Predicted Value",fontsize=20)
plt.ylabel("True Value",fontsize=20)
plt.tight_layout()

### Support Vector Machines

In [None]:
# Linear-kernel support vector classifier on the bag-of-words counts.
svc = SVC(kernel='linear', C=10).fit(count_train, y_train)
y_pred = svc.predict(count_test)

joblib.dump(svc, './Models/svc_countvec_model.pkl')

# Raw predictions next to the ground truth, then the summary metrics.
print(y_pred)
print(y_test)
print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")
print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion-matrix heatmap labelled with the original condition names.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix Support Vector Machine - SVC", fontsize=25)
ax.set_xlabel("Predicted Value", fontsize=20)
ax.set_ylabel("True Value", fontsize=20)
fig.tight_layout()

### Passive Aggressive Classifier

In [None]:
# Passive-aggressive linear classifier on the bag-of-words counts (seeded).
pac = PassiveAggressiveClassifier(max_iter=1000, C=0.1, random_state=0).fit(count_train, y_train)
y_pred = pac.predict(count_test)

joblib.dump(pac, './Models/pac_countvec_model.pkl')

# Raw predictions next to the ground truth, then the summary metrics.
print(y_pred)
print(y_test)
print(f"Accuracy score: {accuracy_score(y_test,y_pred)}\n")
print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion-matrix heatmap labelled with the original condition names.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix Passive Aggresive Classifier", fontsize=25)
ax.set_xlabel("Predicted Value", fontsize=20)
ax.set_ylabel("True Value", fontsize=20)
fig.tight_layout()

### Logistic Regression

In [None]:
# Logistic regression on the bag-of-words counts.
lr = LogisticRegression(max_iter=1000, C=10).fit(count_train, y_train)
y_pred = lr.predict(count_test)

joblib.dump(lr, './Models/lr_countvec_model.pkl')

# Raw predictions next to the ground truth, then the summary metrics.
print(y_pred)
print(y_test)
print(f"Accuracy score: {accuracy_score(y_test,y_pred)}\n")
print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion-matrix heatmap labelled with the original condition names.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix Logistic Regression", fontsize=25)
ax.set_xlabel("Predicted Value", fontsize=20)
ax.set_ylabel("True Value", fontsize=20)
fig.tight_layout()

### Random Forest

In [None]:
# 50 n_estimators in order to fasten the process
rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(count_train, y_train)
y_pred = rf.predict(count_test)

joblib.dump(rf, './Models/rf_countvec_model.pkl')

# Raw predictions next to the ground truth, then the summary metrics.
print(y_pred)
print(y_test)
print(f"Accuracy score: {accuracy_score(y_test,y_pred)}\n")
print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion-matrix heatmap labelled with the original condition names.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix Random Forest", fontsize=25)
ax.set_xlabel("Predicted Value", fontsize=20)
ax.set_ylabel("True Value", fontsize=20)
fig.tight_layout()

### XGBoost

In [None]:
# Gradient-boosted trees (1000 estimators, multi-class softmax objective) on the bag-of-words counts.
xgb = xgboost.XGBClassifier(n_estimators=1000, objective='multi:softmax').fit(count_train, y_train)
y_pred = xgb.predict(count_test)

joblib.dump(xgb, './Models/xgb_countvec_model.pkl')

# Raw predictions next to the ground truth, then the summary metrics.
print(y_pred)
print(y_test)
print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")
print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
print("=====================================================\n")

# Confusion matrix over every encoded class index, labelled with condition names.
cm = confusion_matrix(y_test, y_pred, labels=range(len(le.classes_)))
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix XGBoost", fontsize=25)
ax.set_xlabel("Predicted Value", fontsize=20)
ax.set_ylabel("True Value", fontsize=20)
fig.tight_layout()

In [None]:
pip install xgboost


In [None]:
pip install --upgrade pip setuptools wheel


In [None]:
xgb.get_params(),xgb.n_estimators

## Bag of Words -> TFIDFVectorizer

In [None]:
List_gram=[]

In [None]:
from nltk.tokenize import word_tokenize
tfidf_vectorizer_unigram = TfidfVectorizer(tokenizer=word_tokenize,stop_words='english',token_pattern=None,ngram_range=(1,1))
len(tfidf_vectorizer_unigram.get_stop_words()), tfidf_vectorizer_unigram.get_stop_words()

In [None]:
# Fit TF-IDF on the training reviews only, then transform the test reviews.
tfidf_train_unigram = tfidf_vectorizer_unigram.fit_transform(x_train)
tfidf_test_unigram = tfidf_vectorizer_unigram.transform(x_test)

os.makedirs("./Data", exist_ok=True)
joblib.dump(tfidf_train_unigram, './Data/tfidf_train_unigram.pkl')
joblib.dump(tfidf_test_unigram, './Data/tfidf_test_unigram.pkl')

os.makedirs("./Bag of Words", exist_ok=True)
joblib.dump(tfidf_vectorizer_unigram,'./Bag of Words/tfidf_vectorizer_(1,1)-gram_unigram.pkl')

# Register this feature set for the model loops. When the cell is re-run the
# existing entry is replaced in place, so List_gram never accumulates duplicates
# (a plain append would make every model below train once per duplicate).
unigram_entry = ("TFIDF (1,1)-gram_Unigram",tfidf_train_unigram,tfidf_test_unigram,"(1,1)-gram_unigram")
for slot, entry in enumerate(List_gram):
    if entry[3] == unigram_entry[3]:
        List_gram[slot] = unigram_entry
        break
else:
    List_gram.append(unigram_entry)

In [None]:
print(tfidf_train_unigram)

## TFIDFVectorizer: (1,2)-gram

In [None]:
from nltk.tokenize import word_tokenize
tfidf_vectorizer_ubigram = TfidfVectorizer(tokenizer=word_tokenize,stop_words='english',token_pattern=None, ngram_range=(1,2))

In [None]:
# Fit TF-IDF (unigrams + bigrams) on the training reviews only, then transform the test reviews.
tfidf_train_ubigram = tfidf_vectorizer_ubigram.fit_transform(x_train)
tfidf_test_ubigram = tfidf_vectorizer_ubigram.transform(x_test)

os.makedirs("./Data", exist_ok=True)
joblib.dump(tfidf_train_ubigram, './Data/tfidf_train_ubigram.pkl')
joblib.dump(tfidf_test_ubigram, './Data/tfidf_test_ubigram.pkl')

os.makedirs("./Bag of Words", exist_ok=True)
joblib.dump(tfidf_vectorizer_ubigram,'./Bag of Words/tfidf_vectorizer_(1,2)-gram.pkl')

# Register this feature set for the model loops. When the cell is re-run the
# existing entry is replaced in place, so List_gram never accumulates duplicates.
ubigram_entry = ("TFIDF (1,2)-Gram",tfidf_train_ubigram,tfidf_test_ubigram,"(1,2)-gram")
for slot, entry in enumerate(List_gram):
    if entry[3] == ubigram_entry[3]:
        List_gram[slot] = ubigram_entry
        break
else:
    List_gram.append(ubigram_entry)

In [None]:
print(tfidf_train_ubigram)

## TFIDFVectorizer: (1,3)-gram

In [None]:
# from nltk.tokenize import word_tokenize
# tfidf_vectorizer_ubitrigram = TfidfVectorizer(tokenizer=word_tokenize,stop_words='english',token_pattern=None, ngram_range=(1,3))

In [None]:
# tfidf_train_ubitrigram = tfidf_vectorizer_ubitrigram.fit_transform(x_train)
# tfidf_test_ubitrigram = tfidf_vectorizer_ubitrigram.transform(x_test)

# if not os.path.exists("./Bag of Words"):
#     os.mkdir("./Bag of Words")
# joblib.dump(tfidf_vectorizer_ubitrigram,'./Bag of Words/tfidf_vectorizer_(1,3)-gram.pkl')
# List_gram.append(("TFIDF (1,3)-Gram",tfidf_train_ubitrigram,tfidf_test_ubitrigram,"(1,3)-gram"))

In [None]:
# print(tfidf_train_ubitrigram)

### Naive Bayes

In [None]:
# One fitted Naive Bayes model per TF-IDF feature set, in List_gram order.
mnb_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    mnb = MultinomialNB(alpha=0.1).fit(x_tr, y_train)
    y_pred = mnb.predict(x_ts)

    joblib.dump(mnb, f'./Models/mnb_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix Naive Bayes - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    mnb_model.append(mnb)

### K-Nearest Neighbors

In [None]:
# One fitted K-nearest-neighbours model per TF-IDF feature set, in List_gram order.
knn_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    knn = KNeighborsClassifier(n_neighbors=15, weights='distance', leaf_size=30, p=2).fit(x_tr, y_train)
    y_pred = knn.predict(x_ts)

    joblib.dump(knn, f'./Models/knn_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix K-Nearest Neighbors - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    knn_model.append(knn)

### Support Vector Machines

In [None]:
# One fitted linear SVC per TF-IDF feature set, in List_gram order.
svc_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    svc = SVC(kernel='linear', C=10).fit(x_tr, y_train)
    y_pred = svc.predict(x_ts)

    joblib.dump(svc, f'./Models/svc_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix Support Vector Machine - SVC - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    svc_model.append(svc)

### Passive Aggressive Classifier

In [None]:
# One fitted passive-aggressive classifier per TF-IDF feature set, in List_gram order.
pac_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    pac = PassiveAggressiveClassifier(max_iter=1000, C=0.1, random_state=0).fit(x_tr, y_train)
    y_pred = pac.predict(x_ts)

    joblib.dump(pac, f'./Models/pac_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix Passive Aggresive Classifier - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    pac_model.append(pac)

### Logistic Regression

In [None]:
# One fitted logistic-regression model per TF-IDF feature set, in List_gram order.
lr_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    lr = LogisticRegression(max_iter=1000, C=10).fit(x_tr, y_train)
    y_pred = lr.predict(x_ts)

    joblib.dump(lr, f'./Models/lr_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix Logistic Regression - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    lr_model.append(lr)

### Random Forest

In [None]:
# One fitted random forest per TF-IDF feature set, in List_gram order.
rf_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    # 50 n_estimators in order to fasten the process
    rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(x_tr, y_train)
    y_pred = rf.predict(x_ts)

    joblib.dump(rf, f'./Models/rf_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix Random Forest - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    rf_model.append(rf)

### XGBoost

In [None]:
# One fitted XGBoost classifier per TF-IDF feature set, in List_gram order.
xgb_model = []
for name, x_tr, x_ts, modname in List_gram:
    print(f"{name}\n")
    xgb = xgboost.XGBClassifier(n_estimators=1000, objective='multi:softmax').fit(x_tr, y_train)
    y_pred = xgb.predict(x_ts)

    joblib.dump(xgb, f'./Models/xgb_tfidf_{modname}_model.pkl')

    # Raw predictions next to the ground truth, then the summary metrics.
    print(y_pred)
    print(y_test)
    print(f"\nAccuracy score: {accuracy_score(y_test,y_pred)}\n")

    print(f"Classification report:\n{classification_report(y_test,y_pred,digits=3)}")
    print("=====================================================\n")

    # Confusion-matrix heatmap labelled with the original condition names.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    ax.set_title(f"Confusion Matrix XGBoost - {name}", fontsize=25)
    ax.set_xlabel("Predicted Value", fontsize=20)
    ax.set_ylabel("True Value", fontsize=20)
    fig.tight_layout()
    xgb_model.append(xgb)

In [None]:
xgb.get_params()

In [None]:
# def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
#     labelid = list(classifier.classes_).index(classlabel)
#     feature_names = vectorizer.get_feature_names_out()
#     topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

#     for coef, feat in topn:
#         print(classlabel, feat, coef)

In [None]:
list(le.transform(["Acne"]))

In [None]:
pac.coef_.shape

In [None]:
lr.coef_.shape

In [None]:
mnb.n_features_in_

In [None]:
pac.coef_.shape

In [None]:
tfidf_vectorizer_ubigram.get_feature_names_out().shape

In [None]:
xgb.feature_importances_

In [None]:
rf.feature_importances_.shape

In [None]:
le.transform(["Birth Control"])[0]

In [None]:
rf.n_features_in_

## Predict

In [None]:
x.iloc[100,1]

In [None]:
import pandas as pd

main_data = pd.read_csv('./drug review dataset drugs.com/DrugsComPatient_raw.csv')
main_data.head()

In [None]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords as nltk_stopwords

# Module-level lookups read by clean_words: the English stopword list and the
# ASCII punctuation characters.
stopwords = nltk_stopwords.words('english')
punct = string.punctuation

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

lemmatizer = WordNetLemmatizer()

def get_tag(tag):
    """Return the one-letter POS code for a POS tag, or None when there is none.

    'J...' -> 'a'; 'V...', 'N...', 'R...' -> 'v', 'n', 'r'; anything else -> None.
    """
    initial = tag[:1]  # '' for an empty tag
    if initial == 'J':
        return 'a'
    if initial in ('V', 'N', 'R'):
        return initial.lower()
    return None
    
def lemmatize_word(words):
    """Return the lemma of every token in `words`, guided by its POS tag.

    Tokens whose tag has no POS code (per `get_tag`) are returned unchanged.
    """
    def _lemma(token, treebank_tag):
        # Lemmatize only when the tag maps to a POS code.
        wordnet_pos = get_tag(treebank_tag)
        return lemmatizer.lemmatize(token, wordnet_pos) if wordnet_pos else token

    return [_lemma(token, treebank_tag) for token, treebank_tag in pos_tag(words)]

In [None]:
def clean_words(raw_review):
    """Turn one raw review into a space-separated string of lemmatized tokens.

    Pipeline: extract text with BeautifulSoup, blank out every non-letter,
    lower-case, tokenize, drop punctuation tokens and stopwords, lemmatize,
    and join with single spaces.
    """
    plain_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text).lower()
    kept_tokens = [
        token
        for token in word_tokenize(letters_only)
        if token not in punct and token not in stopwords
    ]
    return " ".join(lemmatize_word(kept_tokens))

In [None]:
def extract_top_drugs(label, top_n=3):
    """Return up to `top_n` distinct drug names recommended for a condition.

    Reviews in the global `main_data` frame are kept when rating >= 9 and
    usefulCount >= 100, ordered by rating then usefulCount (both descending),
    and restricted to rows whose condition equals `label`.

    Parameters
    ----------
    label : str
        Condition name to look up.
    top_n : int, default 3
        Maximum number of drugs to return.

    Returns
    -------
    list of str
        Distinct drug names, best-ranked first; empty when nothing qualifies.
    """
    well_rated = main_data[(main_data['rating'] >= 9) & (main_data['usefulCount'] >= 100)]
    ranked = well_rated.sort_values(by=['rating', 'usefulCount'], ascending=[False, False])
    drugs_for_label = ranked.loc[ranked['condition'] == label, 'drugName']
    # Several top reviews can be about the same drug; keep each drug once
    # (first occurrence = best-ranked review) before taking the top entries.
    return drugs_for_label.drop_duplicates().head(top_n).tolist()

In [None]:
import joblib

# NOTE: joblib.load unpickles its input, so only load artifact files you trust.
# Load LabelEncoder
le = joblib.load('./label_encoder.pkl')

# Load TFIDFVectorizer
tfidf_ubigram = joblib.load('./Bag of Words/tfidf_vectorizer_(1,2)-gram.pkl')

# Load Model: display name -> saved (1,2)-gram TF-IDF model file
model_files = [
    ("Naive Bayes", "./Models/mnb_tfidf_(1,2)-gram_model.pkl"),
    ("K-Nearest Neighbors", "./Models/knn_tfidf_(1,2)-gram_model.pkl"),
    ("Support Vector Machine", "./Models/svc_tfidf_(1,2)-gram_model.pkl"),
    ("Passive Aggressive Classifier", "./Models/pac_tfidf_(1,2)-gram_model.pkl"),
    ("Logistic Regression", "./Models/lr_tfidf_(1,2)-gram_model.pkl"),
    ("Random Forest", "./Models/rf_tfidf_(1,2)-gram_model.pkl"),
    ("XGBoost", "./Models/xgb_tfidf_(1,2)-gram_model.pkl"),
]
models = {display_name: joblib.load(path) for display_name, path in model_files}

In [None]:
models['Logistic Regression'].coef_.shape

In [None]:
models['Support Vector Machine'].coef_.shape

In [None]:
model = models['XGBoost']
def predict_condition(text):
    """Predict the condition name for one raw review.

    The text is cleaned with `clean_words`, vectorized with the global
    `tfidf_ubigram`, classified by the global `model`, and the predicted code
    is mapped back to its label with the global encoder `le`.
    """
    features = tfidf_ubigram.transform([clean_words(text)])
    encoded_label = model.predict(features)[0]
    return le.inverse_transform([encoded_label])[0]

In [None]:
sentences = ["I have situational depression, never dealt with it before. Wellbutrin had definitely helped get me out of bed each day. The &quot;situation&quot; is becoming better as well but I find my emotions aren&#039;t as out of whack. I cry less &amp; get angry less. I do feel it made me more anxious which I&#039;m already dealing with &amp; taking Buspirone for. I didn&#039;t experience many side effects BUT one that is hard to deal with but because it seems to make me get through each day a little better, I deal with it. It makes my throat feel closed, which is very annoying. I feel often like I can&#039;t breathe well &amp; that I can&#039;t swallow or that something is stuck in my throat. I&#039;ve not taken any other pills on several days to determine that it was the Wellbutrin that did it &amp; it is."]

In [None]:
# Predict the condition for the sample review, then look up its top drugs.
prediction = predict_condition(sentences[0])
top_drugs = extract_top_drugs(prediction)

print("Condition:",prediction)
print("Top Drugs:")
for rank, drug in enumerate(top_drugs, start=1):
    print(f"{rank}. {drug}")

In [None]:
models['Support Vector Machine'].coef_[2][0].shape

In [None]:
models['Logistic Regression'].coef_[1].shape

In [None]:
models['Support Vector Machine'].coef_[0].toarray().shape

In [None]:
models['Naive Bayes'].feature_log_prob_.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_top_features(vectorizer, classifier, class_label, le, top_features=10):
    """Plot the highest- and lowest-weighted features of one class.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing get_feature_names_out()
    classifier : fitted model
        LogisticRegression / PassiveAggressiveClassifier use coef_;
        MultinomialNB uses feature_log_prob_. For any other model a message is
        printed and nothing is plotted.
    class_label : str
        Class name; converted to its row index with `le.transform`.
    le : fitted label encoder
    top_features : int, default 10
        Number of features shown in each panel.

    Returns
    -------
    None
    """
    import heapq  # local import so this cell does not depend on another cell's imports

    feature_names = vectorizer.get_feature_names_out()
    class_index = le.transform([class_label])[0]

    if isinstance(classifier, (LogisticRegression, PassiveAggressiveClassifier)):
        coef = classifier.coef_[class_index]
    elif isinstance(classifier, MultinomialNB):
        coef = classifier.feature_log_prob_[class_index]
    elif isinstance(classifier, (RandomForestClassifier, xgboost.XGBClassifier, KNeighborsClassifier)):
        print(f"The model '{type(classifier).__name__}' does not provide direct feature importances.")
        return
    else:
        print(f"The model '{type(classifier).__name__}' has no suitable attribute for extracting feature importances. Unable to plot the top feature for label {class_label}.")
        return

    # Select only the extremes instead of fully sorting the whole vocabulary twice.
    # heapq.nlargest / nsmallest are documented as equivalent to
    # sorted(..., reverse=True)[:n] / sorted(...)[:n].
    scored_features = list(zip(coef, feature_names))

    def weight_of(pair):
        return pair[0]

    top_positive_coefficients = heapq.nlargest(top_features, scored_features, key=weight_of)
    top_negative_coefficients = heapq.nsmallest(top_features, scored_features, key=weight_of)

    # Plotting: left panel = largest weights, right panel = smallest weights.
    fig, axes = plt.subplots(1, 2, figsize=(10, 6))
    panels = (
        (top_positive_coefficients, "Blues_d", "Positive"),
        (top_negative_coefficients, "Reds_d", "Negative"),
    )
    for ax, (pairs, palette, sign) in zip(axes, panels):
        weights = [weight for weight, _ in pairs]
        names = [feature for _, feature in pairs]
        sns.barplot(x=weights, y=names, palette=palette, hue=names, ax=ax)
        ax.set_title(f"Top {top_features} {sign} Features ({class_label})")

    fig.tight_layout()
    plt.show()
    
vectorizer = tfidf_ubigram
classifier = models['Support Vector Machine']
class_label = 'Depression'
plot_top_features(vectorizer, classifier, class_label,le)