**Install packages**

In [None]:
!pip install pandas
!pip install numpy
!pip install re

**Import packages**

In [1]:
import pandas as pd
import numpy as np
import re

**Preprocess data**

In [15]:
#Preprocessing

#### Loading data ####

def loadDatasets(filepath1, filepath2):
    """Read two CSV files and stack them row-wise into a single DataFrame.

    Rows read from ``filepath1`` come first, followed by the rows read from
    ``filepath2``. Each part keeps the row labels it was read with, so the
    returned index can contain repeated labels.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (filepath1, filepath2)]
    return pd.concat(frames)

# Load the test and train CSV files and stack them into one frame.
# NOTE(review): both paths are absolute and machine-specific; the cell only runs
# unchanged on a machine that has exactly this directory layout.
drug_reviews = loadDatasets(
    filepath1="/Users/evasahlholdt/Desktop/MA/NLP/Exam/Code/NLP_exam/datasets/drugsComTest_raw.csv",
    filepath2="/Users/evasahlholdt/Desktop/MA/NLP/Exam/Code/NLP_exam/datasets/drugsComTrain_raw.csv",
)

#### General cleaning #####

def generalCleaning(dataset):
    """Return a cleaned copy of the raw review frame.

    Steps, in order:
      1. drop the unused ``date`` and ``usefulCount`` columns;
      2. drop rows with a missing ``condition`` or ``drugName``;
      3. keep only the first row per ``uniqueID``;
      4. keep only the first row per ``review`` text (the same review appears
         to be listed under both the generic and the brand drug name);
      5. drop rows whose ``condition`` contains "found this comment helpful".
    """
    cleaned = (
        dataset
        .drop(columns=["date", "usefulCount"])
        .dropna(subset=["condition", "drugName"])
        .drop_duplicates(subset=["uniqueID"])
        .drop_duplicates(subset=["review"])
    )
    # These rows carry a "found this comment helpful" snippet instead of a real condition name.
    is_helpful_snippet = cleaned["condition"].str.contains("found this comment helpful")
    return cleaned[is_helpful_snippet == False]

# Rebind drug_reviews to the cleaned frame; after this line the raw concatenated
# frame is no longer reachable under this name.
drug_reviews = generalCleaning(drug_reviews)

##### Create depression dataset #####

def createDepData(dataset):
    """Return the depression-related reviews with ``condition`` set to 'Depression'.

    A row is kept when its ``condition`` contains "Depres" and does not contain
    any of the excluded labels (neurotic, postpartum or persistent depression).
    Rows with a missing ``condition`` are dropped instead of raising.

    The input frame is left untouched: the result is a new DataFrame, so the
    relabelling no longer writes into a filtered slice of the caller's data.
    """
    excluded_labels = (
        "Neurotic Depression",            # drop neurotic depression
        "Postpartum Depression",          # drop postpartum depression
        "Persistent Depressive Disorde",  # drop persistent depression
    )
    condition = dataset['condition']
    # Plain substring matching; na=False treats missing conditions as "no match".
    keep = condition.str.contains("Depres", regex=False, na=False)
    for label in excluded_labels:
        keep = keep & ~condition.str.contains(label, regex=False, na=False)
    # Every remaining row is a depression condition, so collapse them all into one class.
    return dataset[keep].assign(condition='Depression')

# Depression-only subset of the cleaned reviews; drug_reviews itself is not rebound here.
drug_reviews_dep = createDepData(drug_reviews)

#### Cleaning the reviews #####

#Function for cleaning the reviews for HTML codes (substitute with corresponding characters)

def cleanReviews(dataset):
    """Replace HTML entities in the ``review`` column with plain characters.

    Returns a new DataFrame; the frame passed in is not modified. Entities are
    replaced as literal substrings (``regex=False``), so the result does not
    depend on the pandas version's default for ``regex``.
    """
    # Ordered (entity, replacement) pairs. '&deg;' is listed before the bare
    # '&deg' so that the trailing semicolon is consumed; the bare form is kept
    # so that text without the semicolon is still handled as before.
    entity_replacements = (
        ('&#039;', "'"),
        ('&rsquo;', "'"),
        ('&acute;', "'"),
        ('&lsquo;', "'"),
        ('&amp;', "and"),
        ('&quot;', '"'),
        ('&ldquo;', '"'),
        ('&rdquo;', '"'),
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&ge;', '≥'),
        ('&pound;', 'pound'),
        ('&hellip;', '...'),
        ('&nbsp;', ' '),
        ('&deg;', 'degree'),
        ('&deg', 'degree'),
        ('&bull;', '-'),
        ('&ndash;', '-'),
    )
    reviews = dataset['review']
    for entity, replacement in entity_replacements:
        reviews = reviews.str.replace(entity, replacement, regex=False)
    return dataset.assign(review=reviews)

# Rebind drug_reviews_dep to the frame returned by cleanReviews (HTML entities replaced in the reviews).
drug_reviews_dep = cleanReviews(drug_reviews_dep)

**Create SSRI and SNRI drug classes**

In [None]:
#First, extract all drugs from the drugName column.
dep_drugs = drug_reviews_dep["drugName"].unique().tolist()
# Printing the list lets us copy-paste the drug names into str.replace and then
# change every ", " separator into "|".
print(dep_drugs)
# A bare `len(dep_drugs)` is only displayed when it is the last expression of a
# cell; more code follows in this cell, so print the count explicitly.
print(len(dep_drugs))

def extractDrugClass(dataset):
    """Map ``drugName`` to the class labels 'SSRI' / 'SNRI' and keep only those rows.

    Returns a new DataFrame in which
      * every listed brand/generic name inside ``drugName`` is replaced
        (case-insensitively) by 'SSRI' or 'SNRI';
      * only rows whose ``drugName`` then contains 'SSRI' or 'SNRI' are kept;
      * the leftovers 'SNRI XR', 'SSRI CR' and 'SSRI SSRI' are collapsed to the
        bare class label;
      * rows containing 'SSRI / olanzapine' (a combination product) are dropped.

    The frame passed in is not modified. ``regex=True`` is passed explicitly
    because the name lists are alternation patterns: with the ``regex=False``
    default of pandas >= 2.0 they would be treated as one literal string and
    never match.
    """
    ssri_names = 'Zoloft|Sertraline|Celexa|Citalopram|Lexapro|Escitalopram|Prozac|Fluoxetine|Trintellix|Vortioxetine|Viibryd|Vilazodone|Paxil|Paroxetine|Luvox|Fluvoxamine|Symbyax|Weekly'
    snri_names = 'Fetzima|Levomilnacipran|Effexor|Venlafaxine|Cymbalta|Duloxetine|Pristiq|Desvenlafaxine'

    drug_class = (
        dataset['drugName']
        .str.replace(ssri_names, 'SSRI', case=False, regex=True)
        .str.replace(snri_names, 'SNRI', case=False, regex=True)
    )
    new_dataset = dataset.assign(drugName=drug_class)
    # na=False: rows without a drug name are simply not selected.
    new_dataset = new_dataset[new_dataset['drugName'].str.contains("SSRI|SNRI", na=False)]

    # Collapse what is left of suffixed or doubled names, e.g. "<SNRI> XR" or "<SSRI> Weekly".
    tidy_names = new_dataset['drugName']
    for leftover, plain in (('SNRI XR', 'SNRI'), ('SSRI CR', 'SSRI'), ('SSRI SSRI', 'SSRI')):
        tidy_names = tidy_names.str.replace(leftover, plain, regex=False)
    new_dataset = new_dataset.assign(drugName=tidy_names)

    is_combination = new_dataset['drugName'].str.contains("SSRI / olanzapine", regex=False, na=False)
    return new_dataset[~is_combination]

# Reviews whose drugName was mapped to the SSRI or SNRI class by extractDrugClass.
SSRI_SNRI_all = extractDrugClass(drug_reviews_dep)

**Create column rating_class: For each drug class and whether the reviews are positive or negative**

In [None]:
def drugRating(dataset):
    """Turn numeric ratings into sentiment labels and add a ``rating_class`` column.

    Ratings 1-5 become 'negative' and 6-10 become 'positive'; any other value
    is left as it is. ``rating_class`` is '<sentiment>_<drug class>' for rows
    whose ``drugName`` is exactly 'SSRI' or 'SNRI' and is left missing otherwise.

    Works on a copy, so the caller's frame (typically a filtered slice) keeps
    its numeric ratings and no chained-assignment warning is triggered.
    """
    labelled = dataset.copy()
    labelled['rating'] = (
        labelled['rating']
        .replace([1, 2, 3, 4, 5], 'negative')
        .replace([6, 7, 8, 9, 10], 'positive')
    )
    # The four (sentiment, drug class) masks are mutually exclusive, so the order is irrelevant.
    for sentiment in ('negative', 'positive'):
        for drug_class in ('SSRI', 'SNRI'):
            selected = (labelled['rating'] == sentiment) & (labelled['drugName'] == drug_class)
            labelled.loc[selected, 'rating_class'] = sentiment + '_' + drug_class
    return labelled

# Rebind SSRI_SNRI_all to the frame returned by drugRating (sentiment labels plus rating_class).
SSRI_SNRI_all = drugRating(SSRI_SNRI_all)


**Relabelling drug names in reviews into "drug"**

In [None]:
# Work on a duplicate so that the relabelling below does not overwrite SSRI_SNRI_all.
SSRI_SNRI_all_copy = SSRI_SNRI_all.copy()

def relabelDrugToDrug(dataset):
    """Replace drug and supplement names inside the review text with generic tokens.

    Every listed drug name (matched case-insensitively, anywhere in the text)
    becomes 'drug'; every listed supplement name becomes 'dietary supplement'.
    Returns a new DataFrame and leaves the frame passed in unmodified.

    ``regex=True`` is passed explicitly because both name lists are alternation
    patterns: with the ``regex=False`` default of pandas >= 2.0 they would be
    treated as one literal string and nothing would be replaced.

    NOTE(review): matching is by substring without word boundaries, and the
    list contains 'Weekly', so an ordinary "weekly" in a review also becomes
    'drug' - confirm that this is intended.
    """
    drug_names = 'Zoloft|Sertraline|Celexa|Citalopram|Lexapro|Escitalopram|Prozac|Fluoxetine|Trintellix|Vortioxetine|Viibryd|Vilazodone|Paxil|Paroxetine|Luvox|Fluvoxamine|Symbyax|Weekly|Fetzima|Levomilnacipran|Effexor|Venlafaxine|Cymbalta|Duloxetine|Pristiq|Desvenlafaxine|Brintellix|Elavil|Amitriptyline|Sinequan|Doxepin|Vivactil|Protriptyline|Imipramine|Asendin|Amoxapine|Norpramin|Desipramine|Ludiomil|Maprotiline|Pamelor|Nortriptyline|Anafranil|Clomipramine|Limbitrol|chlordiazepoxide|Desyrel|Oleptro|Trazodone|Nefazodone|Serzone|Remeron|SolTab|Mirtazapine|Parnate|Tranylcypromine|Marplan|Isocarboxazid|Nardil|Phenelzine|Emsam|Selegiline|Abilify|Aripiprazole|Seroquel|Seroquel|Quetiapine|Risperdal|Risperidone|Zyprexa|Olanzapine|Rexulti|Brexpiprazole|Paliperidone|Xanax|Niravam|Alprazolam|Lamotrigine|Tramadol|Provigil|Modafinil|Nuvigil|Armodafinil|Vyvanse|Lisdexamfetamine|Methylin|Methylphenidate|Strattera|Atomoxetine|Wellbutrin|Aplenzin|Bupropion|Budeprion|SSRI|SNRI'
    supplement_names = "St. john's wort|Tryptophan|S-adenosylmethionine|Niacin|Lithium|Deplin|L-methylfolate"

    relabelled = (
        dataset['review']
        .str.replace(drug_names, 'drug', case=False, regex=True)
        .str.replace(supplement_names, 'dietary supplement', case=False, regex=True)
    )
    return dataset.assign(review=relabelled)

# Rebind SSRI_SNRI_all to the relabelled frame produced from the duplicate made above.
SSRI_SNRI_all = relabelDrugToDrug(SSRI_SNRI_all_copy)

**Remove additional drug-name related labels**

In [19]:
def label_replacer(dataset, label):
    """Remove a drug-name suffix label (e.g. "XL") from the review text.

    ``label`` is matched literally and case-insensitively. Four rules are
    applied in order:
      1. label surrounded by whitespace            -> single space
      2. label glued to the token "drug" + space   -> "drug" followed by a space
      3. label surrounded by non-word characters   -> single space
      4. label directly after digits + whitespace  -> single space
         (the digits are removed together with the label)

    Rule 2 used to be written as ``\\drug``, which the regex engine reads as
    "a digit followed by 'rug'", so "drugXL" was never cleaned. It now keeps
    the "drug" token and drops only the label.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    literal = re.escape(label)  # the label is plain text, not a pattern
    rules = (
        (r'\s' + literal + r'\s', ' '),
        (r'(drug)' + literal + r'\s', r'\1 '),
        (r'\W' + literal + r'\W', ' '),
        (r'\d+' + literal + r'\s', ' '),
    )
    reviews = dataset['review']
    for pattern, replacement in rules:
        reviews = reviews.str.replace(pattern, replacement, case=False, regex=True)
    return dataset.assign(review=reviews)

# Strip each drug-name suffix label in turn; the order matches the original four calls.
for suffix_label in ("XL", "XR", "CR", "SR"):
    SSRI_SNRI_all = label_replacer(SSRI_SNRI_all, suffix_label)

**Relabel mentions of dosages into "dosis"**

In [20]:
def relabelDoses(dataset):
    """Lower-case the reviews and replace milligram dosages with the token 'dosis'.

    A dosage is a number, optionally with a decimal part, followed by "mg" with
    at most one space in between ("50mg", "20 mg", "37.5mg"). The text is
    lower-cased before matching, so "50MG" is recognised as well.

    Returns a new DataFrame; the frame passed in is not modified. Missing
    reviews stay missing instead of raising.
    """
    dose_pattern = r'\d+(?:\.\d+)? ?mg'
    relabelled = (
        dataset['review']
        .str.lower()  # lower-casing the whole review is part of the contract
        .str.replace(dose_pattern, 'dosis', regex=True)
    )
    return dataset.assign(review=relabelled)

# Rebind SSRI_SNRI_all to the frame returned by relabelDoses (reviews lower-cased, dosages tokenised).
SSRI_SNRI_all = relabelDoses(SSRI_SNRI_all)

**Save data**

In [22]:
# Write the final frame to CSV with a header row and without the index column.
# NOTE(review): the target path is absolute and machine-specific.
SSRI_SNRI_all.to_csv(
    r'/Users/evasahlholdt/Desktop/MA/NLP/Exam/Code/NLP_exam/datasets/SSRI_SNRI_all.csv',
    header=True,
    index=False,
)