# JD Drug Recommendation System 

### Data Cleaning

In this section, the system will read from the raw dataset and perform data cleaning.

In [2]:
# Pandas is a data analysis tool that allows importing data from other file formats
# Numpy facilitate advanced mathematical operation on large numbers of data
# Pyplot is a module in matplotlib that manipulate elements of a figure, create figure
# Seaborn is used for making statistical graphics.
# plot_decision_regions is a function for plotting decision regions of classifiers in 1 or 2 dimensions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions

# ggplot is used to map data to visual objects that make up the visualization
# we specified the figure format to be svg format
# we use matplotlib to renders the figure inline in this jupyter notebook
plt.style.use('ggplot')
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [3]:
# we start reading dataset from a csv file
# please noted that if you are not cloning the file in github then please replace with your path in the path given below
df_train = pd.read_csv(r"..\Modified\dataset\drugsComTrain_raw.tsv", sep=None, engine='python')

In [4]:
#  to check out data types, missing values and more
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   161297 non-null  int64  
 1   drugName     161297 non-null  object 
 2   condition    160398 non-null  object 
 3   review       161297 non-null  object 
 4   rating       161297 non-null  float64
 5   date         161297 non-null  object 
 6   usefulCount  161297 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 8.6+ MB


In [5]:
# returns top n rows of a DataFrame or Series where n is a user input value
n = 3
df_train.head(n) # default n value is 5

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17


In [6]:
# to delete/exclude unnecessary columns in the dataset
del df_train['Unnamed: 0']
del df_train['date']

In [7]:
# to list out the number of drugs and condition in the dataset
print("number of drugs:", len(df_train['drugName'].unique()))
print("number of conditions:", len(df_train['condition'].unique()))

number of drugs: 3436
number of conditions: 885


In [8]:
# to group data according to their condition and list out the first 30 popular use drugs
drug_per_condition = df_train.groupby(['condition'])['drugName'].nunique().sort_values(ascending=False)
drug_per_condition[:30]

condition
Not Listed / Othe                             214
Pain                                          200
Birth Control                                 172
High Blood Pressure                           140
Acne                                          117
Depression                                    105
Rheumatoid Arthritis                           98
Diabetes, Type 2                               89
Allergic Rhinitis                              88
Osteoarthritis                                 80
Bipolar Disorde                                80
Anxiety                                        78
Insomnia                                       78
Abnormal Uterine Bleeding                      74
Migraine                                       59
Psoriasis                                      58
Endometriosis                                  57
3</span> users found this comment helpful.     57
ADHD                                           55
Asthma, Maintenance                     

In [9]:
# to replace invalid data in condition column with NaN
df_train.loc[df_train['condition'].str.contains('</span>',case=False, na=False), 'condition'] = 'NAN'
df_train['condition'].replace('NAN', np.NaN, inplace=True)
df_train['condition'].replace('Not Listed / Othe', np.NaN, inplace=True)

# to create a dictionary with drugname:condition to fill NaN
dictionary=df_train.set_index('drugName')['condition'].to_dict()
len(dictionary)

3436

In [10]:
# to fill NaN value with correct condition names using created dictionary
df_train.condition.fillna(df_train.drugName.map(dictionary), inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   drugName     161297 non-null  object 
 1   condition    161197 non-null  object 
 2   review       161297 non-null  object 
 3   rating       161297 non-null  float64
 4   usefulCount  161297 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 6.2+ MB


In [11]:
# to drop rows with still missing values in condition (100 rows = 0.0006% of total data)
df_train.dropna(inplace=True)

In [12]:
drug_per_condition = df_train.groupby(['condition']s)['drugName'].nunique().sort_values(ascending=False)
drug_per_condition[:30]

SyntaxError: invalid syntax (<ipython-input-12-b54f96778742>, line 1)

In [None]:
# to list out top 15 number of drugs/conditions
drug_per_condition[:15].plot(kind="bar", figsize = (14,6), fontsize = 10, color="#a1c995")
plt.xlabel("", fontsize = 20)
plt.ylabel("", fontsize = 20)
plt.title("Top 15 Number of Drugs / Condition", fontsize = 20)
plt.savefig(r"../Modified/fig/top15_condition.svg")

In [13]:
# to select conditions with less than 11 drugs
condition_1=drug_per_condition[drug_per_condition<=10].keys()
condition_1

Index(['Allergies', 'Autism', 'Postpartum Depression', 'Alzheimer's Disease',
       'Constipation, Acute', 'Uveitis', 'Malaria',
       'Primary Immunodeficiency Syndrome', 'Chronic Fatigue Syndrome',
       'Social Anxiety Disorde',
       ...
       '77</span> users found this comment helpful.',
       'Secondary Cutaneous Bacterial Infections',
       '76</span> users found this comment helpful.', 'Mononucleosis',
       'Mixed Connective Tissue Disease', 'Sepsis',
       '72</span> users found this comment helpful.',
       'Microscopic polyangiitis', 'Short Stature',
       'Epicondylitis, Tennis Elbow'],
      dtype='object', name='condition', length=678)

In [14]:
# to delete condition with less than 11 drugs
df_train1=df_train[~df_train['condition'].isin(condition_1)]
df_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148387 entries, 1 to 161296
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   drugName     148387 non-null  object 
 1   condition    148387 non-null  object 
 2   review       148387 non-null  object 
 3   rating       148387 non-null  float64
 4   usefulCount  148387 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 6.8+ MB


In [15]:
drug_per_condition[:200]

condition
Not Listed / Othe                   214
Pain                                200
Birth Control                       172
High Blood Pressure                 140
Acne                                117
                                   ... 
Head Lice                            11
Gout                                 11
Nausea/Vomiting, Postoperative       11
Hyperhidrosis                        11
Sexual Dysfunction, SSRI Induced     11
Name: drugName, Length: 200, dtype: int64

In [None]:
input your interested topic: what symptoms what disesae