# Data Science Assignment

### The NLP Dataset

In [2]:
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Load Data From CSV File

In [3]:
# Importing the dataset
dataset1 = pd.read_csv('Data.csv')
dataset = dataset1[['Tweet', 'ADR_label']]
dataset.head()                    # First five dataset

Unnamed: 0,Tweet,ADR_label
0,Intravenous azithromycin-induced ototoxicity.,1
1,"Immobilization, while Paget's bone disease was...",1
2,Unaccountable severe hypercalcemia in a patien...,1
3,METHODS: We report two cases of pseudoporphyri...,1
4,METHODS: We report two cases of pseudoporphyri...,1


In [4]:
# Shape of the Dataset
dataset.shape

(23516, 2)

### Text Cleaning

In [5]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')    # download the irrelavent words. (e.g - this, that, it etc)
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dipes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
dataset.dtypes   # Types od Dataset

Tweet        object
ADR_label     int64
dtype: object

In [7]:
dataset.count

<bound method DataFrame.count of                                                    Tweet  ADR_label
0          Intravenous azithromycin-induced ototoxicity.          1
1      Immobilization, while Paget's bone disease was...          1
2      Unaccountable severe hypercalcemia in a patien...          1
3      METHODS: We report two cases of pseudoporphyri...          1
4      METHODS: We report two cases of pseudoporphyri...          1
...                                                  ...        ...
23511  At autopsy, the liver was found to be small, s...          0
23512  Physical exam revealed a patient with aphasia,...          0
23513  At the time when the leukemia appeared seven o...          0
23514  The American Society for Regional Anesthesia a...          0
23515  Concomitant administration of estradiol result...          0

[23516 rows x 2 columns]>

## Classify the Tweet into the 2 segments

In [8]:
# dataset, whose ADR_label = 1
ADR_label1 = dataset[dataset['ADR_label'] == 1]
ADR_label1[0:10]

Unnamed: 0,Tweet,ADR_label
0,Intravenous azithromycin-induced ototoxicity.,1
1,"Immobilization, while Paget's bone disease was...",1
2,Unaccountable severe hypercalcemia in a patien...,1
3,METHODS: We report two cases of pseudoporphyri...,1
4,METHODS: We report two cases of pseudoporphyri...,1
5,"Naproxen, the most common offender, has been a...",1
6,RESULTS: A 44-year-old man taking naproxen for...,1
7,RESULTS: A 44-year-old man taking naproxen for...,1
8,RESULTS: A 44-year-old man taking naproxen for...,1
9,RESULTS: A 44-year-old man taking naproxen for...,1


In [9]:
# dataset, whose ADR_label = 0
ADR_label0 = dataset[dataset['ADR_label'] == 0]
ADR_label0[0:10]

Unnamed: 0,Tweet,ADR_label
6822,"""Retinoic acid syndrome"" was prevented with sh...",0
6823,BACKGROUND: External beam radiation therapy of...,0
6824,"Although the enuresis ceased, she developed th...",0
6825,A 42-year-old woman had uneventful bilateral l...,0
6826,"A 16-year-old girl with erosive, polyarticular...",0
6827,Resection and use of a cyclooxygenase-2 inhibi...,0
6828,Differences in interpretation by patients and ...,0
6829,We present a case report of a cytomegalovirus ...,0
6830,Dihydropyrimidine dehydrogenase (DPD) is the r...,0
6831,"The decedent was also prescribed tramadol, gab...",0


# Data pre-processing and selection

### A Sample DataSet

In [10]:
# Select line 7 for the sample test
review = re.sub('[^a-zA-Z]', ' ', dataset['Tweet'][7])            # WE dan't want to remove the words.
review = review.lower()                                           # Using this code all the upper letter will be change to lower.
review = review.split()                                           # This will split the dataset.
ps = PorterStemmer()                                              # The get root value of the words (eg. Loved = love ).
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] # Wil take all the values except stopwords.
review = ' '.join(review)
review

'result year old man take naproxen chronic low back pain year old woman oxaprozin rheumatoid arthriti present tens bulla cutan fragil face back hand'

# Append All the Lines 

## Creating the Bag of Words

In [11]:
corpus = []
for i in range(0, 23516):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Tweet'][i])            # WE dan't want to remove the words like letter words.
    review = review.lower()                                           # Using this code all the upper letter will be change to lower.
    review = review.split()                                           # This will split the dataset.
    ps = PorterStemmer()                                              # The get root value of the words (eg. Loved = love ).
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] # Wil take all the values except stopwords.
    review = ' '.join(review)                                         # It join all the bag of words model in a single line. 
    corpus.append(review)

In [12]:
corpus

['intraven azithromycin induc ototox',
 'immobil paget bone diseas present perhap enhanc activ dihydrotachysterol rifampicin could led increas calcium releas circul',
 'unaccount sever hypercalcemia patient treat hypoparathyroid dihydrotachysterol',
 'method report two case pseudoporphyria caus naproxen oxaprozin',
 'method report two case pseudoporphyria caus naproxen oxaprozin',
 'naproxen common offend associ dimorph clinic pattern pct like present one simul erythropoiet protoporphyria pediatr popul',
 'result year old man take naproxen chronic low back pain year old woman oxaprozin rheumatoid arthriti present tens bulla cutan fragil face back hand',
 'result year old man take naproxen chronic low back pain year old woman oxaprozin rheumatoid arthriti present tens bulla cutan fragil face back hand',
 'result year old man take naproxen chronic low back pain year old woman oxaprozin rheumatoid arthriti present tens bulla cutan fragil face back hand',
 'result year old man take naproxe

## Bag of Words model

In [13]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 12000)     # The bag of words model will convert to numeric approch.
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values               # The test dataset

### Train/Test dataset

In [14]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (18812, 12000) (18812,)
Test set: (4704, 12000) (4704,)


## Modeling (Naive_bayes with Scikit-learn)

In [15]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)    # Fitting train and test dataset 

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred[0:5]                         # The first predicted values

array([1, 1, 1, 1, 0], dtype=int64)

## Confusion_matrix

In [17]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[1775, 1558],
       [ 138, 1233]], dtype=int64)

## Accuracy

In [18]:
# Accuracy of the test
accuracy = (1775+1233)/4704             # (TP+TN)/Total Values
accuracy

0.6394557823129252

## Classification_Report

In [19]:
from sklearn.metrics import classification_report
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.53      0.68      3333
           1       0.44      0.90      0.59      1371

    accuracy                           0.64      4704
   macro avg       0.68      0.72      0.63      4704
weighted avg       0.79      0.64      0.65      4704



## The f1_score from sklearn library

In [20]:
from sklearn.metrics import f1_score
f1_score = f1_score(y_test, y_pred, average='weighted')
print("Avg F1-score: %.4f" % f1_score)

Avg F1-score: 0.6522


# Thank You