# Naive Bayes in NLP

**Naive Bayes**
* Introduction: https://en.wikipedia.org/wiki/Naive_Bayes_classifier
* Library: http://scikit-learn.org/stable/modules/naive_bayes.html
---

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Play-tennis toy dataset from the golearn examples:
# https://github.com/sjwhitworth/golearn/blob/master/examples/datasets/tennis.csv
tennis_csv_url = 'https://raw.githubusercontent.com/sjwhitworth/golearn/master/examples/datasets/tennis.csv'
df = pd.read_csv(tennis_csv_url)
# Display the loaded table as the cell output.
df

#### Compute prior probability

In [None]:
# Class priors: fraction of rows whose `play` label is 'yes' / 'no'.
n_rows = df.shape[0]
n_yes = (df['play'] == 'yes').sum()
n_no = (df['play'] == 'no').sum()
# Multiply by 1. so the division is a float division on every Python version.
P_yes = n_yes * 1. / n_rows
P_no  = n_no * 1. / n_rows
print (P_yes, P_no)

#### Compute conditional probabilities for each feature

In [None]:
np_outlook = np.array([[((df.play == c).values & (df.outlook == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in ['sunny', 'rainy', 'overcast']])
print (np_outlook)

s = np_outlook.sum(axis=1)
print (s)
np_outlook = np_outlook * 1. / s.reshape((-1,1))
print(np_outlook)
df_outlook = pd.DataFrame(np_outlook,
                          columns=['yes', 'no'])
df_outlook.index = ['sunny', 'rainy', 'overcast']
df_outlook
# here overcast prob is 0 ,but it doesnot means it won't occur ,here i think comes why we need more data size 

In [None]:
np_temp = np.array([[((df.play == c).values & (df.temp == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in ['hot', 'mild', 'cool']])

s = np_temp.sum(axis=1)
np_temp = np_temp * 1. / s.reshape((-1,1))

df_temp = pd.DataFrame(np_temp,
                          columns=['yes', 'no'])
df_temp.index = ['hot', 'mild', 'cool']
df_temp

In [None]:
np_humidity = np.array([[((df.play == c).values & (df.humidity == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in ['high', 'normal']])

s = np_humidity.sum(axis=1)
np_humidity = np_humidity * 1. / s.reshape((-1,1))

df_humidity = pd.DataFrame(np_humidity,
                          columns=['yes', 'no'])
df_humidity.index = ['high', 'normal']
df_humidity

In [None]:
np_windy = np.array([[((df.play == c).values & (df.windy == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in [True, False]])

s = np_windy.sum(axis=1)
np_windy = np_windy * 1. / s.reshape((-1,1))

df_windy = pd.DataFrame(np_windy,
                          columns=['yes', 'no'])
df_windy.index = ['True', 'False']
df_windy

#### Testing

In [None]:
x = ['sunny', 'hot', 'normal', False]

print (df_outlook.loc['sunny'])
print (df_temp.loc['hot'])

df_outlook.loc['sunny'] * df_temp.loc['hot']

In [None]:
df_outlook.loc['sunny'] * df_temp.loc['hot'] * df_humidity.loc['normal'] * df_windy.loc['False']

##### Test 2

In [None]:
x = ['overcast', 'hot', 'normal', False]
df_outlook.loc['overcast'] * df_temp.loc['hot'] * df_humidity.loc['normal'] * df_windy.loc['False']
#here no is zero so here it is know as unknown problem as in above dataset overcast is zero 

## Using **scikit-learn**

**Datasets**

I have downloaded a diabetes dataset, available here: https://www.kaggle.com/uciml/pima-indians-diabetes-database/data

---

In [None]:
# The CSV has no header row, so it is read with header=None and the column
# names are attached afterwards.
pima_csv_url = 'https://github.com/jbrownlee/Datasets/raw/master/pima-indians-diabetes.data.csv'
pima_column_names = [
    'Pregnancies', 'Glucose', 'BloodPressure',
    'SkinThickness', 'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
df1 = pd.read_csv(pima_csv_url, header=None)
df1.columns = pima_column_names

In [None]:
# Display the loaded diabetes table as the cell output.
df1

In [None]:
cols = df1.columns
# Every column except the last is a feature; the last column is the label.
feature_names, target_name = cols[:-1], cols[-1]
X = df1[feature_names]
y = df1[target_name]

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

## Using Gaussian

$$P(x|c_k) = \frac{1}{\sqrt{2\pi\sigma_k^2}} e^{-\frac{{(x-\mu_k)}^2}{2\sigma_k^2}}$$

In [None]:
# Create an unfitted Gaussian Naive Bayes classifier.
gNB = GaussianNB()

In [None]:
# Fit on the full feature matrix X and labels y (no train/test split is made).
gNB.fit(X, y)

In [None]:
# Accuracy on the same data the model was fitted on, i.e. a training score.
gNB.score(X, y)

In [None]:
# Class prior probabilities stored on the fitted model.
gNB.class_prior_

In [None]:
# Per-class variance of each feature. `sigma_` was renamed to `var_` in newer
# scikit-learn releases and later removed, so prefer `var_` and fall back to
# `sigma_` only on older versions.
gNB.var_ if hasattr(gNB, 'var_') else gNB.sigma_

## Using Multinomial

In [None]:
# Fit and score MultinomialNB on the same (X, y); this is again a training score.
MultinomialNB().fit(X,y).score(X,y)

## I have tried with one more dataset

**Breast Cancer Dataset**
* Data: https://github.com/jbrownlee/Datasets/blob/master/breast-cancer.csv
* Description: https://github.com/jbrownlee/Datasets/blob/master/breast-cancer.names

In [None]:
from sklearn import datasets

In [None]:
# The CSV has no header row, so it is read with header=None and the column
# names are attached afterwards.
breast_cancer_csv_url = 'https://github.com/jbrownlee/Datasets/raw/master/breast-cancer.csv'
breast_cancer_column_names = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat', 'Class',
]
df2 = pd.read_csv(breast_cancer_csv_url, header=None)
df2.columns = breast_cancer_column_names

In [None]:
# Print the shape before and after dropping rows that contain missing values.
print (df2.shape)
df2 = df2.dropna()
print (df2.shape)

In [None]:
# Display the cleaned breast-cancer table as the cell output.
df2

In [None]:
cols2 = df2.columns
# Every column except the last is a feature; the last column ('Class') is the label.
feature_names2, target_name2 = cols2[:-1], cols2[-1]
X2 = df2[feature_names2]
y2 = df2[target_name2]

In [None]:
# Fit MultinomialNB directly on the un-encoded columns of df2.
# NOTE(review): X2 still holds the raw CSV values here; if they are strings,
# this fit is expected to raise - presumably the motivation for the label
# encoding done in the following cells. Confirm the intent.
mNB = MultinomialNB()
mNB.fit(X2, y2)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Integer-encode every column of df2. DataFrame.apply calls le.fit_transform
# once per column, refitting the same encoder each time, so after this line
# `le` only remembers the classes of the last column it was fitted on.
le = LabelEncoder()
df2_ = df2.apply(le.fit_transform)

In [None]:
# Display the integer-encoded table as the cell output.
df2_

In [None]:
# `le` is refitted on each column in turn, so this prints only the classes of
# the column it was fitted on most recently.
print (le.classes_)

In [None]:
cols2 = df2_.columns
# Same split as before, now on the integer-encoded frame:
# all columns but the last are features, the last is the label.
feature_names2, target_name2 = cols2[:-1], cols2[-1]
X2 = df2_[feature_names2]
y2 = df2_[target_name2]

In [None]:
# Training accuracy of GaussianNB on the encoded breast-cancer data.
GaussianNB().fit(X2, y2).score(X2, y2)

In [None]:
# Training accuracy of MultinomialNB on the encoded breast-cancer data.
MultinomialNB().fit(X2, y2).score(X2, y2)

# Naive Bayes in Sentiment Analysis

Dataset source: https://github.com/ApoorvP02121996/Sentiment-Analysis---Movie-Reviews/blob/master/Naive%20Bayes/training_set.csv

In [None]:
# Load the movie-review training set and rename its two columns to
# 'target' (label) and 'text' (review body).
df3 = pd.read_csv('https://github.com/ApoorvP02121996/Sentiment-Analysis---Movie-Reviews/raw/master/Naive%20Bayes/training_set.csv')
df3.columns = ['target', 'text']

# Print the shape, then show the first rows as the cell output.
print (df3.shape)
df3.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [None]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
# Display the configured vectorizer as the cell output.
vectorizer

In [None]:
# Normalise the review text to unicode. Only byte strings need decoding
# (Python 2 `str`); on Python 3 the values are already `str`, which has no
# `.decode`, so they are passed through unchanged.
df3.text = df3.text.apply(
    lambda e: e.decode(errors='ignore') if isinstance(e, bytes) else e)

In [None]:
# Positional split: the first 4000 rows train the model, the rest are held out.
train = df3[:4000]
test  = df3[4000:]

# Parenthesised form so the cell also parses on Python 3.
print (train.shape, test.shape)

In [None]:
# Learn the vocabulary from the training reviews only, so nothing about the
# held-out reviews leaks into the features. Words that occur only in the test
# set are ignored by transform().
vectorizer.fit(train.text)
train_features = vectorizer.transform(train.text)
test_features  = vectorizer.transform(test.text)

## Multinomial

In [None]:
# MultinomialNB with the default smoothing, trained on the training split only.
# (The previous version referenced an undefined `features` variable.)
mNB3_1 = MultinomialNB().fit(train_features, train.target)
# Accuracy on the training split, then on the held-out split.
print (mNB3_1.score(train_features, train.target))
print (mNB3_1.score(test_features, test.target))

In [None]:
# MultinomialNB with almost no additive smoothing (alpha close to 0).
mNB3_2 = MultinomialNB(alpha=0.0000001).fit(train_features, train.target)
# Accuracy on the training split, then on the held-out split.
print (mNB3_2.score(train_features, train.target))
print (mNB3_2.score(test_features, test.target))

In [None]:
# MultinomialNB with heavy additive smoothing (alpha=10).
mNB3_3 = MultinomialNB(alpha=10).fit(train_features, train.target)
# Accuracy on the training split, then on the held-out split.
print (mNB3_3.score(train_features, train.target))
print (mNB3_3.score(test_features, test.target))

## Gaussian

In [None]:
# GaussianNB does not accept sparse input, so densify the count matrices once.
# toarray() yields a plain ndarray; todense() yields np.matrix, which recent
# scikit-learn versions reject.
train_dense = train_features.toarray()
test_dense = test_features.toarray()
gNB3_1 = GaussianNB().fit(train_dense, train.target)
# Accuracy on the training split, then on the held-out split.
print (gNB3_1.score(train_dense, train.target))
print (gNB3_1.score(test_dense, test.target))