### For Continuous Features
* Use GaussianNB

In [2]:
from sklearn.naive_bayes import GaussianNB

In [3]:
from sklearn.datasets import make_classification

In [109]:
# Synthetic 3-class dataset: 100000 samples, 6 features (4 informative),
# 2 clusters per class. random_state pins the generator so the data, and
# therefore every fitted score derived from it, is reproducible after a
# kernel restart; without it each run draws a different dataset.
X,Y = make_classification(
    n_classes=3,
    n_samples=100000,
    n_clusters_per_class=2,
    n_informative=4,
    n_features=6,
    random_state=42,
)

In [111]:
X[:5]

array([[-0.7252983 , -2.37604446, -2.69548585,  3.20485336,  0.37871878,
         1.84781733],
       [-2.46003882, -1.52473638, -1.66227408,  0.41544361,  0.30373041,
        -0.82475792],
       [ 1.14271839,  1.35592485,  0.31596602,  0.01019456,  1.09032996,
         2.12904958],
       [-0.34940271,  0.5573758 , -1.095489  , -0.83037111, -2.64118976,
        -2.43427294],
       [-1.05687636, -3.1193188 ,  2.72940948,  0.83417105,  3.42915222,
         0.57690075]])

In [12]:
Y[:4]

array([0, 2, 2, 2])

In [14]:
clsf = GaussianNB()

In [18]:
# Fit on the first 700 rows only. The scoring cell that follows evaluates on
# rows 800 onward, so rows 700-799 are used for neither training nor scoring.
clsf.fit(X[:700],Y[:700])

GaussianNB(priors=None)

In [20]:
clsf.score(X[800:],Y[800:])

0.65500000000000003

In [22]:
clsf.predict([[1,2,3,4,5,6],[4,5,5,6,5,2]])

array([2, 2])

In [33]:
import numpy as  np
a = np.where(Y == 1)

In [34]:
len(a[0])

333

In [36]:
import pandas as pd

In [68]:
p = pd.Series(Y)

In [69]:
# Merge class 2 into class 1 so the 3-class target becomes a binary one;
# every other label is left as it is.
newY = p.replace(2, 1)

In [70]:
a = np.where(newY == 1)

In [71]:
len(a[0])

66664

In [47]:
newY[:5]

0    0
1    1
2    1
3    1
4    0
dtype: int64

In [72]:
# Explicit priors for the relabelled binary target newY: after merging class 2
# into class 1, 66664 of the 100000 samples are labelled 1 (about two thirds).
# NOTE(review): assumes priors are listed in ascending label order,
# i.e. [P(class 0), P(class 1)] — confirm against the GaussianNB docs.
clsf_new = GaussianNB(priors=[0.33,0.67])

In [73]:
clsf_new.fit(X,newY)

GaussianNB(priors=[0.33, 0.67])

In [74]:
clsf_new.score(X,newY)

0.85801000000000005

In [75]:
clsf = GaussianNB()
clsf.fit(X,newY)

GaussianNB(priors=None)

In [76]:
clsf.score(X,newY)

0.85738000000000003

### Dealing with Discrete & Count Features
* Text Processing

In [77]:
from sklearn.naive_bayes import MultinomialNB

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
cv = CountVectorizer()

In [80]:
text_data = np.array(['I love India. Oh India', 'India is best','Bhutan is beautiful'])

In [82]:
cv.fit_transform(text_data).toarray()

array([[0, 0, 0, 2, 0, 1, 1],
       [0, 1, 0, 1, 1, 0, 0],
       [1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [85]:
# Show the vocabulary learned by the vectorizer, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists and fall back to the old
# method on older releases. .tolist() keeps the displayed value a plain list
# of str in both cases.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, 'get_feature_names_out')
 else cv.get_feature_names())

['beautiful', 'best', 'bhutan', 'india', 'is', 'love', 'oh']

In [86]:
y = np.array([1,1,0])

In [88]:
mnb = MultinomialNB()

In [102]:
feature = cv.fit_transform(text_data)

In [103]:
mnb.fit(feature,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [104]:
text = 'India is an awesome place'

In [105]:
f = cv.transform([text])

In [107]:
mnb.predict(f)

array([1])

array([1, 1, 0])

### Naive Bayes for Binary Features

In [113]:
from sklearn.naive_bayes import BernoulliNB

In [115]:
feature = np.random.randint(2, size=(100,3))

In [116]:
target = np.random.randint(3,size=100)

In [118]:
clsf = BernoulliNB()

In [119]:
clsf.fit(feature,target)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

### Problem Solution - Identifying which category a text belongs to

In [139]:
twitter_data =  pd.read_csv('sentiment/Sentiment.csv')

In [140]:
# Keep only the two columns used below. .copy() makes the result an
# independent frame, so the later assignment back into its 'text' column does
# not raise pandas' SettingWithCopyWarning (or write to a temporary object).
twitter_data = twitter_data[['sentiment','text']].copy()

### Text data cleanup

In [141]:
import re

In [142]:
twitter_data.text[:5]

0    RT @NancyLeeGrahn: How did everyone feel about...
1    RT @ScottWalker: Didn't catch the full #GOPdeb...
2    RT @TJMShow: No mention of Tamir Rice and the ...
3    RT @RobGeorge: That Carly Fiorina is trending ...
4    RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
Name: text, dtype: object

In [143]:
re.sub(r'@\w+','','ab @abc df')

'ab  df'

In [144]:
def f(t):
    """Strip retweet boilerplate from a tweet's text.

    Removes, in order: the standalone token ``RT``, any ``@user:`` attribution
    (a mention immediately followed by a colon), and every remaining ``#`` or
    ``@`` character (the tag / handle text itself is kept).

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is not trimmed.
    """
    # \b anchors restrict the match to the standalone retweet marker; an
    # unanchored 'RT' would also delete those letters inside ordinary words
    # (e.g. 'SMART' -> 'SMA').
    r = re.sub(r'\bRT\b','',t)
    # Drop the "@user:" attribution that follows the retweet marker.
    r = re.sub(r'@\w+:','',r)
    # Remove the remaining hashtag / mention sigils but keep their words.
    r = re.sub(r'[#@]','',r)
    return r

In [145]:
# Run every tweet through the cleanup function f and store the result back in
# the 'text' column.
twitter_data['text'] = twitter_data['text'].map(f)

In [148]:
twitter_data.groupby('sentiment').size()

sentiment
Negative    8493
Neutral     3142
Positive    2236
dtype: int64

In [149]:
from sklearn.pipeline import Pipeline

In [151]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder

In [152]:
# Transform both columns in one pass: 'sentiment' goes through a LabelEncoder
# and 'text' through a CountVectorizer configured with stop_words='english'.
# The cells that follow treat column 0 of the transformed output as the label
# and all remaining columns as the features.
mapper = DataFrameMapper([
    ('sentiment',LabelEncoder()),
    ('text',CountVectorizer(stop_words='english'))
])

In [155]:
data = mapper.fit_transform(twitter_data)

In [180]:
# Explicit class priors for the three sentiment classes.
# NOTE(review): presumably listed in the LabelEncoder's class order
# (Negative, Neutral, Positive) — confirm. For comparison, the observed class
# shares from the groupby counts are roughly 0.61 / 0.23 / 0.16.
mnb = MultinomialNB(class_prior=[0.5,0.3,0.2])

In [181]:
data[:,1:]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [182]:
mnb.fit(data[:,1:],data[:,0])

MultinomialNB(alpha=1.0, class_prior=[0.5, 0.3, 0.2], fit_prior=True)

In [187]:
# Vectorize a new sentence with the CountVectorizer held in the mapper's second
# entry (the one configured for the 'text' column).
# NOTE(review): this rebinds `f`, which until now named the tweet-cleanup
# function, so the cleanup cell cannot be re-run after this one without
# redefining it. The sample sentence is also not passed through that cleanup.
f = mapper.features[1][1].transform(['This is bad stuff'])

In [188]:
mnb.predict(f)

array([0], dtype=int64)

In [185]:
e = mapper.features[0][1]

In [186]:
e.classes_

array(['Negative', 'Neutral', 'Positive'], dtype=object)

#### Text Analysis of mail Data

In [189]:
from sklearn.datasets import fetch_20newsgroups

In [190]:
data = fetch_20newsgroups()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
