### Gaussian Naive Bayes (GaussianNB)

* Estimator: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
* Dataset: https://www.kaggle.com/utathya/imdb-review-dataset

In [128]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 

In [129]:
imdb_df = pd.read_csv('./datasets/imdb_dataset.csv', encoding="ISO-8859-1")

In [130]:
imdb_df.shape

(50000, 2)

In [131]:
imdb_df = imdb_df.sample(10000, replace=False)

In [132]:
imdb_df.describe()

Unnamed: 0,Review,Label
count,10000,10000
unique,9989,2
top,A friend and I went to see this movie. We have...,neg
freq,2,5099


In [133]:
X = imdb_df['Review']

Y = imdb_df['Label']

In [134]:
X.head()

30903    There is one really good scene in Faat Kine. T...
42763    WWE was in need of a saviour as Wrestlemania 1...
28619    The cast is OK. The script is awkward at times...
47138    Barbra Streisand is a tour de force in this Ho...
49756    Peaches is truly a marvelous film. I write thi...
Name: Review, dtype: object

In [135]:
Y.head()

30903    neg
42763    pos
28619    neg
47138    pos
49756    pos
Name: Label, dtype: object

In [136]:
count_vectorizer = CountVectorizer()

transformed_vector = count_vectorizer.fit_transform(X)

In [137]:
transformed_vector.shape

(10000, 52190)

In [138]:
print(transformed_vector[0])

  (0, 37635)	1
  (0, 46715)	1
  (0, 13549)	1
  (0, 48275)	1
  (0, 50423)	1
  (0, 22363)	1
  (0, 44583)	1
  (0, 40868)	1
  (0, 29144)	1
  (0, 43044)	1
  (0, 51044)	1
  (0, 34237)	1
  (0, 44380)	2
  (0, 33765)	1
  (0, 9922)	1
  (0, 31863)	1
  (0, 20653)	1
  (0, 47257)	1
  (0, 32041)	1
  (0, 17026)	1
  (0, 38941)	1
  (0, 47068)	1
  (0, 2013)	2
  (0, 34310)	1
  (0, 16278)	1
  :	:
  (0, 16521)	2
  (0, 21529)	2
  (0, 43679)	1
  (0, 46618)	1
  (0, 4619)	2
  (0, 1447)	2
  (0, 2262)	6
  (0, 51321)	2
  (0, 2445)	2
  (0, 51230)	5
  (0, 2869)	1
  (0, 2192)	3
  (0, 19246)	1
  (0, 8013)	3
  (0, 46913)	1
  (0, 46403)	31
  (0, 25635)	5
  (0, 16501)	5
  (0, 23136)	10
  (0, 40361)	7
  (0, 19745)	1
  (0, 37464)	1
  (0, 32666)	3
  (0, 24271)	9
  (0, 46459)	4


In [139]:
tfidf_transformer = TfidfTransformer()

tfidf_vector = tfidf_transformer.fit_transform(transformed_vector)

In [140]:
print(tfidf_vector[0])

  (0, 51565)	0.03791793387699295
  (0, 51479)	0.03126192371356777
  (0, 51438)	0.03738663519197124
  (0, 51321)	0.06405687489600245
  (0, 51246)	0.029145930538354462
  (0, 51244)	0.04077623973301778
  (0, 51230)	0.06216902332520747
  (0, 51100)	0.048915539614756656
  (0, 51044)	0.021821847854236267
  (0, 50911)	0.03271283140199068
  (0, 50804)	0.023870729803745806
  (0, 50518)	0.021933671894449235
  (0, 50465)	0.02603784846427607
  (0, 50423)	0.013044150115643693
  (0, 49825)	0.04169103131750947
  (0, 49142)	0.01856731818227796
  (0, 49079)	0.034027596841258854
  (0, 48701)	0.036653404088178757
  (0, 48642)	0.05154692022508718
  (0, 48275)	0.05174607900525253
  (0, 48116)	0.07210428599935079
  (0, 48082)	0.05009399779331891
  (0, 47835)	0.03236602232334837
  (0, 47257)	0.04389743417335611
  (0, 47099)	0.06410863211746622
  :	:
  (0, 4991)	0.028684053624532455
  (0, 4619)	0.047255714154623964
  (0, 4580)	0.03684579687234104
  (0, 4531)	0.021230329065117338
  (0, 4470)	0.0433716108170501

In [141]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_vector, Y, test_size = 0.2)

In [142]:
x_train.shape, x_test.shape

((8000, 52190), (2000, 52190))

In [143]:
y_train.shape, y_test.shape

((8000,), (2000,))

In [144]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred,normalize=True)
    num_acc = accuracy_score(y_test, y_pred,normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [145]:
clf = GaussianNB().fit(x_train.toarray(), y_train)

In [146]:
y_pred = clf.predict(x_test.toarray())

In [147]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1283
accuracy_score :  0.6415
precision_score :  0.6420335341542481
recall_score :  0.6415


In [148]:
y_test = np.array(y_test)

In [149]:
pred_results = pd.DataFrame({'y_test': pd.Series(y_test),
                             'y_pred': pd.Series(y_pred)})

pred_results.sample(5)

Unnamed: 0,y_test,y_pred
1734,neg,pos
1840,pos,pos
1130,pos,neg
1804,neg,neg
1893,neg,neg
