<a href="https://colab.research.google.com/github/feliciahf/NLP-Project/blob/main/NBbigrams_Scikit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bigrams

## Importing Data

In [1]:
# Attach Google Drive to the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /drive


In [2]:
# Load the book listing from the mounted Drive into a DataFrame.
import pandas as pd

# header=None: the first row is treated as data, so columns get integer names.
df = pd.read_csv(
    '/drive/My Drive/book32listing.csv',
    header=None,
    encoding='latin1',
)

In [3]:
# Keep only the columns needed for classification. The CSV was read without a
# header, so columns are addressed by position and then renamed:
#   3 -> title, 6 -> genre, 5 -> label
# .copy() makes df1 an independent frame rather than a slice of df, so later
# assignments to df1['title'] do not raise SettingWithCopyWarning.
df1 = df[[3, 6, 5]].copy()
df1.columns = ['title', 'genre', 'label']
print(df1)

                                                    title      genre  label
0                         Mom's Family Wall Calendar 2016  Calendars      3
1                         Doug the Pug 2016 Wall Calendar  Calendars      3
2       Moleskine 2016 Weekly Notebook, 12M, Large, Bl...  Calendars      3
3                 365 Cats Color Page-A-Day Calendar 2016  Calendars      3
4                    Sierra Club Engagement Calendar 2016  Calendars      3
...                                                   ...        ...    ...
207567  ADC the Map People Washington D.C.: Street Map...     Travel     29
207568  Washington, D.C., Then and Now: 69 Sites Photo...     Travel     29
207569  The Unofficial Guide to Washington, D.C. (Unof...     Travel     29
207570      Washington, D.C. For Dummies (Dummies Travel)     Travel     29
207571  Fodor's Where to Weekend Around Boston, 1st Ed...     Travel     29

[207572 rows x 3 columns]


## Preprocessing

In [4]:
# case collapsing
# The vectorised .str accessor replaces the per-element Python lambda.
# Rebinding df1 through .assign() builds a new frame instead of writing into a
# slice of df, which avoids SettingWithCopyWarning.
df1 = df1.assign(title=df1['title'].str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
# remove punctuation: drop every character that is neither a word character
# nor whitespace.
# regex=True is required: on pandas >= 2.0 str.replace defaults to a literal
# match, which would leave the titles unchanged. The raw string avoids the
# invalid-escape warning for \w and \s.
# Rebinding df1 through .assign() avoids SettingWithCopyWarning.
df1 = df1.assign(title=df1['title'].str.replace(r'[^\w\s]', '', regex=True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
# Turn every title into a sparse vector of character-bigram occurrence counts.
from sklearn.feature_extraction.text import CountVectorizer

# analyzer='char' with ngram_range=(2, 2): features are 2-character sequences.
bigram_settings = {'analyzer': 'char', 'ngram_range': (2, 2)}
count_vect = CountVectorizer(**bigram_settings)

titles = df1['title']
counts = count_vect.fit_transform(titles)

In [7]:
# Re-weight the bigram counts with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()

# NOTE: `counts` is rebound here and holds TF-IDF weights from this point on,
# so re-running this cell alone would fit the transformer on already-weighted
# values.
counts = transformer.fit_transform(counts)

## Training NB Model

In [8]:
# Hold out 20% of the rows for testing; the remaining 80% is used for training.
from sklearn.model_selection import train_test_split

split = train_test_split(
    counts,
    df1['label'],
    test_size=0.2,
    random_state=69,  # fixed seed for the shuffle
)
X_train, X_test, y_train, y_test = split

In [9]:
# Fit a multinomial Naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
model = nb_classifier.fit(X_train, y_train)

## Evaluating NB Model

In [10]:
# accuracy: fraction of test rows whose predicted label equals the true label
import numpy as np

predicted = model.predict(X_test)

is_correct = predicted == y_test
print(np.mean(is_correct))

0.3315187281705408


In [11]:
# Overall accuracy, plus precision, recall and F1 computed with
# average='weighted'.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

overall_accuracy = accuracy_score(y_test, predicted)
weighted_precision = precision_score(
    y_test, predicted, average='weighted', zero_division=1
)
weighted_recall = recall_score(
    y_test, predicted, average='weighted', zero_division=1
)
# NOTE(review): unlike precision and recall above, f1_score is called without
# zero_division - confirm whether that difference is intended.
weighted_f1 = f1_score(y_test, predicted, average='weighted')

print('Accuracy: ', overall_accuracy)
print('Precision: ', weighted_precision)
print('Recall: ', weighted_recall)
print('F1:', weighted_f1)

Accuracy:  0.3315187281705408
Precision:  0.4398405611931106
Recall:  0.3315187281705408
F1: 0.2921810583614228


In [12]:
# compute precision, recall, f1 score and support by genre

from sklearn.metrics import precision_recall_fscore_support as score

# Every label id that occurs in the true or the predicted labels, ascending.
# Passing the ids explicitly and using them as the frame index ties each row
# to its actual genre id, instead of relying on a positional 0..n-1 index
# happening to coincide with the ids.
genre_ids = sorted(set(y_test) | set(predicted))

# zero_division=0 yields the same value (0.0) the default gives for classes
# with an undefined metric, but without emitting the _warn_prf warning.
precision, recall, fscore, support = score(
    y_test, predicted, labels=genre_ids, zero_division=0
)

# Build the frame in one step; each row is one genre id.
df_acc = pd.DataFrame(
    {
        'precision': precision,
        'recall': recall,
        'fscore': fscore,
        'support': support,
    },
    index=pd.Index(genre_ids, name='label'),
)

print(df_acc)
# the index holds the genre ID/label of each row

    precision    recall    fscore  support
0    0.445455  0.037462  0.069111     1308
1    0.380952  0.009558  0.018648      837
2    0.303594  0.369347  0.333258     1990
3    0.813102  0.844000  0.828263      500
4    0.198008  0.544161  0.290360     2740
5    0.521739  0.240803  0.329519      598
6    0.562274  0.385281  0.457248     1617
7    0.646067  0.529649  0.582094     1737
8    0.384479  0.332309  0.356495     1953
9    0.317572  0.272180  0.293129     1826
10   0.442308  0.094845  0.156197      485
11   0.249296  0.443054  0.319063     2397
12   0.363372  0.186012  0.246063     1344
13   0.584071  0.097996  0.167832     1347
14   0.688172  0.394521  0.501524     1460
15   0.382482  0.169579  0.234978     1545
16   0.340187  0.659155  0.448767     2485
17   0.333333  0.002331  0.004630      429
18   0.000000  0.000000  0.000000      510
19   0.625000  0.007886  0.015576      634
20   0.516129  0.024242  0.046310      660
21   0.438400  0.186141  0.261326     1472
22   0.4736

  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Confusion matrix of the true test labels against the predicted labels.
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, predicted)
print(conf_mat)

[[  49    0   80 ...  335    0    0]
 [   0    8   27 ...  219    0    0]
 [   2    0  735 ...  269    0    0]
 ...
 [   9    1   46 ... 2836    1    0]
 [   0    1   20 ...   53    2    0]
 [   0    0   68 ...   56    0    0]]


In [14]:
# examine class distribution: number of test rows per label
label_counts = y_test.value_counts()
label_counts

29    3663
4     2740
16    2485
11    2397
2     1990
8     1953
23    1858
9     1826
7     1737
6     1617
15    1545
27    1488
21    1472
14    1460
13    1347
12    1344
0     1308
26    1232
22     893
1      837
24     733
20     660
19     634
28     609
5      598
25     565
18     510
3      500
10     485
17     429
31     326
30     274
Name: label, dtype: int64