In [None]:
# SLP3 Chapter 4, page 7.
# https://web.stanford.edu/~jurafsky/slp3/4.pdf

# Toy training corpus: one document per line in "<label>,<text>" form,
# where the label is "-" or "+".
train_string = """-,just plain boring
-,entirely predictable and lacks energy
-,no surprises and very few laughs
+,very powerful
+,the most fun film of the summer
"""

# Persist the corpus to train.csv. The context manager closes the file even
# if the write fails, and write() is the right call for a single string
# (writelines() would iterate over it character by character).
with open('train.csv', 'w') as f:
  f.write(train_string)

In [None]:
import pandas as pd

# train.csv has no header row, so the column names are supplied explicitly.
data = pd.read_csv(
  'train.csv',
  header=None,
  names=['sentiment', 'content'],
)
data.head()

Unnamed: 0,sentiment,content
0,-,just plain boring
1,-,entirely predictable and lacks energy
2,-,no surprises and very few laughs
3,+,very powerful
4,+,the most fun film of the summer


In [None]:
# SLP3 Chapter 4, page 6, Figure 4-2.
# https://web.stanford.edu/~jurafsky/slp3/4.pdf

# Tokenize every sentence on whitespace to get its list of words.
data['document'] = data['content'].map(str.split)
# The sentiment label doubles as the class of each document.
data['class'] = data.loc[:, 'sentiment']
data.head()

Unnamed: 0,sentiment,content,document,class
0,-,just plain boring,"[just, plain, boring]",-
1,-,entirely predictable and lacks energy,"[entirely, predictable, and, lacks, energy]",-
2,-,no surprises and very few laughs,"[no, surprises, and, very, few, laughs]",-
3,+,very powerful,"[very, powerful]",+
4,+,the most fun film of the summer,"[the, most, fun, film, of, the, summer]",+


In [None]:
# D = the collection of all training documents (one token list per row)
D = data.loc[:, 'document']

In [None]:
# C = the distinct class labels, in order of first appearance
C = data['class'].drop_duplicates().tolist()
print(C)

['-', '+']


# function TRAIN NAIVE BAYES(D, C) returns log P(c) and log P(w|c)

## Calculate P(c) terms

In [None]:
# Ndoc = total number of documents in D
Ndoc = D.shape[0]
print(Ndoc)

0


In [None]:
# Nc = number of documents from D in class c
# value_counts() returns a Series indexed by class label that holds the
# per-class document counts.
Nc = data['class'].value_counts()
print(Nc)

-    3
+    2
Name: class, dtype: int64


In [None]:
# logprior[c] <- log(Nc/Ndoc)
import numpy as np

# Nc is a Series of per-class document counts, so the division and the log
# are applied element-wise and logprior stays indexed by class label.
logprior = np.log(Nc / Ndoc)
print(logprior)

-    3
+    2
Name: class, dtype: int64


In [None]:
# V <- vocabulary of D
# dict.fromkeys() drops repeated words while keeping first-seen order, and
# avoids rescanning a growing list for every token.
V = list(dict.fromkeys(w for d in D for w in d))

print(len(V))
print(V[:5])

0
[]


In [None]:
# bigdoc[c] <- append(d) for d ∈ D with class c
# Group the token lists by class label and sum them; summing Python lists
# concatenates them, giving one long token list per class.
bigdoc = D.groupby(data['class']).sum()
print(bigdoc)

class
+    [very, powerful, the, most, fun, film, of, the...
-    [just, plain, boring, entirely, predictable, a...
Name: document, dtype: object


## Calculate P(w|c) terms

In [None]:
# count(w,c) <- # of occurrences of w in bigdoc[c]
from collections import Counter
# One Counter per class: count[c][w] is how many times w occurs in bigdoc[c]
# (a Counter yields 0 for a word it has not seen).
count = bigdoc.apply(Counter)
print(count)

In [None]:
# loglikelihood[w,c] ← log (count(w,c) + 1)/(∑_{w′in V} (count (w′,c) + 1))
from collections import defaultdict

# loglikelihood[c][w]: dictionary of dictionaries (class -> word -> log P(w|c))
loglikelihood = defaultdict(dict)
for c in C:
  # Add-one smoothed denominator: total tokens in the class plus |V|.
  # It does not depend on w, so it is computed once per class.
  denominator = sum(count[c].values()) + len(V)
  for w in V:
    loglikelihood[c][w] = np.log((count[c][w] + 1) / denominator)

# Present as a table: one row per vocabulary word, one column per class.
loglikelihood = pd.DataFrame(loglikelihood)
loglikelihood.head(20)

# function TEST NAIVE BAYES(testdoc, logprior, loglikelihood, C, V) returns best c

In [None]:
# test
testdoc = 'predictable with no fun'.split()

# sum[c] ← logprior[c]
# Work on a copy so the accumulation does not modify logprior itself.
sums = logprior.copy()
for c in C:
  # word ← testdoc[i]
  for word in testdoc:
    # Words outside the training vocabulary are skipped.
    if word in V:
      # sum[c] ← sum[c] + loglikelihood[word,c]
      sums[c] += loglikelihood[c][word]

print(sums)

-    3
+    2
Name: class, dtype: int64


In [None]:
# argmax_c sum[c]: the label whose accumulated log score is largest
best_class = sums.idxmax()
print(best_class)

-
