##### In this notebook, we will explore two method
    1. using dense one hot
    2. using feature generation
    3. Naive bayes
    
Preprocessing is not covered in this notebook. 

In [7]:
import itertools
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import TreebankWordTokenizer

In [51]:
all_positive_tweets = [
    'I am happy because I am learning NLP',
    'I am happy not sad'
]

all_negative_tweets = [
    'I am sad I am not learning NLP',
    'I am sad not happy'
]

tweets = all_positive_tweets + all_negative_tweets
tweets

['I am happy because I am learning NLP',
 'I am happy not sad',
 'I am sad I am not learning NLP',
 'I am sad not happy']

## Method -1 : Dense One Hot Representation

    - create feature matrix
    - cons : not scalable, create problem when vocab size will increase.

In [52]:
tokenizer = TreebankWordTokenizer()
tweets_tokenized = [tokenizer.tokenize(sent) for sent in tweets]
tweets_tokenized

[['I', 'am', 'happy', 'because', 'I', 'am', 'learning', 'NLP'],
 ['I', 'am', 'happy', 'not', 'sad'],
 ['I', 'am', 'sad', 'I', 'am', 'not', 'learning', 'NLP'],
 ['I', 'am', 'sad', 'not', 'happy']]

In [53]:
vocab = list(set(list(itertools.chain.from_iterable(tweets_tokenized))))
print(vocab)

['I', 'am', 'NLP', 'not', 'because', 'happy', 'sad', 'learning']


In [54]:
idx2word =  dict(enumerate(vocab))
word2idx =  {w:i for i,w in idx2word.items()}

In [55]:
feature_mat = np.zeros(shape = (len(tweets),len(idx2word)))

In [56]:
for i,tweet_list in enumerate(tweets_tokenized):
    for w in tweet_list:
        feature_mat[i,word2idx[w]] = 1

In [57]:
pd.DataFrame(feature_mat, columns=word2idx, index=tweets)

Unnamed: 0,I,am,NLP,not,because,happy,sad,learning
I am happy because I am learning NLP,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
I am happy not sad,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
I am sad I am not learning NLP,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
I am sad not happy,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


## Method -2 : Feature Generation

    - here i am not doing any preprocessing.
    - generate fetaures [bias, freq_of_word_in_positive_tweets, freq_of_word_in_positive_tweets]

In [58]:
tokenizer = TreebankWordTokenizer()

print ("*"*20 + "  Positve Tweets  " + "*"*20)
pos_tweets = [tokenizer.tokenize(sent) for sent in all_positive_tweets]
print("tokens : ",pos_tweets)

pos_words = list(itertools.chain.from_iterable(pos_tweets))
print("words",pos_words)

pos_freq_dict = nltk.FreqDist(pos_words)
print("freq dict ", pos_freq_dict.items())

print("\n\n")
print ("*"*20 + "  Negative Tweets  " + "*"*20)
neg_tweets = [tokenizer.tokenize(sent) for sent in all_negative_tweets]
print("tokens : ",neg_tweets)

neg_words = list(itertools.chain.from_iterable(neg_tweets))
print("words",pos_words)

neg_freq_dict = nltk.FreqDist(neg_words)
print("freq dict ", neg_freq_dict.items())

print ("\n\n" + "*"*20 + "  Vocab  " + "*"*20)
print("Complete Vocab :", vocab)

********************  Positve Tweets  ********************
tokens :  [['I', 'am', 'happy', 'because', 'I', 'am', 'learning', 'NLP'], ['I', 'am', 'happy', 'not', 'sad']]
words ['I', 'am', 'happy', 'because', 'I', 'am', 'learning', 'NLP', 'I', 'am', 'happy', 'not', 'sad']
freq dict  dict_items([('I', 3), ('am', 3), ('happy', 2), ('because', 1), ('learning', 1), ('NLP', 1), ('not', 1), ('sad', 1)])



********************  Negative Tweets  ********************
tokens :  [['I', 'am', 'sad', 'I', 'am', 'not', 'learning', 'NLP'], ['I', 'am', 'sad', 'not', 'happy']]
words ['I', 'am', 'happy', 'because', 'I', 'am', 'learning', 'NLP', 'I', 'am', 'happy', 'not', 'sad']
freq dict  dict_items([('I', 3), ('am', 3), ('sad', 2), ('not', 2), ('learning', 1), ('NLP', 1), ('happy', 1)])


********************  Vocab  ********************
Complete Vocab : ['I', 'am', 'NLP', 'not', 'because', 'happy', 'sad', 'learning']


In [59]:
feature_mat = np.zeros(shape = (len(tweets), 3))

In [60]:
for i,t in enumerate(tweets_tokenized):
    feature_mat[i,0] = 1
    pos_freq = 0
    neg_freq = 0
    for w in set(t):
        pos_freq += pos_freq_dict[w]
        neg_freq += neg_freq_dict[w]
    feature_mat[i,1] = pos_freq
    feature_mat[i,2] = neg_freq

In [61]:
print("Pos dict ",pos_freq_dict.items())
print("Neg dict ",neg_freq_dict.items())

Pos dict  dict_items([('I', 3), ('am', 3), ('happy', 2), ('because', 1), ('learning', 1), ('NLP', 1), ('not', 1), ('sad', 1)])
Neg dict  dict_items([('I', 3), ('am', 3), ('sad', 2), ('not', 2), ('learning', 1), ('NLP', 1), ('happy', 1)])


In [62]:
pd.DataFrame(feature_mat, index=tweets, columns=['bias','pos_freq','neg_freq'])

Unnamed: 0,bias,pos_freq,neg_freq
I am happy because I am learning NLP,1.0,11.0,9.0
I am happy not sad,1.0,10.0,11.0
I am sad I am not learning NLP,1.0,10.0,12.0
I am sad not happy,1.0,10.0,11.0


You can apply logistic regression on above feature matrix

## Mehtod -3 : Naive Bayes

In [98]:
## Using pos_freq_dict and neg_freq_dict create dataframe
df = pd.DataFrame(columns=['pos','neg'], index= vocab)

In [99]:
for k,v in pos_freq_dict.items():
    df.loc[k,'pos'] = v
    
for k,v in neg_freq_dict.items():
    df.loc[k,'neg'] = v
    
df.fillna(0, inplace = True)

In [100]:
total_pos_freq = df.pos.sum()
total_neg_freq = df.neg.sum()

In [101]:
df['pos_prob'] = df.pos/total_pos_freq
df['neg_prob'] = df.neg/total_neg_freq

In [102]:
df

Unnamed: 0,pos,neg,pos_prob,neg_prob
I,3,3,0.230769,0.230769
am,3,3,0.230769,0.230769
NLP,1,1,0.076923,0.076923
not,1,2,0.076923,0.153846
because,1,0,0.076923,0.0
happy,2,1,0.153846,0.076923
sad,1,2,0.076923,0.153846
learning,1,1,0.076923,0.076923


###### If you want to calculate the prob of tweet

##### tweet = i am learning NLP => prob(i) * prob(am) * prob(learning) * prob(NLP)


    Ratio(tweet) = P(pos) / P(neg)

    if Ratio(tweet) >  1 then positive else negative sentiment
    
    
    Problem : Some words have zero probability that will create the prob. Use laplacian smoothing to remove zero prob terms.
    
    p(w) = (p(w,class) + 1) / f(class) + N(class)

In [103]:
# N_pos = > unique word in positive tweets
# N_neg => unique words in negative tweets
N_pos = len(pos_freq_dict)
N_neg = len(neg_freq_dict)
N_pos, N_neg

(8, 7)

In [104]:
df['pos_prob_after_laplacian'] = (df.pos + 1)/(total_pos_freq+N_pos)
df['neg_prob_after_laplacian'] = (df.neg + 1)/(total_neg_freq+N_neg)
df['ratio(pos/neg)'] = df.pos_prob_after_laplacian / df.neg_prob_after_laplacian

In [105]:
df

Unnamed: 0,pos,neg,pos_prob,neg_prob,pos_prob_after_laplacian,neg_prob_after_laplacian,ratio(pos/neg)
I,3,3,0.230769,0.230769,0.190476,0.2,0.952381
am,3,3,0.230769,0.230769,0.190476,0.2,0.952381
NLP,1,1,0.076923,0.076923,0.095238,0.1,0.952381
not,1,2,0.076923,0.153846,0.095238,0.15,0.634921
because,1,0,0.076923,0.0,0.095238,0.05,1.904762
happy,2,1,0.153846,0.076923,0.142857,0.1,1.428571
sad,1,2,0.076923,0.153846,0.095238,0.15,0.634921
learning,1,1,0.076923,0.076923,0.095238,0.1,0.952381


In [112]:
df = df.sort_values('ratio(pos/neg)')
df

Unnamed: 0,pos,neg,pos_prob,neg_prob,pos_prob_after_laplacian,neg_prob_after_laplacian,ratio(pos/neg)
not,1,2,0.076923,0.153846,0.095238,0.15,0.634921
sad,1,2,0.076923,0.153846,0.095238,0.15,0.634921
I,3,3,0.230769,0.230769,0.190476,0.2,0.952381
am,3,3,0.230769,0.230769,0.190476,0.2,0.952381
NLP,1,1,0.076923,0.076923,0.095238,0.1,0.952381
learning,1,1,0.076923,0.076923,0.095238,0.1,0.952381
happy,2,1,0.153846,0.076923,0.142857,0.1,1.428571
because,1,0,0.076923,0.0,0.095238,0.05,1.904762


##### As you can see from above table 
     
     - positive words => hsappy because
     - negative words => not sad 

In [115]:
np.log(df['ratio(pos/neg)'])

not        -0.454255
sad        -0.454255
I          -0.048790
am         -0.048790
NLP        -0.048790
learning   -0.048790
happy       0.356675
because     0.644357
Name: ratio(pos/neg), dtype: float64

### Below are the scale used to  calculate the sentiment

        p(w/pos)/p(w/neg) ==>  negative - neutral - positive
                                 0          1      infinity
                             
                             
                             
        log(p(w/pos)/p(w/neg)) ==>    negative       - neutral       - positive
                                      -infinity         0              infinity
                    