In [61]:
import math
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Two toy documents that differ in only two words; they drive the manual TF-IDF walkthrough.
doc_a, doc_b = "The cat sat on my face", "The dog sat on my bed"

In [3]:
# Tokenise each document on single spaces to obtain its bag of words.
bow_a, bow_b = (doc.split(" ") for doc in (doc_a, doc_b))

In [4]:
# Inspect the tokens of the second document.
bow_b

['The', 'dog', 'sat', 'on', 'my', 'bed']

### Merge two sentences

In [5]:
# Vocabulary: every distinct token that appears in either document.
word_set = set(bow_a) | set(bow_b)

In [6]:
# Show the combined vocabulary of both documents.
word_set

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

### Convert to dictionary and set initial values to 0

In [7]:
# One zero-initialised counter per document, keyed by every vocabulary word.
word_dict_a = {word: 0 for word in word_set}
word_dict_b = {word: 0 for word in word_set}

### Count the words in each sentence

In [8]:
# Tally how often each token occurs in its own document. The counters are
# zeroed first so that re-running this cell does not add on top of the counts
# left behind by a previous run.
for counts, tokens in ((word_dict_a, bow_a), (word_dict_b, bow_b)):
    for word in counts:
        counts[word] = 0
    for word in tokens:
        counts[word] += 1

In [9]:
# Inspect the per-word counts for the second document; vocabulary words it does not contain stay at 0.
word_dict_b

{'bed': 1, 'my': 1, 'cat': 0, 'on': 1, 'face': 0, 'sat': 1, 'The': 1, 'dog': 1}

### Convert to a DataFrame

In [11]:
# Raw count matrix: one row per document, one column per vocabulary word.
pd.DataFrame([word_dict_a, word_dict_b])

Unnamed: 0,bed,my,cat,on,face,sat,The,dog
0,0,1,1,1,1,1,1,0
1,1,1,0,1,0,1,1,1


### Compute Term Frequency

![Term frequency formula](tf.png "Term frequency (TF)")

In [21]:
def computeTF(word_dict, bow):
    """Return the term frequency of every vocabulary word in one document.

    Args:
        word_dict: Mapping of word -> number of occurrences in the document.
        bow: The document's tokens. Only its length is used, as the total
            that each count is divided by.

    Returns:
        A new dict mapping each key of ``word_dict`` to ``count / len(bow)``
        as a float. When ``bow`` is empty every frequency is ``0.0`` rather
        than raising ``ZeroDivisionError``.
    """
    bow_count = len(bow)
    if bow_count == 0:
        # An empty document contains no occurrences of any word.
        return {word: 0.0 for word in word_dict}

    return {word: count / float(bow_count) for word, count in word_dict.items()}

In [1]:
# Sanity check: one occurrence in a six-token document gives a term frequency of 1/6.
1/6

0.16666666666666666

In [23]:
# Term frequencies for document A ("The cat sat on my face").

tf_bow_a = computeTF(word_dict_a, bow_a)
tf_bow_a

{'bed': 0.0,
 'my': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'face': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'The': 0.16666666666666666,
 'dog': 0.0}

In [22]:
# Term frequencies for document B ("The dog sat on my bed").

tf_bow_b = computeTF(word_dict_b, bow_b)
tf_bow_b

{'bed': 0.16666666666666666,
 'my': 0.16666666666666666,
 'cat': 0.0,
 'on': 0.16666666666666666,
 'face': 0.0,
 'sat': 0.16666666666666666,
 'The': 0.16666666666666666,
 'dog': 0.16666666666666666}

### Compute IDF

![Inverse document frequency formula](idf.png "Inverse document frequency (IDF)")

In [40]:
def computeIDF(doc_list, verbose=True):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        doc_list: List of per-document dicts mapping word -> occurrence count.
            A word counts as present in a document when its count is > 0.
        verbose: When True (the default), print the document-frequency table
            (word -> number of documents containing it) before converting it
            to IDF values.

    Returns:
        Dict mapping each word to ``log10(N / df)``, where ``N`` is
        ``len(doc_list)`` and ``df`` is the number of documents containing
        the word. Words that occur in no document get ``0.0`` instead of
        raising ``ZeroDivisionError``. An empty ``doc_list`` yields ``{}``.
    """
    number_of_items = len(doc_list)

    # Collect keys from every document (not only the first) so that a word
    # missing from doc_list[0] cannot raise KeyError.
    doc_freq = {}
    for doc in doc_list:
        for word, val in doc.items():
            doc_freq.setdefault(word, 0)
            if val > 0:
                doc_freq[word] += 1

    if verbose:
        print(doc_freq)

    return {
        word: math.log10(number_of_items / float(df)) if df > 0 else 0.0
        for word, df in doc_freq.items()
    }

In [42]:
# Inverse document frequency of every vocabulary word across both documents.
# Words present in both documents get log10(2/2) = 0.
idfs = computeIDF([word_dict_a, word_dict_b])
idfs

{'bed': 1, 'my': 2, 'cat': 1, 'on': 2, 'face': 1, 'sat': 2, 'The': 2, 'dog': 1}


{'bed': 0.3010299956639812,
 'my': 0.0,
 'cat': 0.3010299956639812,
 'on': 0.0,
 'face': 0.3010299956639812,
 'sat': 0.0,
 'The': 0.0,
 'dog': 0.3010299956639812}

### TFIDF

![TF-IDF formula](tfidf.png "TF-IDF")

In [43]:
def computeTFIDF(tf_bow, idfs):
    """Weight each term frequency by that word's inverse document frequency.

    Args:
        tf_bow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; must contain
            every key of ``tf_bow``.

    Returns:
        A new dict mapping each word of ``tf_bow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tf_bow.items()}

In [47]:
# TF-IDF weights for each document, using the shared IDF table.
tfidf_bow_a, tfidf_bow_b = (computeTFIDF(tf, idfs) for tf in (tf_bow_a, tf_bow_b))

In [48]:
# TF-IDF matrix: one row per document; words shared by both documents score 0.
pd.DataFrame([tfidf_bow_a, tfidf_bow_b])

Unnamed: 0,bed,my,cat,on,face,sat,The,dog
0,0.0,0.0,0.050172,0.0,0.050172,0.0,0.0,0.0
1,0.050172,0.0,0.0,0.0,0.0,0.0,0.0,0.050172


### Using the scikit-learn library

In [58]:
# Load the dataset. The CSV is read without a header row and only columns 2 and 3
# are kept (the sentiment label and the text, judging by the preview below).
df = pd.read_csv('twitter_training.csv', header=None, usecols=[2,3])
df.head()

Unnamed: 0,2,3
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [62]:
# NOTE(review): this cell rebinds `df` and overwrites 'Sentiment' in place, so
# re-running it without reloading the CSV re-encodes already-encoded labels.

# give the two loaded columns descriptive names
df.columns = ['Sentiment', 'Text']

# drop rows with a missing label or missing text
df = df.dropna()

# encode the string sentiment labels as integer class ids
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

# model input (raw text) and target (encoded sentiment)
X = list(df['Text'])
y = list(df['Sentiment'])

# hold out 25% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [63]:
# build TF-IDF features from unigrams only (ngram_range=(1,1)), ignoring English stop words
tfidf = TfidfVectorizer(sublinear_tf=True,norm='l2',encoding='utf-8',ngram_range=(1,1), 
                        stop_words='english')

# fit the vectorizer on the training text only, then apply the same vocabulary to the test text
X_train_cv = tfidf.fit_transform(X_train)
X_test_cv = tfidf.transform(X_test)

# train naive bayes classifier on the TF-IDF features
clf = MultinomialNB()
clf.fit(X_train_cv, y_train)

# predict labels for the held-out test set
y_pred = clf.predict(X_test_cv)

# micro-averaged F-1 score, rounded to 4 decimals
# NOTE(review): for single-label multiclass targets micro-F1 should equal accuracy -- confirm against the sklearn docs
score = f1_score(y_test, y_pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7313


In [66]:
# Repeat the experiment for n-gram ranges (1,1) through (1,10) to see how
# including longer n-grams changes the test score.
for N in range(1,11):
    
    # build TF-IDF features using every n-gram length from 1 up to N
    tfidf = TfidfVectorizer(sublinear_tf=True,norm='l2',encoding='utf-8',ngram_range=(1,N), 
                        stop_words='english')
    X_train_cv = tfidf.fit_transform(X_train)
    X_test_cv = tfidf.transform(X_test)
    
    # train a fresh naive bayes model and predict on the test set
    clf = MultinomialNB()
    clf.fit(X_train_cv, y_train)
    y_pred = clf.predict(X_test_cv)
    
    # micro-averaged F-1 score for this n-gram range, rounded to 4 decimals
    score = np.round(f1_score(y_test, y_pred, average='micro'), 4)
    print('F-1 score of model with n-gram range of {}: {}'.format((1,N), score))

F-1 score of model with n-gram range of (1, 1): 0.7313
F-1 score of model with n-gram range of (1, 2): 0.8324
F-1 score of model with n-gram range of (1, 3): 0.8519
F-1 score of model with n-gram range of (1, 4): 0.8529
F-1 score of model with n-gram range of (1, 5): 0.8516
F-1 score of model with n-gram range of (1, 6): 0.8482
F-1 score of model with n-gram range of (1, 7): 0.8451
F-1 score of model with n-gram range of (1, 8): 0.8424
F-1 score of model with n-gram range of (1, 9): 0.8404
F-1 score of model with n-gram range of (1, 10): 0.8375
