In [None]:
import math
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Two toy documents that differ in exactly two words.
doc_a, doc_b = "The cat sat on my face", "The dog sat on my bed"

In [None]:
# Tokenise each document into its bag of words. Without the split, every
# later step would iterate over the characters of the string, not its words.
bow_a = doc_a.split()
bow_b = doc_b.split()

In [None]:
# Display bow_b to inspect the second document's bag of words.
bow_b

### Merge two sentences

In [None]:
# Vocabulary: every distinct item that occurs in either bag.
word_set = set(bow_a) | set(bow_b)

In [None]:
# Display the combined vocabulary.
word_set

### Convert to dictionary and set initial values to 0

In [None]:
# One counter per document, keyed by the full vocabulary and starting at zero,
# so both documents end up with the same set of keys.
word_dict_a = {word: 0 for word in word_set}
word_dict_b = {word: 0 for word in word_set}

### Count the words in each sentence

In [None]:
# Start from zero on every run so that re-executing this cell does not
# double the counts accumulated by a previous execution.
word_dict_a = dict.fromkeys(word_set, 0)
word_dict_b = dict.fromkeys(word_set, 0)

# Tally how often each vocabulary entry occurs in each document.
for word in bow_a:
    word_dict_a[word] += 1

for word in bow_b:
    word_dict_b[word] += 1

In [None]:
# Display the per-entry counts for the second document.
word_dict_b

### Convert to a DataFrame

In [None]:
# Tabulate the counts: one row per document, one column per vocabulary entry.
pd.DataFrame([word_dict_a, word_dict_b])

### Compute Term Frequency

![Term frequency (TF)](tf.png "Term frequency")

In [None]:
def computeTF(word_dict, bow):
    """Compute the term frequency of every word for one document.

    TF(word) = count of the word in the document / number of tokens in the
    document.

    Args:
        word_dict: Mapping of word -> raw count for this document.
        bow: The document's tokens; only its length is used.

    Returns:
        dict mapping each key of ``word_dict`` to its count divided by
        ``len(bow)``. For an empty ``bow`` every frequency is 0.0 rather
        than raising ``ZeroDivisionError``.
    """
    bow_count = len(bow)
    if bow_count == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in word_dict}

    return {word: count / bow_count for word, count in word_dict.items()}

In [None]:
# Term frequencies for doc_a: "The cat sat on my face"

tf_bow_a = computeTF(word_dict_a, bow_a)
tf_bow_a

In [None]:
# Term frequencies for doc_b: "The dog sat on my bed"

tf_bow_b = computeTF(word_dict_b, bow_b)
tf_bow_b

### Compute IDF

![Inverse document frequency (IDF)](idf.png "Inverse document frequency")

In [None]:
def computeIDF(doc_list, verbose=True):
    """Compute the inverse document frequency of every word in a corpus.

    IDF(word) = log10(N / df(word)), where N is the number of documents and
    df(word) is the number of documents whose count for that word is > 0.

    Args:
        doc_list: Sequence of word -> count dicts, one per document.
        verbose: When True (the default), print the document-frequency
            table before it is converted into IDF values.

    Returns:
        dict mapping every word seen in any document to its IDF. A word
        whose count is 0 in every document gets 0.0 (it can never
        contribute to a TF-IDF score) instead of raising
        ``ZeroDivisionError``. An empty ``doc_list`` yields ``{}``.
    """
    number_of_items = len(doc_list)
    if number_of_items == 0:
        return {}

    # Document frequency: in how many documents does each word occur?
    # Keys are collected from every document, not just the first one, so a
    # word missing from doc_list[0] no longer raises KeyError.
    doc_freq = {}
    for doc in doc_list:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    if verbose:
        print(doc_freq)

    return {
        word: math.log10(number_of_items / freq) if freq else 0.0
        for word, freq in doc_freq.items()
    }

In [None]:
# Inverse document frequencies over the two-document corpus.
idfs = computeIDF([word_dict_a, word_dict_b])
idfs

### TFIDF

![TF-IDF](tfidf.png "TF-IDF")

In [None]:
def computeTFIDF(tf_bow, idfs):
    """Weight each term frequency of one document by the word's IDF.

    Args:
        tf_bow: Mapping of word -> term frequency for a single document.
        idfs: Mapping of word -> inverse document frequency; must contain
            every key of ``tf_bow``.

    Returns:
        dict mapping each word of ``tf_bow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tf_bow.items()}

In [None]:
# TF-IDF scores per document: that document's term frequencies weighted by
# the corpus-wide inverse document frequencies.
tfidf_bow_a = computeTFIDF(tf_bow_a, idfs)
tfidf_bow_b = computeTFIDF(tf_bow_b, idfs)

In [None]:
# Tabulate the two TF-IDF results side by side, one row per document.
pd.DataFrame([tfidf_bow_a, tfidf_bow_b])

### Using the scikit-learn library

In [None]:
# Read only columns 2 and 3 of the CSV (renamed to Sentiment and Text in the
# next cell); header=None because the file is read without a header row.
df = pd.read_csv(
    'twitter_training.csv',
    header=None,
    usecols=[2, 3],
)
df.head()

In [None]:
# Give the two positional columns meaningful names.
df.columns = ['Sentiment', 'Text']

# Drop every row that has a missing value.
df = df.dropna()

# Replace the sentiment labels with integer class ids.
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

# Inputs are the texts, targets are the encoded labels.
X = df['Text'].tolist()
y = df['Sentiment'].tolist()

# Hold out 25% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Build TF-IDF features from unigrams only (ngram_range=(1, 1)), with English
# stop words removed, sub-linear TF scaling and L2-normalised rows.
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='utf-8',
                        ngram_range=(1, 1), stop_words='english')

# Learn the vocabulary and IDF weights on the training texts only, then apply
# the same transformation to the test texts.
X_train_cv = tfidf.fit_transform(X_train)
X_test_cv = tfidf.transform(X_test)

# train naive bayes classifier
clf = MultinomialNB()
clf.fit(X_train_cv, y_train)

# create predictions
y_pred = clf.predict(X_test_cv)

# find f-1 score (micro-averaged over all classes)
score = f1_score(y_test, y_pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

In [None]:
# Compare models whose features include every n-gram length from 1 up to N.
for N in range(1,11):

    # Build TF-IDF features for n-grams of length 1..N, fitted on the
    # training texts only.
    tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='utf-8',
                            ngram_range=(1, N), stop_words='english')
    X_train_cv = tfidf.fit_transform(X_train)
    X_test_cv = tfidf.transform(X_test)

    # train model and generate predictions
    clf = MultinomialNB()
    clf.fit(X_train_cv, y_train)
    y_pred = clf.predict(X_test_cv)

    # compute f-1 score (micro-averaged over all classes)
    score = np.round(f1_score(y_test, y_pred, average='micro'), 4)
    print('F-1 score of model with n-gram range of {}: {}'.format((1,N), score))