In [None]:
# homework 1 task 2
# contents of this document
# 1. TF-IDF features are generated
# two naive bayes classificators are trained (one based on word frequency vectors and one based on TF-IDF features) and compared

In [1]:
# data import

import pandas as pd
import os
here = os.path.dirname("C:/Users/Feli/Documents/NTHU/DataMining/Lab/DMLab1/NewDataset") if "__file__" in locals() else "."

files = [("amazon", "C:/Users/Feli/Documents/NTHU/DataMining/Lab/DMLab1/NewDataset/sentiment labelled sentences/amazon_cells_labelled.txt"),
         ("imdb", "C:/Users/Feli/Documents/NTHU/DataMining/Lab/DMLab1/NewDataset/sentiment labelled sentences/imdb_labelled.txt"),
         ("yelp", "C:/Users/Feli/Documents/NTHU/DataMining/Lab/DMLab1/NewDataset/sentiment labelled sentences/yelp_labelled.txt")]
    
dfs = []

# 3. data transformation: pandas dataframe

for provider, name in files:
    df = pd.read_csv(name, sep="\t")
    df.columns = ["sentence", "label"]
    df["provider"] = provider
    dfs.append(df)

data = pd.concat(dfs, axis=0)

# visualization of some records in table form
print(data.head(6))
print("shape", data.shape)

                                            sentence  label provider
0                        Good case, Excellent value.      1   amazon
1                             Great for the jawbone.      1   amazon
2  Tied to charger for conversations lasting more...      0   amazon
3                                  The mic is great.      1   amazon
4  I have to jiggle the plug to get it to line up...      0   amazon
5  If you have several dozen or several hundred c...      0   amazon
shape (2745, 3)


In [27]:
# 1. generation of TF-IDF features

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer()

# compute the frequency of the words in a sentence of the dataset
data_counts = count_vect.fit_transform(data.sentence)

In [3]:
# tf-idf scores
 
tfidf_transformer=TfidfTransformer()
tfidf_transformer.fit(data_counts)
 
tf_idf_vector=tfidf_transformer.transform(data_counts)
feature_names = count_vect.get_feature_names()

In [6]:
# print the tfidf values of the first document for visualization reasons
 
first_document_vector=tf_idf_vector[0]

# print the scores
df_first = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])

# sort descending
df_first.sort_values(by=["tfidf"],ascending=False)

# some words and their tfidf weights of the first documents can be seen below

Unnamed: 0,tfidf
value,0.634657
case,0.511961
excellent,0.470853
good,0.336754
00,0.000000
...,...
fell,0.000000
feisty,0.000000
feet,0.000000
feels,0.000000


In [14]:
# 2. naive bayes classificators

In [17]:
# important imports for training naive bayes classificators

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# initialize classifier
mnb = MultinomialNB()
gnb = GaussianNB()

# according to https://hub.packtpub.com/implementing-3-naive-bayes-classifiers-in-scikit-learn/ 
# a multinomial naive bayes assumes to have feature vector where each element represents the number of times it appears 
# this is the case then training the classifier with word frequency features
# thus, a multinomial naive bayes is used to train the bayes classifyer based on word frequency vectors
# gaussian naive bayes is based on a continuous distribution and it’s suitable for more generic classification tasks
# thus, gaussian naive bayes is used to train the bayes classificator based on TF-IDF features

In [21]:
# train naive bayes classifier with word frequency features

count_vect = CountVectorizer()
data_counts = count_vect.fit_transform(data.sentence)

label_names_wf = ['positive', 'negative']
labels_wf = data['label']
feature_names_wf = count_vect.get_feature_names()
features_wf = data_counts

# split our data randomly in test and training data
train, test, train_labels, test_labels = train_test_split(features_wf,
                                                          labels_wf,
                                                          test_size=0.33,
                                                          random_state=42)
# train lassifier
model_wf = mnb.fit(train.toarray(), train_labels)
preds_wf = model_wf.predict(test.toarray())

# evaluate classifier

# accuracy = (TP+TN)/(TP+FP+TN+FN)
print("accuracy: ", accuracy_score(test_labels, preds_wf))

# precision = TP/(TP + FP)
print("precision: ", precision_score(test_labels, preds_wf))

# recall = TP/(TP+FN)
print("recall: ", recall_score(test_labels, preds_wf))


accuracy:  0.8123620309050773
precision:  0.8194130925507901
recall:  0.8013245033112583


In [19]:
# train naive bayes classifier with TF-IDF features

label_names_tf = ['positive', 'negative']
labels_tf = data['label']
feature_names_tf = feature_names # see[11]
features_tf = tf_idf_vector

# train and build classifier
train, test, train_labels, test_labels = train_test_split(features_tf,
                                                          labels_tf,
                                                          test_size=0.33,
                                                        random_state=42)
model_tf = gnb.fit(train.toarray(), train_labels)
preds_tf = model_tf.predict(test.toarray())

# evaluate classifier

# accuracy = (TP+TN)/(TP+FP+TN+FN)
print("accuracy: ", accuracy_score(test_labels, preds_tf))

# precision = TP/(TP + FP)
print("precision: ", precision_score(test_labels, preds_tf))

# recall = TP/(TP+FN)
print("recall: ", recall_score(test_labels, preds_tf))

accuracy:  0.6589403973509934
precision:  0.7033898305084746
recall:  0.5496688741721855


In [104]:
# as the recall of the second classiefier is much lower than the first one, the number of false negatives is much higher
# the precision value of the scoond classifier is about 0.1 lower. Thus, the number of false negatives is higher too
# the accuracy is lower, too. This means that the number of false classifications (FP + FN) is higher in the second classifier
# the first classifier (bayes classifier trained with the word frequency vector) leads to better results