In [33]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(train.head())
print(test.head())

print(np.where(pd.isnull(train)))
print(np.where(pd.isnull(test)))

print(train.shape)
print(test.shape)

   Index  Sentiment                                               Text
0      0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      1          0  is upset that he can't update his Facebook by ...
2      2          0  @Kenichan I dived many times for the ball. Man...
3      3          0    my whole body feels itchy and like its on fire 
4      4          0  @nationwideclass no, it's not behaving at all....
   Index  Sentiment                                               Text
0      0          1  @stellargirl I loooooooovvvvvveee my Kindle2. ...
1      1          1  Reading my kindle2...  Love it... Lee childs i...
2      2          1  Ok, first assesment of the #kindle2 ...it fuck...
3      3          1  @kenburbary You'll love your Kindle2. I've had...
4      4          1  @mikefish  Fair enough. But i have the Kindle2...
(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))
(1048575, 3)
(359, 3)


In [34]:
'''

Part 2 - Text Preprocessing

'''


# 1: lower-casing
train['Text'] = train['Text'].str.lower()
print(train.head())

test['Text'] = test['Text'].str.lower()
print(test.head())

# 2: remove digital numbers

import re #python regular expression library

train['Text'] = train['Text'].apply(lambda x: re.sub('[0-9]', '', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('[0-9]', '', x).strip())
print(train.head())
print(test.head())


# 3: Remove urls
                    
train['Text'] = train['Text'].apply(lambda x: re.sub('http\S+', ' ', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('http\S+', ' ', x).strip())
print(train.head())
print(test.head())


# 4: Remove username

train['Text'] = train['Text'].apply(lambda x: re.sub('@[^\s]+', '', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('@[^\s]+', '', x).strip())
print(train.head())
print(test.head())



# 5: Remove special character and puncation 
train['Text'] = train['Text'].apply(lambda x: re.sub('[^a-z0-9<>\']', ' ', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('[^a-z0-9<>\']', ' ', x).strip())
print(train.head())
print(test.head())



   Index  Sentiment                                               Text
0      0          0  @switchfoot http://twitpic.com/2y1zl - awww, t...
1      1          0  is upset that he can't update his facebook by ...
2      2          0  @kenichan i dived many times for the ball. man...
3      3          0    my whole body feels itchy and like its on fire 
4      4          0  @nationwideclass no, it's not behaving at all....
   Index  Sentiment                                               Text
0      0          1  @stellargirl i loooooooovvvvvveee my kindle2. ...
1      1          1  reading my kindle2...  love it... lee childs i...
2      2          1  ok, first assesment of the #kindle2 ...it fuck...
3      3          1  @kenburbary you'll love your kindle2. i've had...
4      4          1  @mikefish  fair enough. but i have the kindle2...
   Index  Sentiment                                               Text
0      0          0  @switchfoot http://twitpic.com/yzl - awww, tha...
1     

In [35]:
# 6: Stemming
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer("english") #define stemming dict
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

train['Text'] = train['Text'].apply(lambda x : stem_sentences(x))
test['Text'] = test['Text'].apply(lambda x : stem_sentences(x))

print(train.head())
print(test.head())


# Get sample size

train = train.sample(frac=0.001, random_state=1)
# test = test.sample(frac=0.005, random_state=1)



   Index  Sentiment                                               Text
0      0          0  [awww, that, a, bummer, you, shoulda, got, dav...
1      1          0  [is, upset, that, he, can't, updat, his, faceb...
2      2          0  [i, dive, mani, time, for, the, ball, manag, t...
3      3          0  [my, whole, bodi, feel, itchi, and, like, it, ...
4      4          0  [no, it, not, behav, at, all, i'm, mad, whi, a...
   Index  Sentiment                                               Text
0      0          1  [i, loooooooovvvvvvee, my, kindl, not, that, t...
1      1          1  [read, my, kindl, love, it, lee, child, is, go...
2      2          1  [ok, first, asses, of, the, kindl, it, fuck, r...
3      3          1  [you'll, love, your, kindl, i'v, had, mine, fo...
4      4          1  [fair, enough, but, i, have, the, kindl, and, ...


In [36]:
'''

Part 3 - Linguistic Feature Extraction

'''

# 1. Bag of Words (train.csv)
import numpy as np

wordCount = {}
for tokens in train['Text']:
    for word in tokens:
        if word not in wordCount:
            wordCount[word] = 1
        else:
            wordCount[word] += 1

unique_words = list(wordCount.keys())

bag_of_words = []

for tokens in train['Text']:
    bag_vector = np.zeros(len(unique_words))
    for words in tokens:
        for i, word in enumerate(unique_words):
            if word == words:
                bag_vector[i] += 1
    bag_of_words.append(bag_vector.tolist())


In [37]:
# 1. Bag of Words (test.csv)
import numpy as np

wordCount = {}
for tokens in test['Text']:
    for word in tokens:
        if word not in wordCount:
            wordCount[word] = 1
        else:
            wordCount[word] += 1

unique_words2 = list(wordCount.keys())

print(unique_words2)
print(len(unique_words2))

bag_of_words_test = []

for tokens in test['Text']:
    bag_vector = np.zeros(len(unique_words2))
    for words in tokens:
        for i, word in enumerate(unique_words2):
            if word == words:
                bag_vector[i] += 1
    bag_of_words_test.append(bag_vector.tolist())


print(bag_of_words_test)

['i', 'loooooooovvvvvvee', 'my', 'kindl', 'not', 'that', 'the', 'dx', 'is', 'cool', 'but', 'fantast', 'in', 'it', 'own', 'right', 'read', 'love', 'lee', 'child', 'good', 'ok', 'first', 'asses', 'of', 'fuck', 'rock', "you'll", 'your', "i'v", 'had', 'mine', 'for', 'a', 'few', 'month', 'and', 'never', 'look', 'back', 'new', 'big', 'one', 'huge', 'no', 'need', 'remors', 'fair', 'enough', 'have', 'think', 'perfect', 'too', "i'm", 'quit', 'happi', 'with', 'this', 'economi', 'hate', 'aig', 'their', 'non', 'loan', 'given', 'ass', 'jqueri', 'best', 'friend', 'twitter', 'how', 'can', 'you', 'obama', 'he', 'make', 'joke', 'about', 'himself', 'firm', 'believ', 'pelosi', 'zero', 'desir', 'to', 'be', 'civil', 'charad', 'slogan', 'they', 'want', 'destroy', 'conservat', 'hous', 'correspond', 'dinner', 'was', 'last', 'night', 'whoopi', 'barbara', 'amp', 'sherri', 'went', 'got', 'stand', 'ovat', 'watchin', 'espn', 'jus', 'seen', 'nike', 'commer', 'puppet', 'lebron', 'sh', 't', 'hilari', 'lmao', 'dear', 

In [38]:
# 2. TF*IDF (train.csv)

# Find term-frequency: num of that word in sentence/ number of words in sentence

num_comments = len(train['Text'])
train_list = train['Text'].tolist()

# print(train_list)

df_tf = pd.DataFrame(np.zeros((num_comments, len(unique_words))), columns = unique_words)

for i in range(num_comments):
    for w in train_list[i]:
        df_tf[w][i] = df_tf[w][i] + (1/len(train_list[i]))

# print("Term Frequency: \n", df_tf)


# Inverse Document Frequency: log(num of comments / word in all sentences)
idf = {}

for w in unique_words:
    k = 0 

    for i in range(num_comments):
        if w in train_list[i]:
            k += 1
    
    idf[w] = np.log10(num_comments/k)

# print("IDF of: \n", idf)

# TF*DF = term-frequency * Inverse Document Frequency 

df_tf_idf = df_tf.copy()
for w in unique_words:
    for i in range(num_comments):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]


# print("TF*DF of: \n", df_tf_idf)



In [39]:
# 2. TF*IDF (test.csv)

# Find term-frequency: num of that word/ number of words in sentence


num_comments = len(test['Text'])
test_list = test['Text'].tolist()

# print(test_list)

df_tf2 = pd.DataFrame(np.zeros((num_comments, len(unique_words2))), columns = unique_words2)

for i in range(num_comments):
    for w in test_list[i]:
        df_tf2[w][i] = df_tf2[w][i] + (1/len(test_list[i]))

# print("Term Frequency: \n", df_tf2)


# Inverse Document Frequency: log(num of comments / word in all sentences)
idf2 = {}

for w in unique_words2:
    k = 0 

    for i in range(num_comments):
        if w in test_list[i]:
            k += 1
    
    idf2[w] = np.log10(num_comments/k)

# print("IDF of: \n", idf2)

# TF*DF = term-frequency * Inverse Document Frequency 

df_tf_idf2 = df_tf2.copy()
for w in unique_words2:
    for i in range(num_comments):
        df_tf_idf2[w][i] = df_tf2[w][i] * idf2[w]


# print("TF*DF of: \n", df_tf_idf2)



In [40]:
# 3: Word2Vec (train.csv)
import gensim
from gensim.models import Word2Vec

train_list = train['Text'].tolist()

model = Word2Vec(train_list, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")


print(model)


word_model_train = []
vocab = model.wv.key_to_index.keys()
# word_model_train.append(vocab)

for sentence in train_list:
    temp = []
    for val in vocab:
        if val in sentence:
            temp.append(sum(model.wv[val]))
        else:
            temp.append(0)
    word_model_train.append(temp)

print(len(word_model_train[0]))
print(len(train_list))



Word2Vec<vocab=2680, vector_size=100, alpha=0.025>
2680
1049


In [41]:
# 3: Word2Vec (test.csv)
import gensim
from gensim.models import Word2Vec

test_list = test['Text'].tolist()

model = Word2Vec(test_list, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vectest.model")

print(model)

word_model_test = []
vocab = model.wv.key_to_index.keys()
# word_model_test.append(vocab)

for sentence in test_list:
    temp = []
    for val in vocab:
        if val in sentence:
            temp.append(sum(model.wv[val]))
        else:
            temp.append(0)
    word_model_test.append(temp)


Word2Vec<vocab=1414, vector_size=100, alpha=0.025>


In [55]:
#Combined Bag_of_words into dataframe for model training
df = pd.DataFrame(bag_of_words, columns = unique_words)
df2 = pd.DataFrame(bag_of_words_test, columns = unique_words2)

df_merged_bag = pd.concat([df, df2], axis=0).reset_index(drop=True)
df_merged_bag.fillna(0, inplace=True)
df_merged2_bag = pd.concat([test, train], axis=0).reset_index(drop=True)

# print(df_merged_bag.shape)
# print(df_merged2_bag.shape)


In [56]:
#Combined TF*IDF into dataframe for model training
df = pd.DataFrame(df_tf_idf, columns = unique_words)
df2 = pd.DataFrame(df_tf_idf2, columns = unique_words2)

df_merged_tf_idf = pd.concat([df, df2], axis=0).reset_index(drop=True)
df_merged_tf_idf.fillna(0, inplace=True)
df_merged2_tf_idf = pd.concat([test, train], axis=0).reset_index(drop=True)


In [57]:
#Combined word2Vec into dataframe for model training
df = pd.DataFrame(word_model_train, columns = unique_words)
df2 = pd.DataFrame(word_model_test, columns = unique_words2)

print(df.shape)
print(df2.shape)

df_merged_word2vec = pd.concat([df, df2], axis=0).reset_index(drop=True)
df_merged_word2vec.fillna(0, inplace=True)
df_merged2_word2vec = pd.concat([test, train], axis=0).reset_index(drop=True)

print(df_merged_word2vec.shape)
print(df_merged2_word2vec.shape)


(1049, 2680)
(359, 1414)
(1408, 3308)
(1408, 3)


In [58]:
'''

Part 4 - Sentiment Classification Model

'''

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1: Classification Model for Bag_of_Words

y = df_merged2_bag['Sentiment'].to_numpy()
X = df_merged_bag


# Data Scaling
scale = StandardScaler()
scaled_X = scale.fit_transform(X)

x_bag_train, x_bag_test,  y_bag_train, y_bag_test = train_test_split(scaled_X, y, test_size = 0.3)

lc = LogisticRegression()
svc = SVC()
nbc = GaussianNB()
rfc = RandomForestClassifier()


lc.fit(x_bag_train, y_bag_train)
svc.fit(x_bag_train, y_bag_train)
nbc.fit(x_bag_train, y_bag_train)
rfc.fit(x_bag_train, y_bag_train)


In [59]:

y_lc_predicted = lc.predict(x_bag_test)
print(classification_report(y_bag_test, y_lc_predicted))
print(lc.score(x_bag_test,y_bag_test))

y_svc_predicted = svc.predict(x_bag_test)
print(classification_report(y_bag_test, y_svc_predicted))
print(svc.score(x_bag_test,y_bag_test))

y_nbc_predicted = nbc.predict(x_bag_test)
print(classification_report(y_bag_test, y_nbc_predicted))
print(nbc.score(x_bag_test,y_bag_test))

y_rfc_predicted = rfc.predict(x_bag_test)
print(classification_report(y_bag_test, y_rfc_predicted))
print(rfc.score(x_bag_test,y_bag_test))


              precision    recall  f1-score   support

           0       0.71      0.74      0.72       308
           1       0.20      0.17      0.19       115

    accuracy                           0.59       423
   macro avg       0.45      0.46      0.46       423
weighted avg       0.57      0.59      0.58       423

0.5886524822695035


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.73      1.00      0.84       308
           1       0.00      0.00      0.00       115

    accuracy                           0.73       423
   macro avg       0.36      0.50      0.42       423
weighted avg       0.53      0.73      0.61       423

0.7281323877068558
              precision    recall  f1-score   support

           0       0.70      0.55      0.62       308
           1       0.24      0.37      0.29       115

    accuracy                           0.50       423
   macro avg       0.47      0.46      0.45       423
weighted avg       0.58      0.50      0.53       423

0.5035460992907801
              precision    recall  f1-score   support

           0       0.73      0.94      0.82       308
           1       0.26      0.06      0.10       115

    accuracy                           0.70       423
   macro avg       0.49      0.50      0.46       423
weighted avg       0.60      0.70   

In [62]:
# 2: Classification Model for TF*IDF

y = df_merged2_tf_idf['Sentiment'].to_numpy()
X = df_merged_tf_idf


# Data Scaling
# scale = preprocessing.MinMaxScaler()
scale = StandardScaler()
scaled_X = scale.fit_transform(X)

x_tf_idf_train, x_tf_idf_test,  y_tf_idf_train, y_tf_idf_test = train_test_split(scaled_X, y, test_size = 0.3)

lc = LogisticRegression()
svc = SVC()
nbc = GaussianNB()
rfc = RandomForestClassifier()


lc.fit(x_tf_idf_train, y_tf_idf_train)
svc.fit(x_tf_idf_train, y_tf_idf_train)
nbc.fit(x_tf_idf_train, y_tf_idf_train)
rfc.fit(x_tf_idf_train, y_tf_idf_train)


In [63]:
y_lc_predicted = lc.predict(x_tf_idf_test)
print(classification_report(y_tf_idf_test, y_lc_predicted))
print(lc.score(x_tf_idf_test,y_tf_idf_test))

y_svc_predicted = svc.predict(x_tf_idf_test)
print(classification_report(y_tf_idf_test, y_svc_predicted))
print(svc.score(x_tf_idf_test,y_tf_idf_test))

y_nbc_predicted = nbc.predict(x_tf_idf_test)
print(classification_report(y_tf_idf_test, y_nbc_predicted))
print(nbc.score(x_tf_idf_test,y_tf_idf_test))

y_rfc_predicted = rfc.predict(x_tf_idf_test)
print(classification_report(y_tf_idf_test, y_rfc_predicted))
print(rfc.score(x_tf_idf_test,y_tf_idf_test))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74       298
           1       0.32      0.26      0.28       125

    accuracy                           0.62       423
   macro avg       0.51      0.51      0.51       423
weighted avg       0.59      0.62      0.60       423

0.6170212765957447


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.70      1.00      0.83       298
           1       0.00      0.00      0.00       125

    accuracy                           0.70       423
   macro avg       0.35      0.50      0.41       423
weighted avg       0.50      0.70      0.58       423

0.7044917257683215
              precision    recall  f1-score   support

           0       0.69      0.57      0.63       298
           1       0.27      0.38      0.32       125

    accuracy                           0.52       423
   macro avg       0.48      0.48      0.47       423
weighted avg       0.57      0.52      0.54       423

0.5177304964539007
              precision    recall  f1-score   support

           0       0.70      0.95      0.81       298
           1       0.21      0.03      0.06       125

    accuracy                           0.68       423
   macro avg       0.46      0.49      0.43       423
weighted avg       0.56      0.68   

In [64]:
# 2: Classification Model for Word2Vec


y = df_merged2_word2vec['Sentiment'].to_numpy()
X = df_merged_word2vec


# Data Scaling
# scale = preprocessing.MinMaxScaler()
scale = StandardScaler()
scaled_X = scale.fit_transform(X)

x_word2vec_train, x_word2vec_test,  y_word2vec_train, y_word2vec_test = train_test_split(scaled_X, y, test_size = 0.3)

lc = LogisticRegression()
svc = SVC()
nbc = GaussianNB()
rfc = RandomForestClassifier()


lc.fit(x_word2vec_train, y_word2vec_train)
svc.fit(x_word2vec_train, y_word2vec_train)
nbc.fit(x_word2vec_train, y_word2vec_train)
rfc.fit(x_word2vec_train, y_word2vec_train)


In [65]:
y_lc_predicted = lc.predict(x_word2vec_test)
print(classification_report(y_word2vec_test, y_lc_predicted))
print(lc.score(x_word2vec_test,y_word2vec_test))

y_svc_predicted = svc.predict(x_word2vec_test)
print(classification_report(y_word2vec_test, y_svc_predicted))
print(svc.score(x_word2vec_test,y_word2vec_test))

y_nbc_predicted = nbc.predict(x_word2vec_test)
print(classification_report(y_word2vec_test, y_nbc_predicted))
print(nbc.score(x_word2vec_test,y_word2vec_test))

y_rfc_predicted = rfc.predict(x_word2vec_test)
print(classification_report(y_word2vec_test, y_rfc_predicted))
print(rfc.score(x_word2vec_test,y_word2vec_test))

              precision    recall  f1-score   support

           0       0.75      0.82      0.78       301
           1       0.41      0.31      0.35       122

    accuracy                           0.67       423
   macro avg       0.58      0.56      0.57       423
weighted avg       0.65      0.67      0.66       423

0.6713947990543735


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.71      1.00      0.83       301
           1       0.00      0.00      0.00       122

    accuracy                           0.71       423
   macro avg       0.36      0.50      0.42       423
weighted avg       0.51      0.71      0.59       423

0.7115839243498818
              precision    recall  f1-score   support

           0       0.72      0.60      0.66       301
           1       0.30      0.42      0.35       122

    accuracy                           0.55       423
   macro avg       0.51      0.51      0.50       423
weighted avg       0.60      0.55      0.57       423

0.5508274231678487
              precision    recall  f1-score   support

           0       0.71      0.96      0.82       301
           1       0.32      0.05      0.09       122

    accuracy                           0.70       423
   macro avg       0.51      0.50      0.45       423
weighted avg       0.60      0.70   