In [None]:
import pandas as pd
import numpy as np

# Read the two CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the first rows of each frame.
for preview in (train.head(), test.head()):
    print(preview)

# Row/column positions of missing values in each frame (empty arrays = none).
for frame in (train, test):
    print(np.where(frame.isnull()))

# (rows, columns) of each frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
'''

Part 2 - Text Preprocessing

'''

import re  # Python regular expression library

# Ordered regex clean-up steps applied to the 'Text' column of both frames,
# as (compiled pattern, replacement) pairs.
# Patterns are raw strings so `\S` / `\s` reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes.
# Order matters: URLs and @usernames are removed before punctuation is blanked
# out, because that last step would destroy the ':' '/' and '@' they rely on.
TEXT_CLEANING_STEPS = [
    (re.compile(r'[0-9]'), ''),           # 2: remove digits
    (re.compile(r'http\S+'), ' '),        # 3: remove urls
    (re.compile(r'@[^\s]+'), ''),         # 4: remove usernames
    (re.compile(r"[^a-z0-9<>']"), ' '),   # 5: remove special characters and punctuation
]

# 1: lower-casing
train['Text'] = train['Text'].str.lower()
print(train.head())

test['Text'] = test['Text'].str.lower()
print(test.head())

# 2-5: apply every regex step to both frames, trimming surrounding whitespace
# after each one. The vectorised `.str` accessor propagates missing values
# instead of raising TypeError the way a per-row `re.sub` call does.
for pattern, replacement in TEXT_CLEANING_STEPS:
    train['Text'] = train['Text'].str.replace(pattern, replacement, regex=True).str.strip()
    test['Text'] = test['Text'].str.replace(pattern, replacement, regex=True).str.strip()
    print(train.head())
    print(test.head())



In [None]:
# 6: Stemming
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer for English; kept as an alternative normaliser but not
# used by stem_sentences below.
stemmer = SnowballStemmer("english")

# Fetch the WordNet corpus before building the lemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def stem_sentences(sentence):
    """Split ``sentence`` on whitespace and normalise every token.

    Despite the name, tokens are lemmatized with the module-level
    ``lemmatizer`` (WordNet), not stemmed with ``stemmer``.

    Returns:
        list: the lemmatized tokens, in their original order.
    """
    return list(map(lemmatizer.lemmatize, sentence.split()))

# Get sample size
#
# Down-sample train BEFORE lemmatizing: `sample` picks rows by position from
# the seeded random state, so the selected rows are the same as when sampling
# afterwards, but only the retained 0.1% of comments is lemmatized.
train = train.sample(frac=0.001, random_state=1)
# test = test.sample(frac=0.005, random_state=1)

# Replace each comment string by its list of lemmatized tokens.
# `assign` returns a new frame, so nothing is written into the sampled slice.
train = train.assign(Text=train['Text'].apply(stem_sentences))
test['Text'] = test['Text'].apply(stem_sentences)

print(train.head())
print(test.head())



In [None]:
'''

Part 3 - Linguistic Feature Extraction

'''

# 1. Bag of Words (train.csv)
import numpy as np

# Corpus-wide frequency of every token. Dict insertion order (order of first
# appearance) defines the vocabulary and therefore the column order.
wordCount = {}
for tokens in train['Text']:
    for word in tokens:
        wordCount[word] = wordCount.get(word, 0) + 1

unique_words = list(wordCount.keys())

# word -> column position, so each token is counted with a single dict lookup
# instead of a scan over the whole vocabulary.
word_to_column = {word: column for column, word in enumerate(unique_words)}

# One count vector per comment, stored as a plain list of floats.
bag_of_words = []
for tokens in train['Text']:
    bag_vector = np.zeros(len(unique_words))
    for word in tokens:
        bag_vector[word_to_column[word]] += 1
    bag_of_words.append(bag_vector.tolist())


In [None]:
# 1. Bag of Words (test.csv)
import numpy as np

# Corpus-wide frequency of every token in the test comments. Dict insertion
# order (order of first appearance) defines the test vocabulary/column order.
wordCount = {}
for tokens in test['Text']:
    for word in tokens:
        wordCount[word] = wordCount.get(word, 0) + 1

unique_words2 = list(wordCount.keys())

# Preview only: printing the full vocabulary floods the cell output.
print(unique_words2[:20])
print(len(unique_words2))

# word -> column position, so each token is counted with a single dict lookup
# instead of a scan over the whole vocabulary.
word_to_column = {word: column for column, word in enumerate(unique_words2)}

# One count vector per comment, stored as a plain list of floats.
bag_of_words_test = []
for tokens in test['Text']:
    bag_vector = np.zeros(len(unique_words2))
    for word in tokens:
        bag_vector[word_to_column[word]] += 1
    bag_of_words_test.append(bag_vector.tolist())

# Report the matrix dimensions (comments, vocabulary size) rather than dumping
# every row of the dense matrix.
print((len(bag_of_words_test), len(unique_words2)))

In [None]:
# 2. TF*IDF (train.csv)

# Term frequency: occurrences of the word in a comment / number of words in that comment

num_comments = len(train['Text'])
train_list = train['Text'].tolist()

# word -> column position in the `unique_words` vocabulary
word_to_column = {w: column for column, w in enumerate(unique_words)}

# Accumulate into a NumPy array and wrap it in a DataFrame once. Writing single
# cells through chained indexing (`df[w][i] = ...`) is slow and is not
# guaranteed to modify the frame (it never does under pandas Copy-on-Write).
tf_matrix = np.zeros((num_comments, len(unique_words)))
for i, tokens in enumerate(train_list):
    for w in tokens:
        # each occurrence contributes 1/len(comment); empty comments stay all-zero
        tf_matrix[i, word_to_column[w]] += 1 / len(tokens)

df_tf = pd.DataFrame(tf_matrix, columns=unique_words)


# Inverse Document Frequency: log10(number of comments / number of comments containing the word)

# Count each word once per comment (set), in a single pass over the corpus.
document_frequency = dict.fromkeys(unique_words, 0)
for tokens in train_list:
    for w in set(tokens):
        document_frequency[w] += 1

idf = {w: np.log10(num_comments / document_frequency[w]) for w in unique_words}


# TF*IDF = term frequency * inverse document frequency

# Scale every column by its word's IDF weight; the Series is indexed by word
# so the multiplication aligns on column labels.
idf_weights = pd.Series(idf, index=unique_words, dtype=float)
df_tf_idf = df_tf.mul(idf_weights, axis="columns")



In [None]:
# 2. TF*IDF (test.csv)

# Term frequency: occurrences of the word in a comment / number of words in that comment

num_comments = len(test['Text'])
test_list = test['Text'].tolist()

# word -> column position in the `unique_words2` vocabulary
word_to_column = {w: column for column, w in enumerate(unique_words2)}

# Accumulate into a NumPy array and wrap it in a DataFrame once. Writing single
# cells through chained indexing (`df[w][i] = ...`) is slow and is not
# guaranteed to modify the frame (it never does under pandas Copy-on-Write).
tf_matrix = np.zeros((num_comments, len(unique_words2)))
for i, tokens in enumerate(test_list):
    for w in tokens:
        # each occurrence contributes 1/len(comment); empty comments stay all-zero
        tf_matrix[i, word_to_column[w]] += 1 / len(tokens)

df_tf2 = pd.DataFrame(tf_matrix, columns=unique_words2)


# Inverse Document Frequency: log10(number of comments / number of comments containing the word)

# Count each word once per comment (set), in a single pass over the corpus.
document_frequency = dict.fromkeys(unique_words2, 0)
for tokens in test_list:
    for w in set(tokens):
        document_frequency[w] += 1

idf2 = {w: np.log10(num_comments / document_frequency[w]) for w in unique_words2}


# TF*IDF = term frequency * inverse document frequency

# Scale every column by its word's IDF weight; the Series is indexed by word
# so the multiplication aligns on column labels.
idf_weights = pd.Series(idf2, index=unique_words2, dtype=float)
df_tf_idf2 = df_tf2.mul(idf_weights, axis="columns")



In [None]:
# 3: Word2Vec (train.csv)
import gensim
from gensim.models import Word2Vec

train_list = train['Text'].tolist()

# Train 100-dimensional embeddings on the tokenised train comments and persist them.
model = Word2Vec(train_list, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")


print(model)


vocab = model.wv.key_to_index.keys()

# Collapse each word's embedding to one number (sum of its components) once,
# instead of re-summing the vector for every sentence.
word_score = {word: sum(model.wv[word]) for word in vocab}

# One row per comment with one column per vocabulary word: the word's score if
# the word occurs in the comment, else 0.
# Columns follow `unique_words` (order of first appearance), NOT the model's
# own vocab order, because the merge cell labels these columns with
# `unique_words`; building the rows in vocab order would mislabel them.
word_model_train = []
for sentence in train_list:
    present = set(sentence)  # O(1) membership tests
    word_model_train.append(
        [word_score.get(word, 0) if word in present else 0 for word in unique_words]
    )

print(len(word_model_train[0]))
print(len(train_list))



In [None]:
# 3: Word2Vec (test.csv)
import gensim
from gensim.models import Word2Vec

test_list = test['Text'].tolist()

# Train 100-dimensional embeddings on the tokenised test comments and persist them.
model = Word2Vec(test_list, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vectest.model")

print(model)

vocab = model.wv.key_to_index.keys()

# Collapse each word's embedding to one number (sum of its components) once,
# instead of re-summing the vector for every sentence.
word_score = {word: sum(model.wv[word]) for word in vocab}

# One row per comment with one column per vocabulary word: the word's score if
# the word occurs in the comment, else 0.
# Columns follow `unique_words2` (order of first appearance), NOT the model's
# own vocab order, because the merge cell labels these columns with
# `unique_words2`; building the rows in vocab order would mislabel them.
word_model_test = []
for sentence in test_list:
    present = set(sentence)  # O(1) membership tests
    word_model_test.append(
        [word_score.get(word, 0) if word in present else 0 for word in unique_words2]
    )


In [None]:
# Combine the Bag-of-Words features into one dataframe for model training
df = pd.DataFrame(bag_of_words, columns = unique_words)
df2 = pd.DataFrame(bag_of_words_test, columns = unique_words2)

# Feature rows: train first, then test. Columns are the union of both
# vocabularies; a word absent from one corpus becomes NaN there, i.e. count 0.
df_merged_bag = pd.concat([df, df2], axis=0).reset_index(drop=True).fillna(0)

# Label rows must be stacked in the SAME train-then-test order as the features;
# stacking them test-first pairs every feature row with another row's label.
df_merged2_bag = pd.concat([train, test], axis=0).reset_index(drop=True)

assert len(df_merged_bag) == len(df_merged2_bag), "feature/label row counts differ"

# print(df_merged_bag.shape)
# print(df_merged2_bag.shape)


In [None]:
# Combine the TF*IDF features into one dataframe for model training
df = pd.DataFrame(df_tf_idf, columns = unique_words)
df2 = pd.DataFrame(df_tf_idf2, columns = unique_words2)

# Feature rows: train first, then test. Columns are the union of both
# vocabularies; a word absent from one corpus becomes NaN there, i.e. weight 0.
df_merged_tf_idf = pd.concat([df, df2], axis=0).reset_index(drop=True).fillna(0)

# Label rows must be stacked in the SAME train-then-test order as the features;
# stacking them test-first pairs every feature row with another row's label.
df_merged2_tf_idf = pd.concat([train, test], axis=0).reset_index(drop=True)

assert len(df_merged_tf_idf) == len(df_merged2_tf_idf), "feature/label row counts differ"


In [None]:
# Combine the Word2Vec features into one dataframe for model training
df = pd.DataFrame(word_model_train, columns = unique_words)
df2 = pd.DataFrame(word_model_test, columns = unique_words2)

print(df.shape)
print(df2.shape)

# Feature rows: train first, then test. Columns are the union of both
# vocabularies; a word absent from one corpus becomes NaN there and is set to 0.
df_merged_word2vec = pd.concat([df, df2], axis=0).reset_index(drop=True).fillna(0)

# Label rows must be stacked in the SAME train-then-test order as the features;
# stacking them test-first pairs every feature row with another row's label.
df_merged2_word2vec = pd.concat([train, test], axis=0).reset_index(drop=True)

assert len(df_merged_word2vec) == len(df_merged2_word2vec), "feature/label row counts differ"

print(df_merged_word2vec.shape)
print(df_merged2_word2vec.shape)


In [None]:
'''

Part 4 - Sentiment Classification Model

'''

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1: Classification Model for Bag_of_Words

y = df_merged2_bag['Sentiment'].to_numpy()
X = df_merged_bag

# Hold out 30% of the rows for evaluation. Split BEFORE scaling and fix the
# seed so the reported scores are reproducible between runs.
x_bag_train, x_bag_test, y_bag_train, y_bag_test = train_test_split(
    X, y, test_size = 0.3, random_state = 1
)

# Data Scaling: learn mean/variance from the training rows only, then apply the
# same transform to the held-out rows. Fitting the scaler on all rows would
# leak held-out statistics into training.
scale = StandardScaler()
x_bag_train = scale.fit_transform(x_bag_train)
x_bag_test = scale.transform(x_bag_test)

lc = LogisticRegression()
svc = SVC()
nbc = GaussianNB()
rfc = RandomForestClassifier(random_state = 1)  # seeded: forests are randomised


lc.fit(x_bag_train, y_bag_train)
svc.fit(x_bag_train, y_bag_train)
nbc.fit(x_bag_train, y_bag_train)
rfc.fit(x_bag_train, y_bag_train)


In [None]:

# Predict the held-out Bag-of-Words rows with every fitted classifier.
y_lc_predicted = lc.predict(x_bag_test)
y_svc_predicted = svc.predict(x_bag_test)
y_nbc_predicted = nbc.predict(x_bag_test)
y_rfc_predicted = rfc.predict(x_bag_test)

# For each classifier: the classification report, then its score on the held-out rows.
for fitted_model, predictions in (
    (lc, y_lc_predicted),
    (svc, y_svc_predicted),
    (nbc, y_nbc_predicted),
    (rfc, y_rfc_predicted),
):
    print(classification_report(y_bag_test, predictions))
    print(fitted_model.score(x_bag_test, y_bag_test))


In [None]:
# 2: Classification Model for TF*IDF

y = df_merged2_tf_idf['Sentiment'].to_numpy()
X = df_merged_tf_idf

# Hold out 30% of the rows for evaluation. Split BEFORE scaling and fix the
# seed so the reported scores are reproducible between runs.
x_tf_idf_train, x_tf_idf_test, y_tf_idf_train, y_tf_idf_test = train_test_split(
    X, y, test_size = 0.3, random_state = 1
)

# Data Scaling: learn mean/variance from the training rows only, then apply the
# same transform to the held-out rows. Fitting the scaler on all rows would
# leak held-out statistics into training.
scale = StandardScaler()
x_tf_idf_train = scale.fit_transform(x_tf_idf_train)
x_tf_idf_test = scale.transform(x_tf_idf_test)

lc = LogisticRegression()
svc = SVC()
nbc = GaussianNB()
rfc = RandomForestClassifier(random_state = 1)  # seeded: forests are randomised


lc.fit(x_tf_idf_train, y_tf_idf_train)
svc.fit(x_tf_idf_train, y_tf_idf_train)
nbc.fit(x_tf_idf_train, y_tf_idf_train)
rfc.fit(x_tf_idf_train, y_tf_idf_train)


In [None]:
# Predict the held-out TF*IDF rows with every fitted classifier.
y_lc_predicted = lc.predict(x_tf_idf_test)
y_svc_predicted = svc.predict(x_tf_idf_test)
y_nbc_predicted = nbc.predict(x_tf_idf_test)
y_rfc_predicted = rfc.predict(x_tf_idf_test)

# For each classifier: the classification report, then its score on the held-out rows.
for fitted_model, predictions in (
    (lc, y_lc_predicted),
    (svc, y_svc_predicted),
    (nbc, y_nbc_predicted),
    (rfc, y_rfc_predicted),
):
    print(classification_report(y_tf_idf_test, predictions))
    print(fitted_model.score(x_tf_idf_test, y_tf_idf_test))

In [None]:
# 3: Classification Model for Word2Vec


y = df_merged2_word2vec['Sentiment'].to_numpy()
X = df_merged_word2vec

# Hold out 30% of the rows for evaluation. Split BEFORE scaling and fix the
# seed so the reported scores are reproducible between runs.
x_word2vec_train, x_word2vec_test, y_word2vec_train, y_word2vec_test = train_test_split(
    X, y, test_size = 0.3, random_state = 1
)

# Data Scaling: learn mean/variance from the training rows only, then apply the
# same transform to the held-out rows. Fitting the scaler on all rows would
# leak held-out statistics into training.
scale = StandardScaler()
x_word2vec_train = scale.fit_transform(x_word2vec_train)
x_word2vec_test = scale.transform(x_word2vec_test)

lc = LogisticRegression()
svc = SVC()
nbc = GaussianNB()
rfc = RandomForestClassifier(random_state = 1)  # seeded: forests are randomised


lc.fit(x_word2vec_train, y_word2vec_train)
svc.fit(x_word2vec_train, y_word2vec_train)
nbc.fit(x_word2vec_train, y_word2vec_train)
rfc.fit(x_word2vec_train, y_word2vec_train)


In [None]:
# Predict the held-out Word2Vec rows with every fitted classifier.
y_lc_predicted = lc.predict(x_word2vec_test)
y_svc_predicted = svc.predict(x_word2vec_test)
y_nbc_predicted = nbc.predict(x_word2vec_test)
y_rfc_predicted = rfc.predict(x_word2vec_test)

# For each classifier: the classification report, then its score on the held-out rows.
for fitted_model, predictions in (
    (lc, y_lc_predicted),
    (svc, y_svc_predicted),
    (nbc, y_nbc_predicted),
    (rfc, y_rfc_predicted),
):
    print(classification_report(y_word2vec_test, predictions))
    print(fitted_model.score(x_word2vec_test, y_word2vec_test))