In [None]:
import pandas as pd
import numpy as np

# Read both CSV files from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Preview the first rows of each frame.
for frame in (train, test):
    print(frame.head())

# Row/column positions of missing values; two empty arrays mean nothing is missing.
for frame in (train, test):
    print(np.where(frame.isnull()))

# (rows, columns) of each frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
'''

Part 2 - Text Preprocessing

'''


import re  # Python regular expression library

# 1: lower-casing
train['Text'] = train['Text'].str.lower()
print(train.head())

test['Text'] = test['Text'].str.lower()
print(test.head())

# Remaining cleaning steps, applied in this order to both frames.
# Each entry is (compiled pattern, replacement). The patterns are raw strings so
# that `\S` and `\s` reach the regex engine as written instead of being parsed
# as (invalid) string escape sequences.
TEXT_CLEANING_STEPS = [
    # 2: remove digits
    (re.compile(r'[0-9]'), ''),
    # 3: remove URLs (from "http" up to the next whitespace)
    (re.compile(r'http\S+'), ' '),
    # 4: remove @usernames
    (re.compile(r'@[^\s]+'), ''),
    # 5: replace special characters and punctuation with a space,
    #    keeping only a-z, 0-9, <, > and the apostrophe
    (re.compile(r"[^a-z0-9<>']"), ' '),
]


def _substitute(series, pattern, replacement):
    """Return `series` with `pattern` replaced by `replacement` in every string.

    Leading and trailing whitespace is stripped from each result.
    """
    return series.apply(lambda text: pattern.sub(replacement, text).strip())


for pattern, replacement in TEXT_CLEANING_STEPS:
    train['Text'] = _substitute(train['Text'], pattern, replacement)
    test['Text'] = _substitute(test['Text'], pattern, replacement)
    # Show the effect of this step on both frames.
    print(train.head())
    print(test.head())



In [None]:
# Stemming
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer; this is the object stem_sentences() calls.
stemmer = SnowballStemmer("english")

# WordNet lemmatizer instance. It is not referenced anywhere else in the
# visible part of this notebook.
lemmatizer = WordNetLemmatizer()

def stem_sentences(sentence):
    """Split `sentence` on whitespace and stem every piece.

    Relies on the module-level `stemmer`. The result is a list of stemmed
    tokens (not a re-joined string), in the original token order.
    """
    return [stemmer.stem(word) for word in sentence.split()]

# Replace each comment string with its list of stemmed tokens.
train['Text'] = train['Text'].apply(stem_sentences)
test['Text'] = test['Text'].apply(stem_sentences)

for frame in (train, test):
    print(frame.head())


# Get sample size: keep a 1% random sample of each frame; the fixed
# random_state makes the selection repeatable.

train = train.sample(frac=0.01, random_state=1)
test = test.sample(frac=0.01, random_state=1)



In [None]:
'''

Part 3 - Linguistic Feature Extraction

'''

# 1. Bag of Words (train.csv)
import numpy as np

# Corpus-wide count of every token in the training comments.
wordCount = {}
for tokens in train['Text']:
    for word in tokens:
        wordCount[word] = wordCount.get(word, 0) + 1

# Vocabulary in first-seen order; position j here is column j of every vector below.
unique_words = list(wordCount.keys())

print(unique_words)
print(len(unique_words))

# word -> column index, so each token is placed with a single dict lookup
# instead of scanning the whole vocabulary once per token.
word_to_column = {word: column for column, word in enumerate(unique_words)}

bag_of_words = []

for tokens in train['Text']:
    # One count vector per comment, as a plain list of floats.
    bag_vector = np.zeros(len(unique_words))
    for word in tokens:
        bag_vector[word_to_column[word]] += 1
    bag_of_words.append(bag_vector.tolist())


print(bag_of_words)


In [None]:
# 1. Bag of Words (test.csv)
import numpy as np

# Corpus-wide count of every token in the test comments.
# Note: this rebinds `wordCount`, discarding the training counts.
wordCount = {}
for tokens in test['Text']:
    for word in tokens:
        wordCount[word] = wordCount.get(word, 0) + 1

# Test-only vocabulary in first-seen order.
unique_words2 = list(wordCount.keys())

print(unique_words2)
print(len(unique_words2))

# word -> column index, so each token is placed with a single dict lookup
# instead of scanning the whole vocabulary once per token.
test_word_to_column = {word: column for column, word in enumerate(unique_words2)}

bag_of_words_test = []

# NOTE(review): these vectors are indexed by the test-only vocabulary
# (`unique_words2`), so their columns do not line up with `bag_of_words`, which
# is indexed by `unique_words`. They cannot be passed as-is to a model fitted
# on `bag_of_words` -- confirm the intended use before relying on them.
for tokens in test['Text']:
    bag_vector = np.zeros(len(unique_words2))
    for word in tokens:
        bag_vector[test_word_to_column[word]] += 1
    bag_of_words_test.append(bag_vector.tolist())


print(bag_of_words_test)

In [None]:
# 2. TF*IDF (train.csv)

# Find term-frequency: num of that word in sentence/ number of words in sentence

num_comments = len(train['Text'])
train_list = train['Text'].tolist()

print(train_list)

# Term frequency: occurrences of a word in a comment / number of tokens in that comment.
# Values are accumulated in a NumPy array and wrapped in a DataFrame once.
# The previous chained form `df_tf[w][i] = ...` is not guaranteed by pandas to
# write through to the frame, and never does under Copy-on-Write.
tf_column = {w: j for j, w in enumerate(unique_words)}
tf_values = np.zeros((num_comments, len(unique_words)))

for i, tokens in enumerate(train_list):
    for w in tokens:
        tf_values[i, tf_column[w]] += 1 / len(tokens)

df_tf = pd.DataFrame(tf_values, columns=unique_words)

print("Term Frequency: \n", df_tf)


# Inverse document frequency: log10(number of comments / number of comments containing the word).
# Document frequency is counted in one pass; set(tokens) counts each comment at most once per word.
document_frequency = dict.fromkeys(unique_words, 0)
for tokens in train_list:
    for w in set(tokens):
        document_frequency[w] += 1

idf = {w: np.log10(num_comments / document_frequency[w]) for w in unique_words}

print("IDF of: \n", idf)

# TF*IDF = term frequency * inverse document frequency.
# Each column of df_tf is scaled by that word's IDF weight.
idf_weights = pd.Series(idf, index=unique_words)
df_tf_idf = df_tf.mul(idf_weights, axis='columns')


print("TF*DF of: \n", df_tf_idf)



In [None]:
# 2. TF*IDF (test.csv)

# Find term-frequency: num of that word/ number of words in sentence


# Note: this rebinds `num_comments` to the number of test comments.
num_comments = len(test['Text'])
test_list = test['Text'].tolist()

print(test_list)

# Term frequency: occurrences of a word in a comment / number of tokens in that comment.
# Values are accumulated in a NumPy array and wrapped in a DataFrame once.
# The previous chained form `df_tf2[w][i] = ...` is not guaranteed by pandas to
# write through to the frame, and never does under Copy-on-Write.
tf_column2 = {w: j for j, w in enumerate(unique_words2)}
tf_values2 = np.zeros((num_comments, len(unique_words2)))

for i, tokens in enumerate(test_list):
    for w in tokens:
        tf_values2[i, tf_column2[w]] += 1 / len(tokens)

df_tf2 = pd.DataFrame(tf_values2, columns=unique_words2)

print("Term Frequency: \n", df_tf2)


# Inverse document frequency: log10(number of comments / number of comments containing the word).
# NOTE(review): these weights are estimated from the test comments over the
# test-only vocabulary, so neither the columns nor the weights match the
# training TF-IDF frame -- confirm this is intended before modelling with both.
document_frequency2 = dict.fromkeys(unique_words2, 0)
for tokens in test_list:
    for w in set(tokens):
        document_frequency2[w] += 1

idf2 = {w: np.log10(num_comments / document_frequency2[w]) for w in unique_words2}

print("IDF of: \n", idf2)

# TF*IDF = term frequency * inverse document frequency.
# Each column of df_tf2 is scaled by that word's IDF weight.
idf_weights2 = pd.Series(idf2, index=unique_words2)
df_tf_idf2 = df_tf2.mul(idf_weights2, axis='columns')


print("TF*DF of: \n", df_tf_idf2)



In [None]:
# 3: Word2Vec (train.csv)
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Training comments as a list of token lists.
train_list = train['Text'].tolist()

# Fit a Word2Vec model on the training comments and persist it to disk.
model = Word2Vec(
    train_list,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

print(model)


In [None]:
# 3: Word2Vec (test.csv)
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Test comments as a list of token lists.
test_list = test['Text'].tolist()

# Fit a separate Word2Vec model on the test comments.
# NOTE(review): this rebinds `model` and saves to the same "word2vec.model"
# path used by the training cell, so the training model is replaced both in
# memory and on disk -- confirm that is intended.
model = Word2Vec(
    test_list,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

print(model)

In [None]:
'''

Part 4 - Sentiment Classification Model

'''

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report



# train.csv for bag_of_words
x_bag_train = bag_of_words
print(bag_of_words)
y_bag_train = np.array(train['Sentiment'])
print(np.array(train['Sentiment']))

# test.csv for bag_of_words
# The test comments are counted over the *training* vocabulary (`unique_words`)
# so every test vector has the same columns, in the same order, as x_bag_train.
# Tokens that never occurred in training have no column and are skipped.
train_word_to_column = {word: column for column, word in enumerate(unique_words)}
x_bag_test = []
for tokens in test['Text']:
    test_vector = np.zeros(len(unique_words))
    for token in tokens:
        column = train_word_to_column.get(token)
        if column is not None:
            test_vector[column] += 1
    x_bag_test.append(test_vector.tolist())

# Assumes test.csv carries a 'Sentiment' label column like train.csv -- TODO confirm.
y_bag_test = np.array(test['Sentiment'])


lc = LogisticRegression()
# svc = SVC(probability=True)
# nbc = GaussianNB()
# rfc = RandomForestClassifier()

# Fit the logistic-regression model so the evaluation cell can call lc.predict().
lc.fit(x_bag_train, y_bag_train)
# svc.fit(x_bag_train, y_bag_train)
# nbc.fit(x_bag_train, y_bag_train)
# rfc.fit(x_bag_train, y_bag_train)


In [None]:

# Score the logistic-regression model on the bag-of-words test features.
# Requires an earlier cell to have fitted `lc` and defined `x_bag_test` / `y_bag_test`.
y_lc_predicted = lc.predict(x_bag_test)
lc_report = classification_report(y_bag_test, y_lc_predicted)
print(lc_report)