In [1]:
#importing necessary packages and libraries
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from tqdm.notebook import tqdm, trange

import time
from time import sleep

import torch
from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision.utils import make_grid

import os

import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.cElementTree as et
from collections import defaultdict

In [None]:
#mounting google drive (where the dataset is stored)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device, torch.get_num_threads()

(device(type='cpu'), 1)

In [3]:
# reading the input files
training_df = pd.read_csv('training_df.csv')
validation_df = pd.read_csv('validation_df.csv')
testing_df = pd.read_csv('testing_df.csv')

In [4]:
training_df = training_df.fillna('')
validation_df = validation_df.fillna('')
testing_df = testing_df.fillna('')

In [23]:
# over- and under-sampling to help address class imbalance
positive_samples = training_df.loc[training_df["Ground_Truth"] == 1]
negative_samples = training_df.loc[training_df["Ground_Truth"] == 0]
training_df = pd.concat([positive_samples, positive_samples, negative_samples.sample(frac=0.4)], ignore_index=True)

Baseline 1: Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

Method A: Count Vectorizer

In [26]:
count_vec = CountVectorizer(binary=False, max_df=0.95)

In [27]:
count_vec.fit_transform(training_df['Comments'])

<45857x46667 sparse matrix of type '<class 'numpy.int64'>'
	with 1168126 stored elements in Compressed Sparse Row format>

In [28]:
train_feature_set=count_vec.transform(training_df['Comments'].values)
val_feature_set=count_vec.transform(validation_df['Comments'].values)
test_feature_set=count_vec.transform(testing_df['Comments'].values)

In [29]:
X_train = train_feature_set
X_val = val_feature_set
X_test = test_feature_set

In [30]:
Y_train = training_df['Ground_Truth'].values
Y_val = validation_df['Ground_Truth'].values
Y_test = testing_df['Ground_Truth'].values

In [31]:
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=500)
log_reg_model = log_reg.fit(X_train,Y_train)

[LibLinear]

In [32]:
Y_pred = log_reg_model.predict(X_val)

In [33]:
Y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [34]:
from sklearn.metrics import f1_score

In [35]:
f1_score(Y_val, Y_pred)

0.3076923076923077

Method B: TF-IDF Vectorizer

In [None]:
tfidf_vec = TfidfVectorizer(use_idf=True, max_df=0.95)

In [None]:
tfidf_vec.fit_transform(training_df['Comments'])

<45857x47259 sparse matrix of type '<class 'numpy.float64'>'
	with 1170381 stored elements in Compressed Sparse Row format>

In [None]:
train_feature_set=tfidf_vec.transform(training_df['Comments'].values)
val_feature_set=tfidf_vec.transform(validation_df['Comments'].values)
test_feature_set=tfidf_vec.transform(testing_df['Comments'].values)

In [None]:
X_train = train_feature_set
X_val = val_feature_set
X_test = test_feature_set

In [None]:
Y_train = training_df['Ground_Truth'].values
Y_val = validation_df['Ground_Truth'].values
Y_test = testing_df['Ground_Truth'].values

In [None]:
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=1, penalty='l2',max_iter=500)
log_reg_model = log_reg.fit(X_train,Y_train)

[LibLinear]

In [None]:
Y_pred = log_reg_model.predict(X_val)

In [None]:
Y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
f1_score(Y_val, Y_pred)

0.36011504099865377

Preprocessing:


*   Removal of Punctuation
*   Converting to lowercase
*   Tokenization
*   Stop Word Removal
*   Stemming
*   Lemmatization







In [None]:
import string
import re
import nltk
nltk.download("stopwords")
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_punctuation(text):
  punc_text="".join([i for i in text if i not in string.punctuation])
  return punc_text

In [None]:
def convert_to_lower(text):
  return text.lower()

In [None]:
def tokenize(text):
  tokens = re.split('W+',text)
  return tokens

In [None]:
def remove_stop_words(text):
  clean_text = [i for i in text if i not in stopwords]
  return clean_text

In [None]:
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
def stem_text(text):
  stem_text = [porter_stemmer.stem(word) for word in text]
  return stem_text

In [None]:
def lemmatize_text(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word, 'v') for word in text]
  return lemm_text

In [None]:
def list_to_string(text):
  string_text = ' '.join(map(str,text))
  return string_text

In [None]:
training_df['Comments']=training_df['Comments'].apply(lambda x:remove_punctuation(x))
validation_df['Comments']=validation_df['Comments'].apply(lambda x:remove_punctuation(x))
testing_df['Comments']=testing_df['Comments'].apply(lambda x:remove_punctuation(x))

In [None]:
training_df['Comments']=training_df['Comments'].apply(lambda x:convert_to_lower(x))
validation_df['Comments']=validation_df['Comments'].apply(lambda x:convert_to_lower(x))
testing_df['Comments']=testing_df['Comments'].apply(lambda x:convert_to_lower(x))

In [None]:
training_df['Comments']=training_df['Comments'].apply(lambda x:tokenize(x))
validation_df['Comments']=validation_df['Comments'].apply(lambda x:tokenize(x))
testing_df['Comments']=testing_df['Comments'].apply(lambda x:tokenize(x))

In [None]:
training_df['Comments']=training_df['Comments'].apply(lambda x:remove_stop_words(x))
validation_df['Comments']=validation_df['Comments'].apply(lambda x:remove_stop_words(x))
testing_df['Comments']=testing_df['Comments'].apply(lambda x:remove_stop_words(x))

In [None]:
#training_df['Comments']=training_df['Comments'].apply(lambda x:stem_text(x))
#validation_df['Comments']=validation_df['Comments'].apply(lambda x:stem_text(x))
#testing_df['Comments']=testing_df['Comments'].apply(lambda x:stem_text(x))

In [None]:
training_df['Comments']=training_df['Comments'].apply(lambda x:lemmatize_text(x))
validation_df['Comments']=validation_df['Comments'].apply(lambda x:lemmatize_text(x))
testing_df['Comments']=testing_df['Comments'].apply(lambda x:lemmatize_text(x))

In [None]:
training_df['Comments']=training_df['Comments'].apply(lambda x:list_to_string(x))
validation_df['Comments']=validation_df['Comments'].apply(lambda x:list_to_string(x))
testing_df['Comments']=testing_df['Comments'].apply(lambda x:list_to_string(x))

In [None]:
count_vec = CountVectorizer(binary=False, max_df=0.95)

In [None]:
count_vec.fit_transform(training_df['Comments'])
X_train=count_vec.transform(training_df['Comments'].values)
X_val=count_vec.transform(validation_df['Comments'].values)
X_test=count_vec.transform(testing_df['Comments'].values)
Y_train = training_df['Ground_Truth'].values
Y_val = validation_df['Ground_Truth'].values
Y_test = testing_df['Ground_Truth'].values

In [None]:
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=1, penalty='l2',max_iter=500)
log_reg_model = log_reg.fit(X_train,Y_train)
Y_pred = log_reg_model.predict(X_val)
recall_score(Y_val, Y_pred, average='weighted')

[LibLinear]

0.900723403497736

In [None]:
f1_score(Y_val, Y_pred, average='weighted')

0.8765157846929039

In [None]:
tfidf_vec = TfidfVectorizer(use_idf=True, max_df=0.95)

In [None]:
tfidf_vec.fit_transform(training_df['Comments'])
X_train=tfidf_vec.transform(training_df['Comments'].values)
X_val=tfidf_vec.transform(validation_df['Comments'].values)
X_test=tfidf_vec.transform(testing_df['Comments'].values)
Y_train = training_df['Ground_Truth'].values
Y_val = validation_df['Ground_Truth'].values
Y_test = testing_df['Ground_Truth'].values

In [None]:
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=1, penalty='l2',max_iter=500)
log_reg_model = log_reg.fit(X_train,Y_train)
Y_pred = log_reg_model.predict(X_val)

[LibLinear]

In [None]:
f1_score(Y_val, Y_pred)

0.3617266012834484