In [None]:
import math
import string
import re

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter

import pandas as pd 
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
death_row_fn = 'gdrive/My Drive/COEN140/group-project/data/Last-Statement-of-Death-Row.csv'
suicide_depression_fn = 'gdrive/My Drive/COEN140/group-project/data/reddit_depression_suicidewatch.csv'
hate_fn = 'gdrive/My Drive/COEN140/group-project/data/Dynamically_Generated_Hate_Dataset_v0.2.3.csv'
sentiment_fn = 'gdrive/My Drive/COEN140/group-project/data/sentiment.csv'
output_fn = 'gdrive/My Drive/COEN140/group-project/data/output.dat'

# Data Cleaning and Extraction 

Class labels are 

0 - Neutral 

1 - Hate

2 - Depression

3 - Suicidal

Clean the death row last statements csv 

In [None]:
# extract and rename last statement feature
deathrow_df = pd.read_csv(death_row_fn)
deathrow_df['text'] = deathrow_df['last_statement']
deathrow_df.drop(deathrow_df.columns.difference(['text']), axis = 1, inplace = True)

# label all samples as suicidal
deathrow_df['class'] = 3

# filter out all of the samples with no statement given 
deathrow_df = deathrow_df[~deathrow_df['text'].isin(["No statement given."])]
deathrow_df

Unnamed: 0,text,class
0,To my friends and family it was a nice journey...,3
1,"Yes Sir, I would like to thank the Shape Commu...",3
2,Yes Sir. Dear Heavenly Father please forgive t...,3
3,I am very thankful for all the hard work the M...,3
5,"Thank you I love you all. Sandra, nice meeting...",3
...,...,...
554,I pray that my family will rejoice and will fo...,3
555,"When asked if he had a last statement, he repl...",3
556,What is about to transpire in a few moments is...,3
557,This offender declined to make a last statement.,3


In [None]:
# extract text and class labels 
suicide_df = pd.read_csv(suicide_depression_fn)

# numerically represent class labels 
suicide_df['class'] = [2 if cls == "depression" else 3 for cls in suicide_df['label']] 
suicide_df.drop('label', axis=1, inplace=True)

suicide_df

Unnamed: 0,text,class
0,I recently went through a breakup and she said...,2
1,"I do not know how to navigate these feelings, ...",2
2,"So I have been with my bf for 5 months , and h...",2
3,I am so exhausted of this. Just when I think I...,3
4,I have been severly bullied since i was 5 till...,2
...,...,...
20358,I took 50mg of seroquel a few hours after I dr...,3
20359,that is what has happened to me last week. And...,2
20360,Ever just feel alone in a house full of people...,2
20361,Politicians. Neighbors. Corporations. Society....,2


In [None]:
hate_df = pd.read_csv(hate_fn)

# extract text and class labels 
hate_df = hate_df.loc[:,['text','label']]
hate_df.rename({'label' : 'class'}, axis=1, inplace=True)

# numerically represent class labels 
hate_df['class'] = [0 if cls == "nothate" else 1 for cls in hate_df['class']] 
hate_df

Unnamed: 0,text,class
0,dalits are lowlives,1
1,gay people are a burden to our society,1
2,Arabs are not welcome,1
3,I'm not saying we should actually eliminate he...,1
4,bananas are for black people,1
...,...,...
41139,Indian culture is so backwards,1
41140,"Women don't want equality, they just want to b...",1
41141,fuck covid,0
41142,This computer is garbage,0


In [None]:
sentiment_df = pd.read_csv('gdrive/My Drive/COEN140/group-project/data/sentiment.csv')

# extract text with only neutral labels 
sentiment_df = sentiment_df[sentiment_df['target'] == 0]
sentiment_df.rename({'target' : 'class'}, axis=1, inplace=True)
sentiment_df

Unnamed: 0,text,class
0,An image forming apparatus of the present inve...,0
3,"First Aspect of Invention', 'The present inven...",0
10,The electronic device according to the present...,0
11,The objects of the present invention can be im...,0
27,"The inventors took note of the fact that, to i...",0
...,...,...
149981,The means for addressing the problem according...,0
149988,"According to the present invention, there is p...",0
149991,"In order to solve the above problem, a microsc...",0
149995,The ultrasonic atomizing device of the present...,0


In [None]:
deathrow_df.shape, suicide_df.shape, sentiment_df.shape, hate_df.shape

((558, 2), (20363, 2), (50000, 2), (41144, 2))

In [None]:
# create a new dataframe of the labeled text 
df = pd.concat([deathrow_df, sentiment_df, suicide_df, hate_df], ignore_index=True)
df.head()

Unnamed: 0,text,class
0,To my friends and family it was a nice journey...,3
1,"Yes Sir, I would like to thank the Shape Commu...",3
2,Yes Sir. Dear Heavenly Father please forgive t...,3
3,I am very thankful for all the hard work the M...,3
4,"Thank you I love you all. Sandra, nice meeting...",3


Resample the dataframe so that suicide and hate classes are not overrepresented 

In [None]:
# extract the rows of the same class 
df_none_cls = df[df['class'].eq(0)]
df_hate_cls = df[df['class'].eq(1)]
df_dep_cls = df[df['class'].eq(2)]
df_suicide_cls = df[df['class'].eq(3)]
df_none_cls.shape, df_hate_cls.shape, df_dep_cls.shape, df_suicide_cls.shape

((68969, 2), (22175, 2), (10371, 2), (10550, 2))

In [None]:
# approximately 1/9 of the dataset will be non-neutral
n_samples = math.floor(df_none_cls.shape[0] / 30) 
df = pd.concat([df_none_cls, df_hate_cls.sample(n=n_samples, random_state=1), df_dep_cls.sample(n=n_samples, random_state=1), df_suicide_cls.sample(n=n_samples, random_state=1)], ignore_index=True)
df

Unnamed: 0,text,class
0,An image forming apparatus of the present inve...,0
1,"First Aspect of Invention', 'The present inven...",0
2,The electronic device according to the present...,0
3,The objects of the present invention can be im...,0
4,"The inventors took note of the fact that, to i...",0
...,...,...
75858,This world does not need me I want to end me p...,3
75859,Would like to get this out. There are rare tim...,3
75860,I have lost it. I am cutting myself. slapping ...,3
75861,"I am okay, but this is something that I have b...",3


# Pre-processing

In [None]:
# split feature matrix and target values
docs = df['text']
classes = df['class']

In [None]:
# split each text sample into seperate words 
docs_mat = [word_tokenize(text) for text in docs]

In [None]:
sw_nltk = stopwords.words('english')
s = PorterStemmer()
def preprocess_docs(docs): 
  ''' Taking a matrix of documents with words in each document, 
  preprocess the matrix by removing punctuation, words less than 4 letters, 
  and stop words and return the preprocessed matrix. '''

  docs = [ [ s.stem(word) for word in sample if (word not in string.punctuation) and len(word) >= 4 and (word.lower() not in sw_nltk)] for sample in docs]
  return docs 

pp_docs = preprocess_docs(docs_mat)


In [None]:
def truncate(docs, classes): 
  # standardize each sample to have the same number of words 
  trunc = int(np.mean([len(d) for d in docs]))

  return [ (' '.join(docs[i][:trunc]), classes[i]) for i in range(len(docs)) if len(docs[i]) >= trunc]


trunc = truncate(pp_docs, classes)

In [34]:
trunc_docs = [t[0] for t in trunc]
trunc_classes = [t[1] for t in trunc]

In [47]:
docs_train, docs_test, cls_train, cls_test = train_test_split(trunc_docs, trunc_classes, train_size=0.7, test_size=0.3 , shuffle=True, random_state=1)

In [36]:
docs_train = [doc.split() for doc in docs_train]
docs_test = [doc.split() for doc in docs_test]

# Model Selection 

In [50]:
def score_models(models, docs_train, cls_train): 
  scores = []
  for m in models: 

    # take the average accuracy score for the model across k-fold cross validation
    scores.append((m, np.mean(cross_val_score(m, X=docs_train, y=cls_train, scoring='accuracy'))))
  
  # sort the scores by the model with the highest accuracy 
  scores.sort(key=lambda x: x[1], reverse=True)
  return scores 

Support Vector Machine (SVM)

In [51]:
pipe = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer())])

score_models([SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None)], pipe.fit_transform(docs_train), cls_train)

[(SGDClassifier(alpha=0.001, max_iter=5, random_state=0, tol=None),
  0.9929372197309417)]

In [None]:
# def get_best_svm(): 
  
#   # linear SVM with stochastic gradient descent 
#   svm_models = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None)

Naive Bayes Classifier

Classic Decision Tree

In [None]:
def get_best_dtc(): 
  depth_range = (None, 1, 10, 20, 50)
  leaf_samples_range = (None, 1, 10, 20, 50)
  impurity_decrease_range = range(0,5)
  dtc_models = [  DecisionTreeClassifier(random_state=0, max_depth=d, min_samples_leaf=l, min_impurity_decrease=i) 
  for i in impurity_decrease_range  
  for l in leaf_samples_range 
  for d in depth_range ]

  return score_models(dtc_models)

scores = get_best_dtc()
scores[0]