In [1]:
import math
import string
import re

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter

import pandas as pd 
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alkal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alkal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# death_row_fn = 'gdrive/My Drive/COEN140/group-project/data/Last-Statement-of-Death-Row.csv'
# suicide_depression_fn = 'gdrive/My Drive/COEN140/group-project/data/reddit_depression_suicidewatch.csv'
# hate_fn = 'gdrive/My Drive/COEN140/group-project/data/Dynamically_Generated_Hate_Dataset_v0.2.3.csv'
# sentiment_fn = 'gdrive/My Drive/COEN140/group-project/data/sentiment.csv'
# output_fn = 'gdrive/My Drive/COEN140/group-project/data/output.dat'

#read train.csv file into dataframe
#data was pre-processed, cleaned in 'data_collection.ipynb'
data = pd.read_csv('train.csv', header=0)
df = data.iloc[:,1:]
df.dropna() #drop empty rows from dataframe
#fixes bug: thinks row 82162 is nan which messes up the pre-processing
df.iloc[82162,0] = "none"
df.iloc[82162,0]

'none'

Class labels are 

0 - Neutral 

1 - Hate

2 - Depression

3 - Suicidal

# Pre-processing

In [4]:
# split feature matrix and target values
docs = df['text']
classes = df['class']

In [5]:
# split each text sample into seperate words 
docs_mat = [word_tokenize(text) for text in docs]

In [6]:
sw_nltk = stopwords.words('english')
s = PorterStemmer()
def preprocess_docs(docs): 
  ''' Taking a matrix of documents with words in each document, 
  preprocess the matrix by removing punctuation, words less than 4 letters, 
  and stop words and return the preprocessed matrix. '''

  docs = [ [ s.stem(word) for word in sample if (word not in string.punctuation) and len(word) >= 4 and (word.lower() not in sw_nltk)] for sample in docs]
  return docs 

pp_docs = preprocess_docs(docs_mat)


In [7]:
def truncate(docs, classes): 
  # standardize each sample to have the same number of words 
  trunc = int(np.mean([len(d) for d in docs]))

  return [ (' '.join(docs[i][:trunc]), classes[i]) for i in range(len(docs)) if len(docs[i]) >= trunc]


trunc = truncate(pp_docs, classes)

In [8]:
trunc_docs = [t[0] for t in trunc]
trunc_classes = [t[1] for t in trunc]

In [28]:
docs_train, docs_test, cls_train, cls_test = train_test_split(trunc_docs, trunc_classes, train_size=0.7, test_size=0.3 , shuffle=True, random_state=1)
type(docs_train), type(docs_train[0]), docs_train[0]

(list,
 str,
 'distil gasif wast inciner method accord present invent distil gasif wast inciner method plural distil furnac provid combust furnac wast held distil furnac dry-distil sequenc therebi produc combust control carri temperatur combust furnac becom predetermin first temperatur case combust introduc combust furnac burnt method includ step suppli oxygen requir distil wast first distil furnac control degre open first valv provid first oxygen suppli passag temperatur combust furnac becom first temperatur combust combust case combust produc dry-distil wast held first distil furnac use oxygen suppli first distil furnac first oxygen suppli passag oxygen suppli sourc combust introduc combust furnac burnt step detect presenc wast second distil furnac control carri temperatur combust furnac becom first temperatur combust combust produc first distil furnac ignit wast held second distil furnac use oxygen suppli second distil furnac second oxygen suppli passag oxygen suppli sourc step dry-

In [31]:
#split all of the words in the samples within the train and test documents
docs_train2 = [doc.split() for doc in docs_train]
docs_test2 = [doc.split() for doc in docs_test]
type(docs_train2), type(docs_train2[0]), docs_train2[0]

(list,
 list,
 ['distil',
  'gasif',
  'wast',
  'inciner',
  'method',
  'accord',
  'present',
  'invent',
  'distil',
  'gasif',
  'wast',
  'inciner',
  'method',
  'plural',
  'distil',
  'furnac',
  'provid',
  'combust',
  'furnac',
  'wast',
  'held',
  'distil',
  'furnac',
  'dry-distil',
  'sequenc',
  'therebi',
  'produc',
  'combust',
  'control',
  'carri',
  'temperatur',
  'combust',
  'furnac',
  'becom',
  'predetermin',
  'first',
  'temperatur',
  'case',
  'combust',
  'introduc',
  'combust',
  'furnac',
  'burnt',
  'method',
  'includ',
  'step',
  'suppli',
  'oxygen',
  'requir',
  'distil',
  'wast',
  'first',
  'distil',
  'furnac',
  'control',
  'degre',
  'open',
  'first',
  'valv',
  'provid',
  'first',
  'oxygen',
  'suppli',
  'passag',
  'temperatur',
  'combust',
  'furnac',
  'becom',
  'first',
  'temperatur',
  'combust',
  'combust',
  'case',
  'combust',
  'produc',
  'dry-distil',
  'wast',
  'held',
  'first',
  'distil',
  'furnac',
  'u

# Model Selection 

In [21]:
def score_models(models, docs_train, cls_train): 
  scores = []
  for m in models: 

    # take the average accuracy score for the model across k-fold cross validation
    scores.append((m, np.mean(cross_val_score(m, X=docs_train, y=cls_train, scoring='accuracy'))))
  
  # sort the scores by the model with the highest accuracy 
  scores.sort(key=lambda x: x[1], reverse=True)
  return scores 

Support Vector Machine (SVM)

In [37]:
pipe = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer())])

#the model was not functioning properly when passing in docs_train after splitting all of the words in the samples
score_models([SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None)], pipe.fit_transform(docs_train), cls_train)

[(SGDClassifier(alpha=0.001, max_iter=5, random_state=0, tol=None),
  0.9869172758263914)]

In [None]:
# def get_best_svm(): 
  
#   # linear SVM with stochastic gradient descent 
#   svm_models = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None)

Naive Bayes Classifier

In [39]:
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer())])

#the model was not functioning properly when passing in docs_train after splitting all of the words in the samples
score_models([MultinomialNB()], pipe.fit_transform(docs_train), cls_train)

[(MultinomialNB(), 0.9910185964517945)]

Classic Decision Tree

In [44]:
pipe = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer())])

docs_train3 = pipe.fit_transform(docs_train)


def get_best_dtc(): 
  depth_range = (None, 1, 10, 20, 50)
  leaf_samples_range = (1, 10, 20, 50)
  impurity_decrease_range = range(0,5)
  dtc_models = [  DecisionTreeClassifier(random_state=0, max_depth=d, min_samples_leaf=l, min_impurity_decrease=i) 
  for i in impurity_decrease_range  
  for l in leaf_samples_range 
  for d in depth_range ]

  return score_models(dtc_models, docs_train3, cls_train)



scores = get_best_dtc()
scores[0]

(DecisionTreeClassifier(max_depth=10, min_impurity_decrease=0, random_state=0),
 0.9898765752516077)