<a href="https://colab.research.google.com/github/georgehtliu/ignition-hack-2020/blob/master/submission_extras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import nltk 
import string
import re

# **Pre-processing**

## Lemmatization with Part-of-speech Tagging (using NLTK)
Significantly increases training time and decreases f1 scores by ~1%.

In [None]:
# Fetch the WordNet corpus; the `wordnet` object imported here supplies the
# part-of-speech constants (ADJ, NOUN, VERB, ADV) used by get_wordnet_pos.
nltk.download('wordnet')
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the constant: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

from nltk.stem import WordNetLemmatizer 
# Download the NLTK data packages 'punkt' and 'averaged_perceptron_tagger'.
# The tagger model is presumably what nltk.pos_tag (called in get_wordnet_pos) loads.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Module-level lemmatizer instance; wn_lemmatize takes a lemmatizer as an explicit argument.
lemmatizer = WordNetLemmatizer()

def wn_lemmatize(sentence, lemmatizer):
    """Lemmatize every whitespace-separated word of ``sentence``.

    Each word is passed to ``lemmatizer.lemmatize`` together with the WordNet
    part of speech that ``get_wordnet_pos`` reports for that word on its own.
    The lemmas are returned re-joined with single spaces.
    """
    # str.split() with no separator never yields empty tokens, so every token
    # is lemmatized and no length check is required.
    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in sentence.split()
    ]
    return ' '.join(lemmas)

# To implement: 
# mini_df["Text"] = mini_df['Text'].apply(lambda sentence: wn_lemmatize(sentence, lemmatizer))

## Name Lemmatization / Generalization
Slightly decreases f1 scores.

In [None]:
def lemmatizeName(text):
    """Drop a leading @mention or #hashtag token from ``text``.

    If ``text`` starts with '@' or '#', its first whitespace-separated token
    is blanked out and the remaining tokens are re-joined with single spaces.
    The blanked token is kept as an empty string, so the result starts with a
    space whenever other tokens follow. Any other text, including the empty
    string, is returned unchanged.
    """
    # startswith() is safe on an empty string, whereas indexing text[0] raises IndexError.
    if text.startswith(('@', '#')):
        words = text.split()
        words[0] = ''
        return ' '.join(words)
    return text

# To implement:
# df['Text'] = df['Text'].map(lambda text: lemmatizeName(text))

## Stop Words
Decreases f1 scores by ~1%

In [None]:
# Fetch NLTK's stop-word corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words; read as a module-level global by remove_stopwords.
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    """Remove English stop words from ``text``.

    ``text`` may be a raw string, which is split on whitespace, or an
    already-tokenized iterable of words. Words found in the module-level
    ``stopword`` list are dropped and the rest are returned joined by single
    spaces. Matching is exact, so it is case-sensitive.
    """
    # Iterating a str yields single characters, so a raw sentence would be
    # filtered letter by letter and come back with a space between every
    # character. Split it into words first.
    words = text.split() if isinstance(text, str) else text
    return ' '.join(word for word in words if word not in stopword)

# To implement: 
# df['Text'] = df['Text'].apply(lambda x: remove_stopwords(x))

## Remove Punctuation
Redundant due to vectorizer's built-in functionality.

In [None]:
def remove_punct(text):
    """Return ``text`` with ASCII punctuation and the digits 0-9 removed.

    All other characters, whitespace included, are kept in place.
    """
    # One translation pass deletes every character listed in the table.
    unwanted = string.punctuation + string.digits
    return text.translate(str.maketrans('', '', unwanted))

# To implement: 
# df['Text'] = df['Text'].map(lambda text: remove_punct(text))

## Tokenization
Redundant due to vectorizer's built-in functionality.

In [None]:
def tokenization(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Returns the list produced by ``re.split``. Leading or trailing
    separators therefore leave an empty string at that end of the list, and
    an empty input gives ``['']``.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python versions.
    return re.split(r'\W+', text)

# To implement: 
# df['Text'] = df['Text'].map(lambda text: tokenization(text))

# **Classifiers**

## Neural Network
Very slow to train, mediocre accuracy.

In [None]:
# Multi-layer perceptron: two hidden layers of 64 units, ReLU activations, Adam solver.
clf = MLPClassifier(hidden_layer_sizes=(64, 64), activation='relu', solver='adam')
clf.fit(X_train_vectors, y_train)

# average=None reports one F1 score per label (0 and 1) instead of a single aggregate.
print(f1_score(y_true=y_test, y_pred=clf.predict(X_test_vectors), labels=[0, 1], average=None))

## Decision Tree
Sub-par accuracy.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid: every combination below is evaluated by the grid search.
parameters_dt = {
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_depth': (None, 4, 100, 1000),
}
dt = DecisionTreeClassifier()

# Exhaustive search over the grid with 5-fold cross-validation.
clf_dt = GridSearchCV(estimator=dt, param_grid=parameters_dt, cv=5)
clf_dt.fit(X_train_vectors, y_train)

# Per-label F1 (labels 0 and 1) on the held-out test vectors.
print(f1_score(y_true=y_test, y_pred=clf_dt.predict(X_test_vectors), labels=[0, 1], average=None))

## SVM
Does not scale to large datasets (training time grows at least quadratically with the number of samples). Good accuracy on smaller datasets.

In [None]:
# RBF-kernel support vector classifier.
clf_svm = SVC(kernel='rbf', C=4, decision_function_shape='ovo')
clf_svm.fit(X_train_vectors, y_train)

## Around 68% accuracy using 8000 of the 1M training examples
# Score the SVM itself: `clf` is the neural network fitted in an earlier cell,
# so predicting with it here would report the wrong model's F1.
print(f1_score(y_test, clf_svm.predict(X_test_vectors), average=None, labels=[0,1]))

## SGD
Very fast to train, but does not improve much at all as dataset size increases.

In [None]:
import sklearn

# scikit-learn 1.1 renamed the logistic-regression loss from 'log' to
# 'log_loss' and 1.3 removed the old spelling. Pick the name the installed
# version accepts so the cell runs on both old and new releases.
_sk_major, _sk_minor = map(int, re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
_logistic_loss = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# Linear model trained with SGD: logistic loss, elastic-net penalty (5% L1, 95% L2).
clf_sgd = SGDClassifier(loss=_logistic_loss, penalty='elasticnet', l1_ratio=0.05)
clf_sgd.fit(X_train_vectors, y_train)

# Per-label F1 (labels 0 and 1) on the held-out test vectors.
print(f1_score(y_test, clf_sgd.predict(X_test_vectors), average=None, labels=[0,1]))