<a href="https://colab.research.google.com/github/georgehtliu/ignition-hack-2020/blob/master/ignition-hacks-2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.svm import SVC

In [None]:
# Mount Google Drive so files stored there can be read from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

# Ask for the CSV location. A pasted path often carries stray leading/trailing
# whitespace, which would make read_csv fail with FileNotFoundError, so trim it.
path = input("Please enter the path to your training_data.csv file in your Google Drive. (Right click the file on the left and click copy path and paste it in here.)").strip()

# Load the file and keep only the text column and its sentiment label.
df = pd.read_csv(path)
df = df[['Text','Sentiment']]

## Replace a leading @mention / #hashtag with a placeholder name
## (despite the function's name, this is not lemmatization).
def lemitize(text):
    """Swap the first token for 'David' when `text` starts with '@' or '#'.

    Args:
        text: The raw text of one example.

    Returns:
        If `text` is a string whose very first character is '@' or '#', the
        text with its first whitespace-delimited token replaced by 'David';
        as a side effect of the split/join, runs of whitespace in that text
        collapse to single spaces. Otherwise `text` is returned unchanged.
        This includes the empty string and non-string values (e.g. a NaN
        produced for a missing cell), which previously raised
        IndexError / TypeError.
    """
    # startswith() is safe on "" where text[0] is not; the isinstance guard
    # lets missing (non-string) values pass through instead of crashing here.
    if not isinstance(text, str) or not text.startswith(('@', '#')):
        return text
    tokens = text.split()
    tokens[0] = 'David'
    return ' '.join(tokens)

# Normalize every row of the text column; the function is passed to map() directly.
df['Text'] = df['Text'].map(lemitize)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


KeyboardInterrupt: ignored

In [None]:
# English stop words handed to the vectorizer, grouped loosely by role for readability.
stopwords = [
    # personal pronouns, possessives and reflexives
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs of time, place and manner
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    # quantifiers, negations and qualifiers
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # remaining entries, including fragments such as "s", "t" and "don"
    "s", "t", "can", "will", "just", "don", "should", "now",
]

In [None]:
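# Note: the data is split 50-50 between the two Sentiment classes (0 and 1).
# Disabled alternative: load the CSV from the working directory instead.
# df = pd.read_csv('training_data.csv')
# df = df[['Text','Sentiment']]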

In [None]:
# Work on a random subset of the loaded frame rather than all of it.
SAMPLE_SIZE = 20000
RANDOM_STATE = 42

# Seed the draw so a re-run selects the same rows, and cap the size at the
# frame's length so a smaller CSV does not make sample() raise ValueError.
mini_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

X = np.array(mini_df['Text'])
y = np.array(mini_df['Sentiment'])

# Hold out 20% of the subset for evaluation; the split itself is seeded too.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=RANDOM_STATE)

In [None]:
# TF-IDF features (noted to work better than the count vectorizer).
# The vectorizer is fitted on the training split only, then the same fitted
# vectorizer transforms both splits.
vectorizer = TfidfVectorizer(stop_words=stopwords).fit(X_train)
X_train_vectors = vectorizer.transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

## Neural Network

In [None]:
# Multi-layer perceptron: two hidden layers of 64 ReLU units, trained with Adam.
# random_state pins weight initialization and batch sampling so the score
# printed in the next cell is reproducible across runs.
clf = MLPClassifier(solver='adam', activation='relu', hidden_layer_sizes=(64,64), random_state=42)
clf.fit(X_train_vectors, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(64, 64), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [None]:
# Per-class F1 on the held-out split, in label order (0, then 1).
mlp_predictions = clf.predict(X_test_vectors)
print(f1_score(y_test, mlp_predictions, labels=[0, 1], average=None))

[0.69952464 0.69997502]


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid searched exhaustively with 5-fold cross-validation.
parameters_dt = {'criterion': ('gini', 'entropy'), 'splitter': ('best', 'random'), 'max_depth': (None, 4,100,1000)}
# Fixed seed: the 'random' splitter (and tie-breaking between equally good
# splits) is otherwise nondeterministic, which makes the search unrepeatable.
dt = DecisionTreeClassifier(random_state=42)

clf_dt = GridSearchCV(dt, parameters_dt, cv = 5)

clf_dt.fit(X_train_vectors, y_train)

# Per-class F1 of the best estimator on the held-out split (0, then 1).
print(f1_score(y_test, clf_dt.predict(X_test_vectors), average=None, labels=[0,1]))

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB needs dense input, so the sparse TF-IDF matrices are converted
# with toarray() before fitting and predicting.
clf_gnb = GaussianNB()
clf_gnb.fit(X_train_vectors.toarray(), y_train)

# Report per-class F1 (0, then 1) exactly like the other models in this
# notebook; the default average would print only the score for class 1,
# which is not comparable with the neighbouring results.
print(f1_score(y_test, clf_gnb.predict(X_test_vectors.toarray()), average=None, labels=[0,1]))

0.3763329893360853


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Grid over regularization strength C and optimizer, scored with 5-fold CV.
parameters_log = {'C': (1.0,4.0,8.0,16.0,32.0), 'solver': ('sag', 'saga', 'lbfgs', 'newton-cg')}
# 'sag' and 'saga' shuffle the data while optimizing; a fixed seed keeps their
# fits, and therefore the selected best_params_, repeatable between runs.
log = LogisticRegression(max_iter=1000, random_state=42)

clf_log = GridSearchCV(log, parameters_log, cv=5)

clf_log.fit(X_train_vectors, y_train)

# Per-class F1 of the best estimator on the held-out split (0, then 1).
print(f1_score(y_test, clf_log.predict(X_test_vectors), average=None, labels=[0,1]))

[0.7345679  0.74902724]


In [None]:
# Show which C / solver combination the grid search selected.
best_log_params = clf_log.best_params_
print(best_log_params)

{'C': 1.0, 'solver': 'sag'}


# SVM

In [None]:
# RBF-kernel support vector classifier; fit() returns the estimator itself,
# so `clf` ends up bound to the fitted model.
# NOTE(review): decision_function_shape presumably has no effect with a
# two-class target — confirm whether it is needed.
clf = SVC(C=4, kernel='rbf', decision_function_shape='ovo').fit(X_train_vectors, y_train)

## Earlier note: around 68% accuracy using 8000 of the 1M training examples.
## NOTE(review): this cell prints per-class F1 rather than accuracy, and the
## sampling cell draws 20000 rows — confirm the figure above is still current.
svm_predictions = clf.predict(X_test_vectors)
print(f1_score(y_test, svm_predictions, labels=[0, 1], average=None))
