In [20]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from util.clean_text import quick_text_clean
from util import model_util, model_performance
from util.util import submit_test

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Load the raw training and test sets from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_df.csv', 'data/test_df.csv')
)

---
### Clean Data

In [3]:
# Build cleaned versions of both frames while leaving the raw `train_df` and
# `test_df` untouched: `assign` returns a new DataFrame whose `message` column
# holds the result of running `quick_text_clean` over each original message.
train_cleaned = train_df.assign(message=train_df['message'].apply(quick_text_clean))
test_cleaned = test_df.assign(message=test_df['message'].apply(quick_text_clean))

### Split and Vectorize

In [12]:
# Features are the cleaned message text; the target is the sentiment label.
X = train_cleaned['message']
y = train_cleaned['sentiment']

# Split into train / held-out portions via the project helper.
# NOTE(review): split ratio and any random seed live inside
# `TrainTestSplit.standard_split` -- confirm it is seeded if reproducibility matters.
X_train, X_test, y_train, y_test = model_util.TrainTestSplit(X, y).standard_split()

# Fit the vectorizer on the training portion only, then apply that same fitted
# vectorizer to the held-out portion. The names `X_train` / `X_test` are rebound
# from raw text to the vectorized features that later cells consume.
vect = model_util.ModelLibrary('count_vect').model
X_train, X_test = vect.fit_transform(X_train), vect.transform(X_test)

### Model Fit and Predictions

In [5]:
# Fetch the 'logistic_reg' model from the project's model library, fit it on
# the vectorized training portion, and predict labels for the held-out portion.
clf = model_util.ModelLibrary('logistic_reg').model
clf = clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

### Model Performance

In [14]:
# Note: 5-fold cross-validation (macro-averaged F1) run on the training split only.
# Original author's caveat: actual results should be higher than this mean.
fold_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro')
fold_scores.mean()

0.6321958189335335

In [15]:
# Per-class precision / recall / F1 for the held-out predictions.
# The report is a pre-formatted multi-line string, so it is printed rather
# than displayed as a bare expression.
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

          -1       0.73      0.45      0.56       253
           0       0.59      0.41      0.49       489
           1       0.76      0.87      0.81      1706
           2       0.74      0.75      0.75       716

    accuracy                           0.74      3164
   macro avg       0.71      0.62      0.65      3164
weighted avg       0.73      0.74      0.73      3164



### Submission

In [21]:
# Refit on ALL labelled data for the final submission.
#
# Fresh vectorizer / classifier objects and separate variable names are used
# here so that the fitted `vect`, `clf`, and held-out `predictions` from the
# evaluation cells are not overwritten. Re-fitting those objects in place would
# change the learned vocabulary and replace `predictions` with test-set output,
# so the evaluation cells could no longer be re-run consistently against `y_test`.
# NOTE(review): assumes `ModelLibrary(...).model` hands back a new, unfitted
# estimator on each call -- confirm in util/model_util.
submission_vect = model_util.ModelLibrary('count_vect').model
X_vect_train = submission_vect.fit_transform(X)
X_vect_test = submission_vect.transform(test_cleaned.message)

submission_clf = model_util.ModelLibrary('logistic_reg').model.fit(X_vect_train, y)
submission_predictions = submission_clf.predict(X_vect_test)
tweetid = test_cleaned.tweetid

# Pass the tweet ids and their predicted labels to the project's submission helper.
submit_test(tweetid, submission_predictions)

Submission name and version: 
 >>> Submission
