### Libraries

In [1]:
import sys
# Add the sibling helper-script directory to the import path (relative path, so it depends on where the notebook is run from).
sys.path.append('../python-scripts')
# NOTE(review): the wildcard import hides where names used below (load_and_split_data, tweet_classifier,
# hate_speech_classifier) come from — presumably all from fletcher; consider importing them explicitly.
from fletcher import *



In [2]:
%matplotlib inline

### Overall Classifier

In [None]:
# Load the data with the default arguments and unpack the four returned pieces
# (train/validation features and labels, judging by the names).
x_train, x_val, y_train, y_val = load_and_split_data()

In [None]:
# Single-stage baseline built from the undivided training split loaded above.
overall = tweet_classifier(x_train, y_train)
overall.build_model(max_df=0.7,
                    max_features=6000,
                    ngram_range=(1,3),
                    boosted=False)
# .score is read right after each predict() call; presumably predict() refreshes it — TODO confirm.
overall.predict(x_train, y_train)
print('Training Score: ', overall.score)
# The held-out split here is x_val/y_val, so label the number as a validation score.
overall.predict(x_val, y_val)
print('Validation Score: ', overall.score)

### Stage One:
* Class 0 is a tweet with offensive language
* Class 1 is a tweet with clean language  

#### Data

In [3]:
# Load the stage=1 split (offensive vs. clean, per the section notes) and unpack train/test features and labels.
x1_train, x1_test, y1_train, y1_test = load_and_split_data(stage=1)

In [4]:
# Label counts in the stage-one training split, to check class balance.
y1_train.value_counts()

1    11599
0     2341
Name: class_second, dtype: int64

In [5]:
# Label counts in the stage-one test split, to check class balance.
y1_test.value_counts()

1    3866
0     781
Name: class_second, dtype: int64

#### Helper functions

#### First Classifier
This model may be overfitting.

TF-IDF with SVD has consistently performed best, so I'll keep that combination for now and tune the classifier.

In [None]:
# Candidate n_estimators values for the stage-one grid search.
params_one = {'n_estimators': [2, 5, 10, 15, 20, 30, 50]}

In [None]:
# Disabled: the grid search over params_one is kept for reference only; the next code cell
# builds stage_one directly with n_estimators=20.
# NOTE(review): grid_search is not defined in this notebook — presumably it comes from fletcher; confirm before re-enabling.
#stage_one = grid_search(x1_train, y1_train, x1_test, y1_test, params_one, plot=True)

#### Build best model here:

In [6]:
# Stage-one model: tfidf vectorizer, SVD reduction, 20 estimators, boosting off.
stage_one = tweet_classifier(x1_train, y1_train)
stage_one.build_model(
    vectorizer_method='tfidf',
    reduction_method='SVD',
    n_estimators=20,
    boosted=False
)
# Evaluate on the stage-one test split; the bare expression below displays the resulting score.
stage_one.predict(x1_test, y1_test)
stage_one.score

0.95887221934816347

### Stage Two:
* Class 0 is hate speech
* Class 1 is offensive but not hate speech

#### Data

In [7]:
# Load the stage=2 split (hate speech vs. offensive-only, per the section notes) and unpack train/test features and labels.
x2_train, x2_test, y2_train, y2_test = load_and_split_data(stage=2)

In [None]:
# Label counts in the stage-two training split, to check class balance.
y2_train.value_counts()

In [None]:
# Label counts in the stage-two test split, to check class balance.
y2_test.value_counts()

#### Second Classifier

Changing the vectorizer, the reduction method, or the n-gram range made little difference.

In [None]:
# Candidate n_estimators values for the stage-two grid search (same grid as params_one).
params_two = {'n_estimators': [2, 5, 10, 15, 20, 30, 50]}

In [None]:
# Disabled: the grid search over params_two is kept for reference only; the next code cell
# builds stage_two directly with n_estimators=20.
# NOTE(review): grid_search is not defined in this notebook — presumably it comes from fletcher; confirm before re-enabling.
# stage_two = grid_search(x2_train, y2_train, x2_test, y2_test, params_two)

#### Build best model here:

In [8]:
stage_two = tweet_classifier(x2_train, y2_train)
stage_two.build_model(ngram_range=(1, 3),
                      reduction_method="NMF",
                      vectorizer_method="tfidf",
                      n_estimators=20, boosted=False)
stage_two.predict(x2_test, y2_test)
stage_two.score

0.10074626865671642

### Validate Two Stage Process

In [None]:
# Load the stage=0 split: features plus two label sets (stage-one and stage-two labels, judging by the names).
# NOTE(review): this rebinds x_val from the Overall Classifier section, and the same call is repeated
# in the "Two Stage Classifier" section below.
x_val, y1_val, y2_val = load_and_split_data(stage=0)

In [None]:
# Counts of the stage-one labels in the validation data.
y1_val.value_counts()

In [None]:
# Counts of the stage-two labels in the validation data.
y2_val.value_counts()

#### Two Stage Classifier

In [9]:
# NOTE(review): identical to the stage=0 load under "Validate Two Stage Process"; one of the two looks
# redundant, and if the split is randomized, rerunning it changes the validation data — confirm and keep only one.
x_val, y1_val, y2_val = load_and_split_data(stage=0)

In [10]:
# Chain the two stage models and evaluate on the validation data against both label sets.
# NOTE(review): the recorded output ends in two 0.0 values — confirm what each element of the
# returned tuple measures before trusting the combined result.
model = hate_speech_classifier(stage_one, stage_two)
model.predict(x_val, y1_test=y1_val, y2_test=y2_val)

(0.95790494665373427, 0.0, 0.0)