### Libraries

In [1]:
import sys
sys.path.append('../python-scripts')
from fletcher import *



In [2]:
%matplotlib inline

### Dummy Classifier

In [3]:
dummy, actual = load_and_split_data(stage='dummy')
f1_score(dummy, actual, average='weighted')

  'recall', 'true', average, warn_for)


0.87280831419280014

### Overall Classifier

In [4]:
x_train, x_val, y_train, y_val = load_and_split_data()

In [None]:
params_overall = {'n_estimators': np.arange(20, 700, 10)}
overall = grid_search(x_train, x_val, y_train, y_val,
                      params_overall,
                      summary_plot=True,
                      print_steps=True,
                      log_file=True,
                      stage='Tuning Overall Classifier Overnight')

In [5]:
overall = tweet_classifier(x_train, y_train)
overall.build_model(boosted=True,
                    n_estimators=50)
overall.predict(x_train, y_train)
print('Training Score: ', overall.score)
overall.predict(x_val, y_val)
print('Testing Score: ', overall.score)

Training Score:  0.875580610981
Testing Score:  0.870467333223


Test on false flag

In [None]:
tweet = load_pickle('potential-false-flags/quoting-abuse-1.pkl')
false_flag_tweet = [tweet['statuses'][0]['text']]
false_flag_tweet

In [None]:
overall.predict(false_flag_tweet, [1])
overall.score

Model correctly classifies this as offensive but not hate speech.

### Stage One:
* Class 0 is a tweet with offensive language
* Class 1 is a tweet with clean language  

#### Data

In [None]:
x1_train, x1_test, y1_train, y1_test = load_and_split_data(stage=1)

In [None]:
y1_train.value_counts()

In [None]:
y1_test.value_counts()

#### Helper functions

#### First Classifier
This model may be overfitting.

TFIDF with SVD is consistently performing best. I'll stick with those for now, and tune the classifier.

In [None]:
params_one = {'ngram_range': [(1,2), (1,3)],
              'max_df': [0.3, 0.4, 0.5, 0.6, 0.7],
              'max_features': [2000, 3000, 4000, 5000, 6000]}

In [None]:
stage_one = grid_search(x1_train, y1_train, x1_test, y1_test,
                        params_two_estimators,
                        summary_plot=False,
                        print_steps=True,
                        log_file=True,
                        stage='Stage One')

#### Build best model here:

In [None]:
stage_one = tweet_classifier(x1_train, y1_train)
stage_one.build_model(boosted=True,
                      n_estimators=560,
                      max_df=0.5,
                      max_features=5000,
                      ngram_range=(1,3))
stage_one.predict(x1_test, y1_test)
stage_one.score

In [None]:
save_pickle(stage_one, 'stage_one_model_best_xgboost.pkl')

### Stage Two:
* Class 0 is hate speech
* Class 1 is offensive but not hate speech

#### Data

In [None]:
x2_train, x2_test, y2_train, y2_test = load_and_split_data(stage=2)

In [None]:
y2_train.value_counts()

In [None]:
y2_test.value_counts()

#### Second Classifier

Changing the vectorizer, reduction method, and ngram range made little difference.

In [None]:
params_two = {'ngram_range': [(1,2), (1,3)],
              'max_df': [0.4, 0.5, 0.6],
              'max_features': [2000, 3000, 4000, 5000, 6000],
              'n_estimators': [250, 275, 300, 325, 350, 375, 400],
              'n_components': [30, 40, 50, 60, 70, 80, 90],
              'max_depth': [2, 3]}

In [None]:
params_two_estimators = {'n_estimators': np.arange(10, 301, 10)}

In [None]:
stage_two = grid_search(x2_train, y2_train, x2_test, y2_test,
                        params_two_estimators,
                        print_steps=True,
                        summary_plot=True,
                        log_file=True,
                        stage='Stage Two Testing Estimators')

#### Build Best Model here:

In [None]:
stage_two = tweet_classifier(x2_train, y2_train)
stage_two.build_model(boosted=True,
                      n_estimators=50)
stage_two.predict(x2_test, y2_test)
stage_two.score

In [None]:
len(stage_two.pred)

In [None]:
len(stage_two.pred[stage_two.pred == 0])

In [None]:
len(stage_two.pred[stage_two.pred == 1])

In [None]:
save_pickle(stage_two, 'stage_two_model_best_xgboost.pkl')

### Validate Two Stage Process

In [None]:
x_val, y1_val, y2_val = load_and_split_data(stage=0)

In [None]:
y1_val.value_counts()

In [None]:
y2_val.value_counts()

#### Two Stage Classifier

In [None]:
stage_one = load_pickle('stage_one_model_best_xgboost.pkl')
stage_two = load_pickle('stage_two_model_best_xgboost.pkl')

In [None]:
model = hate_speech_classifier(stage_one, stage_two)
model.predict(x_val,
              y1_test=y1_val,
              y2_test=y2_val)

The first stage classifier is performing equally well on the training and testing data. The second stage classifier is performing too well on the training data and ok on the testing data. That model is probably overfitting.