In [1]:
import pandas as pd

import sklearn

# Preprocessing functions
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# Plug and play classifiers
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

# Saving the model
from joblib import dump, load

In [2]:
def read_data(filename):
    data = pd.read_csv(filename)
    data.loc[data.Party == "Independent", "Party"] = "Democrat"
    print(data.Party.value_counts())
    X = data[["stemmed", "neg", "neu", "pos", "compound"]]
    y = data['Party']
    return X, y

In [3]:
train_X, train_y = read_data('train_data.csv')
test_X, test_y = read_data('test_data.csv')

Democrat      59171
Republican    43201
Name: Party, dtype: int64
Democrat      14793
Republican    10801
Name: Party, dtype: int64


## Preprocessor Pipeline

In [4]:
def get_preprocessor(specify=False, ngram_range=None, use_idf=None):
    # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py
    # https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
    #https://webcache.googleusercontent.com/search?q=cache:jfbAd7R40V8J:https://www.tutorialguruji.com/python/transfomers-for-mixed-data-types/+&cd=7&hl=en&ct=clnk&gl=us
    numeric_features = ["neg", "neu", "pos", "compound"]
    numeric_transformer = MinMaxScaler()

    vect = CountVectorizer()
    tfidf = TfidfTransformer()
    
    if specify:
        vect.set_params(ngram_range=ngram_range)
        tfidf.set_params(use_idf=use_idf)
    
    text_transformer = Pipeline(steps=[
        ("vect", vect),
        ("tfidf", tfidf),
        ("norm", Normalizer())
    ])

    numeric_transformers = [(x + '_scaler', numeric_transformer, [x]) for x in numeric_features]

    all_transformers = numeric_transformers + [("text", text_transformer, "stemmed")] 
    preprocessor = ColumnTransformer(transformers=all_transformers)
    
    return preprocessor

## Plug and Play Pipelines

In [5]:
def get_pipeline(preprocessor, classifier):
    return Pipeline([
        ('preprocessor', preprocessor),
        ('clf', classifier)
    ])

In [7]:
preprocessor = get_preprocessor()

#### Random Forest Classifier

In [8]:
max_depth = 110
rfc_classifier = RandomForestClassifier(max_depth=max_depth)

rfc_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    # 'clf__n_estimators': (25, 50, 75, 100),
    # 'clf__criterion': ('gini', 'entropy'),
    # 'clf__max_depth': (10, 20, 30, None),
    # 'clf__min_samples_leaf': (1, 10, 20, 30),
    # 'clf__max_features': ('sqrt', 'log2'),
    'clf__bootstrap': (True, False)
}

rfc = get_pipeline(preprocessor, rfc_classifier)

#### SVM Classifier

In [9]:
svm_classifier = SGDClassifier(
    loss='hinge', penalty='l2', tol=None
)

svm_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    'clf__alpha': (1e-1, 5e-2, 1e-2, 1e-3),
    'clf__max_iter': (5, 10, 20)
}

svm = get_pipeline(preprocessor, svm_classifier)

#### Multinomial Naive Bayes Classifier

In [10]:
mnb_classifier = MultinomialNB()

mnb_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    'clf__alpha': (0, 0.025, 0.05, 0.075, 0.1),
    'clf__fit_prior': (True, False)
}

mnb = get_pipeline(preprocessor, mnb_classifier)

#### Linear Support Vector Classifier

In [11]:
svc_classifier = LinearSVC(penalty='l1', dual=False)

svc_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    'clf__loss': ('squared_hinge', ),
    'clf__C': (10, 20),
    'clf__max_iter': (5, 10, 20)
}

svc = get_pipeline(preprocessor, svc_classifier)

#### Standard SVC (to get probability scores)

In [12]:
svc_classifier = SVC(probability=True)

svc_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'clf__C': (0.5, 1, 5, 10),
    #'clf__max_iter': (5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
}

svc = get_pipeline(preprocessor, svc_classifier)

## Grid Search CV

In [13]:
def grid_search_clf(class_pipe, parameters, X, y):
    gs_clf = GridSearchCV(class_pipe, parameters, cv=10, n_jobs=-1)
    gs_clf = gs_clf.fit(X, y)
    '''
    print(gs_clf.score(X_test, y_test))
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    '''
    return gs_clf

In [None]:
start = time.time()
gs = grid_search_clf(svc, svc_parameters, train_X, train_y)
end = time.time()
print(end - start)

In [123]:
dump(gs, './svc_clf_l1.joblib')

['./svc_clf_l1.joblib']

In [None]:
gs.score(train_X, train_y)

In [None]:
gs.score(test_X, test_y)

In [None]:
gs.best_params_

## Retraining the best models.

### 1. Random Forest Classifier

#### Compare the 10 random forest classifier models.

The only differences between the models are the maximum depths that the trees were allowed to grow. Each of the forests were trained separately due to how much time each took (from ~10 min for max_depth = 10 to ~1 hour for max_depth = 100).

In [13]:
random_forests = []
for i in range(10, 101, 10):
    random_forests.append(load('./random_forest_clf_maxdepth_{:d}.joblib'.format(i)))

In [14]:
max_train = -1
max_rcf = None

for i in range(10):
    score = random_forests[i].score(train_X, train_y)
    if score > max_train:
        max_train = score
        max_rcf = random_forests[i]
        
    print("Max_Depth:", (i + 1) * 10)
    print("\tTrain:", score)
    print("\tTest:", random_forests[i].score(test_X, test_y))

Max_Depth: 10
	Train: 0.583724065174071
	Test: 0.5826365554426819
Max_Depth: 20
	Train: 0.6427538780135194
	Test: 0.633820426662499
Max_Depth: 30
	Train: 0.7146290001172195
	Test: 0.6866062358365241
Max_Depth: 40
	Train: 0.7763450943617395
	Test: 0.7182933500039071
Max_Depth: 50
	Train: 0.8404153479467041
	Test: 0.7424396342892865
Max_Depth: 60
	Train: 0.8917379752276013
	Test: 0.7631085410643119
Max_Depth: 70
	Train: 0.9247255108818818
	Test: 0.7722513089005235
Max_Depth: 80
	Train: 0.9537666549447115
	Test: 0.7812377901070563
Max_Depth: 90
	Train: 0.9681651232758958
	Test: 0.7877627568961475
Max_Depth: 100
	Train: 0.9824366037588403
	Test: 0.793584433851684


In [15]:
max_rcf.best_params_

{'clf__bootstrap': False,
 'preprocessor__text__tfidf__use_idf': False,
 'preprocessor__text__vect__ngram_range': (1, 1)}

In [16]:
max_train

0.9824366037588403

In [14]:
rfc_preprocessor = get_preprocessor(specify=True, ngram_range=(1, 1), use_idf=False)

In [16]:
rfc_classifier = RandomForestClassifier(max_depth=100, bootstrap=False, verbose=2)

rfc_retrain = get_pipeline(rfc_preprocessor, rfc_classifier)
rfc_retrain.fit(train_X, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.5s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  8.0min finished


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('neg_scaler', MinMaxScaler(),
                                                  ['neg']),
                                                 ('neu_scaler', MinMaxScaler(),
                                                  ['neu']),
                                                 ('pos_scaler', MinMaxScaler(),
                                                  ['pos']),
                                                 ('compound_scaler',
                                                  MinMaxScaler(),
                                                  ['compound']),
                                                 ('text',
                                                  Pipeline(steps=[('vect',
                                                                   CountVectorizer()),
                                                                  ('tfidf',
                                                   

In [17]:
dump(rfc_retrain, 'rfc_trained_model.joblib')

['rfc_trained_model.joblib']

In [18]:
rfc_retrain.score(train_X, train_y), rfc_retrain.score(test_X, test_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.8s finished


(0.9801508224905248, 0.7898726263968118)

### 2. Multinomial Naive Bayes Classifier 

In [19]:
gs_mnb = load('./multinomial_nb_clf.joblib')

In [20]:
gs_mnb.score(train_X, train_y), gs_mnb.score(test_X, test_y)

(0.9870765443676005, 0.8469953895444244)

In [21]:
gs_mnb.best_params_

{'clf__alpha': 0.1,
 'clf__fit_prior': False,
 'preprocessor__text__tfidf__use_idf': True,
 'preprocessor__text__vect__ngram_range': (1, 2)}

In [22]:
mnb_preprocessor = get_preprocessor(specify=True, ngram_range=(1, 2), use_idf=True)

In [24]:
mnb_classifier = MultinomialNB(alpha=0.1, fit_prior=False)

mnb_retrain = get_pipeline(mnb_preprocessor, mnb_classifier)
mnb_retrain.fit(train_X, train_y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('neg_scaler', MinMaxScaler(),
                                                  ['neg']),
                                                 ('neu_scaler', MinMaxScaler(),
                                                  ['neu']),
                                                 ('pos_scaler', MinMaxScaler(),
                                                  ['pos']),
                                                 ('compound_scaler',
                                                  MinMaxScaler(),
                                                  ['compound']),
                                                 ('text',
                                                  Pipeline(steps=[('vect',
                                                                   CountVectorizer(ngram_range=(1,
                                                                                                2))),
             

In [25]:
dump(mnb_retrain, './mnb_trained_model.joblib')

['./mnb_trained_model.joblib']

In [27]:
mnb_retrain.score(train_X, train_y), mnb_retrain.score(test_X, test_y)

(0.9870765443676005, 0.8469953895444244)

### 3. Support Vector Machines

Trained three different SVM classifiers:
1. Used SGD with hinge loss and penalty l2.
2. Used LinearSVC with penalty l2 and grid search.
3. Used LinearSVC with penalty l1 and grid search (not including hinge loss).

In [50]:
svm = load('./svm_clf.joblib')
svc_l2 = load('./svc_clf_l2.joblib')
svc_l1 = load('./svc_clf_l1.joblib')

In [51]:
print("SGD classifier:", svm.score(train_X, train_y), svm.score(test_X, test_y))
print("SVC L2:", svc_l2.score(train_X, train_y), svc_l2.score(test_X, test_y))
print("SVC L1:", svc_l1.score(train_X, train_y), svc_l1.score(test_X, test_y))

SGD classifier: 0.7255499550658383 0.7214581542549034
SVC L2: 0.9962587426249365 0.8566851605845119
SVC L1: 0.5752647208220998 0.5741189341251856


In [52]:
clf = svc_l2
clf.best_params_

{'clf__C': 1,
 'clf__loss': 'squared_hinge',
 'clf__max_iter': 10,
 'preprocessor__text__tfidf__use_idf': True,
 'preprocessor__text__vect__ngram_range': (1, 2)}

In [47]:
svc_preprocessor = get_preprocessor(specify=True, ngram_range=(1, 2), use_idf=True)

In [54]:
svc_classifier = LinearSVC(C=1, dual=False, max_iter=10, loss='squared_hinge', penalty='l2')

svc_retrain = get_pipeline(svc_preprocessor, svc_classifier)
svc_retrain.fit(train_X, train_y)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('neg_scaler', MinMaxScaler(),
                                                  ['neg']),
                                                 ('neu_scaler', MinMaxScaler(),
                                                  ['neu']),
                                                 ('pos_scaler', MinMaxScaler(),
                                                  ['pos']),
                                                 ('compound_scaler',
                                                  MinMaxScaler(),
                                                  ['compound']),
                                                 ('text',
                                                  Pipeline(steps=[('vect',
                                                                   CountVectorizer(ngram_range=(1,
                                                                                                2))),
             

In [55]:
svc_retrain.score(train_X, train_y), svc_retrain.score(test_X, test_y)

(0.9962587426249365, 0.8566851605845119)

In [58]:
svc_classifier_no_max = LinearSVC(C=1, dual=False, loss='squared_hinge', penalty='l2')

svc_retrain_no_max = get_pipeline(svc_preprocessor, svc_classifier_no_max)
svc_retrain_no_max.fit(train_X, train_y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('neg_scaler', MinMaxScaler(),
                                                  ['neg']),
                                                 ('neu_scaler', MinMaxScaler(),
                                                  ['neu']),
                                                 ('pos_scaler', MinMaxScaler(),
                                                  ['pos']),
                                                 ('compound_scaler',
                                                  MinMaxScaler(),
                                                  ['compound']),
                                                 ('text',
                                                  Pipeline(steps=[('vect',
                                                                   CountVectorizer(ngram_range=(1,
                                                                                                2))),
             

In [59]:
svc_retrain_no_max.score(train_X, train_y), svc_retrain_no_max.score(test_X, test_y)

(0.9963173524010471, 0.85695866218645)

In [60]:
dump(svc_retrain_no_max, './svc_trained_model.joblib')

['./svc_trained_model.joblib']

# Notes

RFC
- training and test accuracy continue to increase up to max depth 100
- training takes a few min for max depth 10, almost an hour for max depth 100

GaussianNB
- can't run because dense matrix takes up too much memory even with colab gpu

MultinomialNB
- best test accuracy at around 84%
- works because we normalized/scaled (doesn't like negative feature values)

SVM
- best accuracy seems to be 72%
- train and test accuracy closely mirror each other

Linear SVC
- got accuracy up to 84%

Probabilities can be calculated with just SVC but inefficient (fit is at least $N^2$ - docs say impractical for more than tens of thousands of samples - and getting probabilities uses 5-fold cross validation during fit). 

SVC 
- convergence warning at max_iter 70, kernel rbf, C = 0.5

In [29]:
range(5)

range(0, 5)

In [33]:
import numpy as np
a = np.asarray([1, 2, 3, 4, 5])

In [34]:
a[range(len(a))]

array([1, 2, 3, 4, 5])

In [35]:
a[:]

array([1, 2, 3, 4, 5])

In [36]:
dim = 5

In [37]:
a[:dim]

array([1, 2, 3, 4, 5])

In [38]:
a[dim:]

array([], dtype=int32)