In [1]:
import pandas as pd

import sklearn

# Preprocessing functions
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# Plug and play classifiers
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

# Saving the model
from joblib import dump

In [2]:
def read_data(filename):
    data = pd.read_csv(filename)
    data.loc[data.Party == "Independent", "Party"] = "Democrat"
    print(data.Party.value_counts())
    X = data[["stemmed", "neg", "neu", "pos", "compound"]]
    y = data['Party']
    return X, y

In [3]:
train_X, train_y = read_data('train_data.csv')
test_X, test_y = read_data('test_data.csv')

Democrat      59171
Republican    43201
Name: Party, dtype: int64
Democrat      14793
Republican    10801
Name: Party, dtype: int64


In [4]:
import time

start = time.time()
rfc = RandomForestClassifier()
rfc.fit(train_X.drop(columns='stemmed'), train_y)
end = time.time()
print(end - start)

start = time.time()
svm = SGDClassifier('hinge', penalty='l2', max_iter=5, tol=None)
svm.fit(train_X.drop(columns='stemmed'), train_y)
end = time.time()
print(end - start)

start = time.time()
gnb = GaussianNB()
gnb.fit(train_X.drop(columns='stemmed'), train_y)
end = time.time()
print(end - start)

10.80204176902771
0.1014258861541748
0.13484907150268555


## Preprocessor Pipeline

In [5]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
#https://webcache.googleusercontent.com/search?q=cache:jfbAd7R40V8J:https://www.tutorialguruji.com/python/transfomers-for-mixed-data-types/+&cd=7&hl=en&ct=clnk&gl=us
numeric_features = ["neg", "neu", "pos", "compound"]
numeric_transformer = MinMaxScaler()

text_transformer = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("norm", Normalizer())
])

numeric_transformers = [(x + '_scaler', numeric_transformer, [x]) for x in numeric_features]
 
all_transformers = numeric_transformers + [("text", text_transformer, "stemmed")] 
preprocessor = ColumnTransformer(transformers=all_transformers)

## Plug and Play Pipelines

In [6]:
def get_pipeline(preprocessor, classifier):
    return Pipeline([
        ('preprocessor', preprocessor),
        ('clf', classifier)
    ])

#### Random Forest Classifier

In [27]:
max_depth = 100
rfc_classifier = RandomForestClassifier(max_depth=max_depth)

rfc_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    # 'clf__n_estimators': (25, 50, 75, 100),
    # 'clf__criterion': ('gini', 'entropy'),
    # 'clf__max_depth': (10, 20, 30, None),
    # 'clf__min_samples_leaf': (1, 10, 20, 30),
    # 'clf__max_features': ('sqrt', 'log2'),
    'clf__bootstrap': (True, False),
}

rfc = get_pipeline(preprocessor, rfc_classifier)

#### SVM Classifier

In [28]:
svm_classifier = SGDClassifier(
    loss='hinge', penalty='l2',
    random_state=42, max_iter=5, tol=None
)

svm_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
    'clf__max_iter': (5, 10, 20),
    # do we want to consider early stopping and how many iterations?
}

svm = get_pipeline(preprocessor, svm_classifier)

#### Gaussian Naive Bayes Classifier

In [7]:
gnb_classifier = GaussianNB()

gnb_parameters = {
    'preprocessor__text__vect__ngram_range': [(1, 1), (1, 2)],
    'preprocessor__text__tfidf__use_idf': (True, False),
    # 'clf__priors': (),
    # 'clf__var_smoothing': ()
}

gnb = get_pipeline(preprocessor, gnb_classifier)

## Grid Search CV

In [8]:
def grid_search_clf(class_pipe, parameters, X, y):
    gs_clf = GridSearchCV(class_pipe, parameters, cv=10, n_jobs=-1)
    gs_clf = gs_clf.fit(X, y)
    '''
    print(gs_clf.score(X_test, y_test))
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    '''
    return gs_clf

In [10]:
start = time.time()
gs_gnb = grid_search_clf(gnb, gnb_parameters, train_X, train_y)
end = time.time()
print(end - start)



TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
dump(gs_gnb, './gaussian_nb_clf.joblib')

In [25]:
start = time.time()
gs_svm = grid_search_clf(svm, svm_parameters, train_X, train_y)
end = time.time()
print(end - start)

566.1158452033997


In [26]:
dump(gs_svm, './svm_clf.joblib')

['./svm_clf.joblib']

In [28]:
start = time.time()
gs_rfc = grid_search_clf(rfc, rfc_parameters, train_X, train_y)
end = time.time()
print(end - start)

3920.5807592868805


In [29]:
dump(gs_rfc, './random_forest_clf_maxdepth_maxdepth_{:d}.joblib'.format(max_depth))

['./random_forest_clf_maxdepth_maxdepth_100.joblib']