In [1]:
import pandas as pd
import numpy as np
import sklearn

from chatnet.general_classifier_model import ClassifierPipeline

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoLars, BayesianRidge, LogisticRegression

In [11]:
# Sets up the sample data frame
def str_to_list(s):
    return s[1:-1].split(", u")
df = pd.read_csv("chatnet/5-25msg_score.tsv", sep="\t")
df["msgs"] = df["msgs"].apply(str_to_list)

# Sample feature data of 4-tuple of standard normals
features = zip(np.random.standard_normal(df.shape[0]), np.random.standard_normal(df.shape[0]), 
               np.random.standard_normal(df.shape[0]), np.random.standard_normal(df.shape[0]))

# If you don't want features, have empty arrays
# features = [[] for _ in range(df.shape[0])]

df["features"] = pd.Series(features)
df_sample = df.sample(100)

In [14]:
# Defaults to SVC
cl_pipe = ClassifierPipeline(positive_class="satisfaction")
cl_pipe.setup(df_sample)
cl_pipe.run()
print "default SVC:", cl_pipe.cl.test_score

default SVC: 0.866666666667


In [15]:
# Running with arguments

# Single list of tuple of classifier, arguments
cl_pipe = ClassifierPipeline(positive_class="satisfaction")
cl_pipe.setup(df_sample)
cl_pipe.run([(AdaBoostClassifier, {"n_estimators": 100})])
print "AdaBoost:", cl_pipe.cl.test_score

# Classifier, arguments
cl_pipe.run(RandomForestClassifier, class_weight = {0: 2, 1: 1})
print "Random Forest:", cl_pipe.cl.test_score

AdaBoost: 0.466666666667
Random Forest: 0.766666666667


In [5]:
# Lists of classifiers and regressors to test
classifiers = [KNeighborsClassifier, [(SVC, {"probability": True})], DecisionTreeClassifier, RandomForestClassifier, AdaBoostClassifier, GaussianNB, LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis]
regressors = [KNeighborsRegressor, SVR, DecisionTreeRegressor, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, LinearRegression, Ridge, BayesianRidge, LogisticRegression]
classifier_sets = [[(SVC, {"probability": True, "cache_size": 3000}), (GaussianNB, )], [(SVC, {"probability": True, "cache_size": 3000}), (GaussianNB,), (DecisionTreeClassifier, {"class_weight": {0: 2, 1: 1}})]]

In [6]:
print "\nTESTING SINGLE CLASSIFIERS\n"
for classifier in classifiers:
    cl_pipe = ClassifierPipeline(positive_class="satisfaction")
    cl_pipe.setup(df_sample)
    cl_pipe.run(classifier)
    print "single classifier:", classifier, cl_pipe.cl.test_score

print "\nTESTING ENSEMBLE VOTING CLASSIFIERS"
for cl_set in classifier_sets:

    # Ensemble voting with soft voting
    print "\nSOFT VOTING"
    cl_set_soft_pipe = ClassifierPipeline(positive_class="satisfaction")
    cl_set_soft_pipe.setup(df_sample)
    cl_set_soft_pipe.run(cl_set, voting="soft")
    print "soft set:", cl_set, cl_set_soft_pipe.cl.test_score

    # Ensemble voting with hard voting
    print "\nHARD VOTING"
    cl_set_hard_pipe = ClassifierPipeline(positive_class="satisfaction")
    cl_set_hard_pipe.setup(df_sample)
    cl_set_hard_pipe.run(cl_set)
    print "hard set:", cl_set, cl_set_hard_pipe.cl.test_score

print "\nTESTING REGRESSORS\n"
for regressor in regressors:
    reg_pipe = ClassifierPipeline(positive_class="scores")
    reg_pipe.setup(df_sample)
    reg_pipe.run(regressor)
    print "regression:", regressor, reg_pipe.cl.test_score


TESTING SINGLE CLASSIFIERS

single classifier: <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 0.133333333333
single classifier: [(<class 'sklearn.svm.classes.SVC'>, {'probability': True})] 0.966666666667
single classifier: <class 'sklearn.tree.tree.DecisionTreeClassifier'> 0.1
single classifier: <class 'sklearn.ensemble.forest.RandomForestClassifier'> 0.666666666667
single classifier: <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> 0.533333333333
single classifier: <class 'sklearn.naive_bayes.GaussianNB'> 0.0333333333333
single classifier: <class 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis'> 0.566666666667
single classifier: <class 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis'> 0.166666666667

TESTING ENSEMBLE VOTING CLASSIFIERS

SOFT VOTING
soft set: [(<class 'sklearn.svm.classes.SVC'>, {'cache_size': 3000, 'probability': True}), (<class 'sklearn.naive_bayes.GaussianNB'>,)] 0.0333333333333

HARD VOTING
hard set: [(<class 's