In [1]:
import nltk
import numpy
import pandas
import sklearn
import sklearn.pipeline
import sklearn.grid_search
import sklearn.decomposition

# import matplotlib.pyplot as plt
# %matplotlib inline
# %matplotlib notebook

import sqlite3

import csshelper
import cssfeature
import csspipe
import csstransformer


db_path = './data.sqlite3'
sqlite_connection = sqlite3.connect(db_path)

In [2]:
data_reader_collection = [
        ('cExt', csshelper.CExtReader(sqlite_connection))
    
        , ('cNeu', csshelper.CNeuReader(sqlite_connection))
     
        , ('cAgr', csshelper.CAgrReader(sqlite_connection))
    
        , ('cCon', csshelper.CConReader(sqlite_connection))
    
        , ('cOpn', csshelper.COpnReader(sqlite_connection)) 
]

In [3]:
aggregator = csstransformer.Aggregator([
        csstransformer.PartOfSpeech(),
        csstransformer.SentenceLength(),
        csstransformer.NumberOfWords(),
        csstransformer.NumberOfCommas(),
        csstransformer.NumberOfDots(),
        csstransformer.NumberOfSemicolons(),
        csstransformer.NumberOfColons(),
        csstransformer.LexicalDiversity(),
        csstransformer.AverageWordLength(),
        csstransformer.NumberOfFunctionalWords(),
        csstransformer.NumberOfPronouns(),
        csstransformer.NumberOfPropnames(),
])

data = csshelper.CExtReader(sqlite_connection).get_results()

# aggregator.transform(data.sentence)


In [4]:
grid_parameter_collection = {
    'classifierLinearSVC': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                           'features__derived_numeric__best': (5,9)},
    'classifierSVC': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                     'features__derived_numeric__best': (5,9)},
    'classifierNB': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                    'features__derived_numeric__best': (5,9)},
    'classifierBNB': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                     'features__derived_numeric__best': (5,9)},
}
foo =             ('derived_string', sklearn.pipeline.Pipeline([
                ('string_aggregator', csstransformer.Aggregator([
                csstransformer.PartOfSpeech()
                ])),
              ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ]))

base_pipeline = sklearn.pipeline.Pipeline([     
    ('features', sklearn.pipeline.FeatureUnion(
          transformer_list=[
            ('status', sklearn.pipeline.Pipeline([
              ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ])),
                  
            ('derived_string', sklearn.pipeline.Pipeline([
                ('part_of_speech', csstransformer.PartOfSpeech()),
                ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ])),

            
            ('derived_numeric', sklearn.pipeline.Pipeline([
              ('numeric_aggregator', csstransformer.Aggregator([
                    csstransformer.SentenceLength(),
                    csstransformer.NumberOfWords(),
                    csstransformer.NumberOfCommas(),
                    csstransformer.NumberOfDots(),
                    csstransformer.NumberOfSemicolons(),
                    csstransformer.NumberOfColons(),
                    csstransformer.LexicalDiversity(),
                    csstransformer.AverageWordLength(),
                    csstransformer.NumberOfFunctionalWords(),
                    csstransformer.NumberOfPronouns(),
                    csstransformer.NumberOfPropnames(),
                ])), 
              ('scaler', sklearn.preprocessing.RobustScaler()),
              ('best', sklearn.decomposition.TruncatedSVD(random_state=5152)),
            ])),   
          ],
    )),
])

classifier_collection = [
        ('classifierLinearSVC', ('clf', sklearn.svm.LinearSVC(random_state = 5152))),
        ('classifierSVC', ('clf', sklearn.svm.SVC(cache_size=4096, random_state = 5152, kernel = "rbf", decision_function_shape = "ovr"))),
        ('classifierNB', ('nb', sklearn.naive_bayes.MultinomialNB())),
        ('classifierBNB', ('bnb', sklearn.naive_bayes.BernoulliNB()))
]

def create_full_pipelines():
    return [
        (class_name, sklearn.pipeline.Pipeline(base_pipeline.steps + [classifier]))
        for class_name, classifier in classifier_collection
    ]

pipeline_collection = create_full_pipelines()


In [5]:
for pipe_name, pipe in pipeline_collection:
    print()
    print(pipe_name)
    #print(pipe.steps)


classifierLinearSVC

classifierSVC

classifierNB

classifierBNB


In [6]:
def plot_roc_curve(test_class, y_score, name, trait):
    fpr, tpr, roc_auc = dict(), dict(), dict()
    fpr, tpr, _ = sklearn.metrics.roc_curve(test_class, y_score[:, 1], pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC chart with ' + name + " algorithm and on " + trait + " dataset")
    plt.legend(loc="lower right")
    plt.show()

In [7]:
for pipe_name, pipeline in pipeline_collection:
    for reader_description, reader in data_reader_collection:

        data = reader.get_results()
        feature = data.iloc[:,0]
        label = data.iloc[:,1]
        
        grid_parameter = grid_parameter_collection[pipe_name]
        
        split = sklearn.cross_validation.train_test_split(feature, label, train_size=0.66, 
        stratify=label,random_state=5152)
        
        x_train, x_test, y_train, y_test = split
        
        grid_search = sklearn.grid_search.GridSearchCV(pipeline, param_grid=grid_parameter, cv=5, n_jobs=-1, verbose=0)

        grid_search.fit(x_train, y_train)
        y_pred_trait = grid_search.predict(x_test)
        
#         if (type(MultinomialNB()) == type(clf) or 
#         type(BernoulliNB()) == type(clf) or 
#         type(KNeighborsClassifier()) == type(clf) or 
#         type(RandomForestClassifier()) == type(clf) or 
#         type(AdaBoostClassifier()) == type(clf) or 
#         type(SVC()) == type(clf)):
#             y_score_trait = pipeline.predict_proba(x_test)
            #plot_roc_curve(y_test, y_score_trait, "Pipeline", "trait_input")

        print("\nFOR: Pipeline: %s/%s" % (reader_description, pipe_name))
#         print(sklearn.metrics.classification_report(y_test, y_pred_trait, labels=[0, 1], target_names=["0", "1"]))
#         print(sklearn.metrics.confusion_matrix(y_test, y_pred_trait, labels=[0, 1]))
        print("F1: ", sklearn.metrics.f1_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
        print("Precision: ", sklearn.metrics.average_precision_score(y_test, y_pred_trait, average='micro'))
        print("Recall: ", sklearn.metrics.recall_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
        print("Accuracy score: ", sklearn.metrics.accuracy_score(y_test, y_pred_trait))
        
        print()
        
        print("GRID SEARCH:")
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(grid_parameter.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))


FOR: Pipeline: cExt/classifierLinearSVC
F1:  0.48743335872
Precision:  0.608691715507
Recall:  0.447239692523
Accuracy score:  0.600830367734

GRID SEARCH:
Best score: 0.596
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)

FOR: Pipeline: cNeu/classifierLinearSVC
F1:  0.382481751825
Precision:  0.533030421302
Recall:  0.310917721519
Accuracy score:  0.623665480427

GRID SEARCH:
Best score: 0.628
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)


Process ForkPoolWorker-9:
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.4/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/pool.py", line 360, in get
    return recv()
  File "/usr/lib/python3.4/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.4/multiprocessing/connection.py", line 416, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.4/multiprocessing/connection.py", line 383, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
split = sk.cross_validation.train_test_split(neu, neu["cNEU"], train_size = 0.66, stratify = neu["cNEU"],random_state= 5152)
train_feat_neu, test_feat_neu, train_class_neu, test_class_neu  = split
print(train_feat_neu.shape, test_feat_neu.shape,train_class_neu.shape ,test_class_neu.shape)

neu_X = train_feat_neu.append(test_feat_neu)
neu_Y = train_class_neu.append(test_class_neu)

train_feat_ext, test_feat_ext, train_class_ext, test_class_ext = sk.cross_validation.train_test_split(ext, ext["cEXT"], train_size = 0.66, stratify = ext["cEXT"],random_state= 5152)

ext_X = train_feat_ext.append(test_feat_ext)
ext_Y = train_class_ext.append(test_class_ext)

train_feat_agr, test_feat_agr, train_class_agr, test_class_agr = sk.cross_validation.train_test_split(agr, agr["cAGR"], train_size = 0.66, stratify = agr["cAGR"],random_state= 5152)

agr_X = train_feat_agr.append(test_feat_agr)
agr_Y = train_class_agr.append(test_class_agr)

train_feat_con, test_feat_con, train_class_con, test_class_con = sk.cross_validation.train_test_split(con, con["cCON"], train_size = 0.66, stratify = con["cCON"],random_state= 5152)

con_X = train_feat_con.append(test_feat_con)
con_Y = train_class_con.append(test_class_con)

train_feat_opn, test_feat_opn, train_class_opn, test_class_opn = sk.cross_validation.train_test_split(opn, opn["cOPN"], train_size = 0.66, stratify = opn["cOPN"],random_state= 5152)

opn_X = train_feat_opn.append(test_feat_opn)
opn_Y = train_class_opn.append(test_class_opn)

In [5]:
help(csstransformer)

Help on package csstransformer:

NAME
    csstransformer - # helpers

PACKAGE CONTENTS
    aggregator
    basetransformer
    lexicaldiversity
    nountransformer
    numberofdots
    partofspeech
    sentencelength
    smiley
    stemmedwords
    stemmer
    tagging
    tokenizer

FILE
    /home/dust/workspace/CaseSolvingSeminar/src/csstransformer/__init__.py


