In [1]:
import nltk
import numpy
import pandas
import sklearn
import sklearn.pipeline
import sklearn.grid_search
import sklearn.decomposition

# all 3 lines below should be uncommented
import matplotlib.pyplot
%matplotlib inline
%matplotlib notebook

import sqlite3

import csshelper
import cssfeature
import csspipe
import csstransformer


db_path = './data.sqlite3'
sqlite_connection = sqlite3.connect(db_path)



In [2]:
data_reader_collection = [
        ('cExt', csshelper.CExtReader(sqlite_connection))
    
        , ('cNeu', csshelper.CNeuReader(sqlite_connection))
     
        , ('cAgr', csshelper.CAgrReader(sqlite_connection))
    
        , ('cCon', csshelper.CConReader(sqlite_connection))
    
        , ('cOpn', csshelper.COpnReader(sqlite_connection)) 
]

In [3]:
aggregator = csstransformer.Aggregator([
        csstransformer.PartOfSpeech(),
        csstransformer.SentenceLength(),
        csstransformer.NumberOfWords(),
        csstransformer.NumberOfCommas(),
        csstransformer.NumberOfDots(),
        csstransformer.NumberOfSemicolons(),
        csstransformer.NumberOfColons(),
        csstransformer.LexicalDiversity(),
        csstransformer.AverageWordLength(),
        csstransformer.NumberOfFunctionalWords(),
        csstransformer.NumberOfPronouns(),
        csstransformer.NumberOfPropnames(),
])

data = csshelper.CExtReader(sqlite_connection).get_results()

# aggregator.transform(data.sentence)


In [40]:
grid_parameter_collection = {
#     'classifierLinearSVC': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
#                            'features__derived_numeric__best': (5, 7,9, 10), 
#                            'random_state': 5152},
#     'classifierSVC': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
#                      'features__derived_numeric__best': (5,7,9,10), 
#                       'random_state': 5152},
     'classifierNB': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                     'features__derived_numeric__best': (5, 7, 9, 10)},
#     'classifierBNB': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
#                      'features__derived_numeric__best': (5,7, 9, 10)},
}

base_pipeline = sklearn.pipeline.Pipeline([     
    ('features', sklearn.pipeline.FeatureUnion(
          transformer_list=[
           ('status', sklearn.pipeline.Pipeline([
              ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ])),
                  
            ('derived_string', sklearn.pipeline.Pipeline([
                ('part_of_speech', csstransformer.PartOfSpeech()),
                ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ])),
                 
                    
            ('derived_string (smileys)', sklearn.pipeline.Pipeline([
                ('smileys', csstransformer.Smileys()),
                ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer(vocabulary=csstransformer.Smileys.smileys, 
                                                                                stop_words=None)),
            ])),

            
            ('derived_numeric', sklearn.pipeline.Pipeline([
              ('numeric_aggregator', csstransformer.Aggregator([
                    csstransformer.SentenceLength(),
                    csstransformer.NumberOfWords(),
                    csstransformer.NumberOfCommas(),
                    csstransformer.NumberOfDots(),
                    csstransformer.NumberOfSemicolons(),
                    csstransformer.NumberOfColons(),
                    csstransformer.LexicalDiversity(),
                    csstransformer.AverageWordLength(),
                    csstransformer.NumberOfFunctionalWords(),
                    csstransformer.NumberOfPronouns(),
                    csstransformer.NumberOfPropnames(),
                ])), 
              ('scaler', sklearn.preprocessing.MinMaxScaler()),
              ('best', sklearn.decomposition.NMF()),
            ])),   
          ],
    )),
])

classifier_collection = [
        ('classifierLinearSVC', ('clf', sklearn.svm.LinearSVC(random_state = 5152))),
    
#         ('classifierSVC2Variante', ('clf', sklearn.svm.SVC(cache_size=4096, random_state = 5152, 
#                                                   kernel = "linear", decision_function_shape = "ovr", 
#                                                 probability=True))),
    
        ('classifierSVC', ('clf', sklearn.svm.SVC(cache_size=4096, random_state = 5152, 
                                                  kernel = "rbf", decision_function_shape = "ovr", 
                                                probability=True))),
        ('classifierNB', ('nb', sklearn.naive_bayes.MultinomialNB())),
        ('classifierBNB', ('bnb', sklearn.naive_bayes.BernoulliNB()))
]

def create_full_pipelines():
    return [
        (class_name, sklearn.pipeline.Pipeline(base_pipeline.steps + [classifier]))
        for class_name, classifier in classifier_collection
    ]

pipeline_collection = create_full_pipelines()


In [5]:
for pipe_name, pipe in pipeline_collection:
    print()
    print(pipe_name)
#     print(pipe.steps)


classifierLinearSVC

classifierSVC

classifierNB

classifierBNB


In [41]:
def plot_roc_curve(test_class, y_score, name, trait):
    fpr, tpr, roc_auc = dict(), dict(), dict()
    fpr, tpr, _ = sklearn.metrics.roc_curve(test_class, y_score[:, 1], pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)

    matplotlib.pyplot.figure()
    matplotlib.pyplot.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    matplotlib.pyplot.plot([0, 1], [0, 1], 'k--')
    matplotlib.pyplot.xlim([0.0, 1.0])
    matplotlib.pyplot.ylim([0.0, 1.05])
    matplotlib.pyplot.xlabel('False Positive Rate')
    matplotlib.pyplot.ylabel('True Positive Rate')
    matplotlib.pyplot.title('ROC chart with ' + name + " algorithm and on " + trait + " dataset")
    matplotlib.pyplot.legend(loc="lower right")

    matplotlib.pyplot.show()

In [37]:
def apply_some_grid_search(pipe_name, pipeline, reader_description, reader):
    print("\nFOR: Pipeline: %s/%s" % (reader_description, pipe_name))

    data = reader.get_results()
    feature = data.iloc[:,0]
    label = data.iloc[:,1]

    grid_parameter = grid_parameter_collection[pipe_name]

    split = sklearn.cross_validation.train_test_split(feature, label, train_size=0.66, 
    stratify=label,random_state=5152)

    x_train, x_test, y_train, y_test = split

    grid_search = sklearn.grid_search.GridSearchCV(pipeline, param_grid=grid_parameter, cv=2, n_jobs=-1, verbose=0)

    grid_search.fit(x_train, y_train)
    y_pred_trait = grid_search.predict(x_test)


    if 'predict_proba' in dir(pipeline.steps[-1][-1]):
        print("it is in predict proba")
        y_score_trait = grid_search.predict_proba(x_test)
        plot_roc_curve(y_test, y_score_trait, "Pipeline", reader_description)

#         print(sklearn.metrics.classification_report(y_test, y_pred_trait, labels=[0, 1], target_names=["0", "1"]))
#         print(sklearn.metrics.confusion_matrix(y_test, y_pred_trait, labels=[0, 1]))
    print("F1: ", sklearn.metrics.f1_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
    print("Precision: ", sklearn.metrics.average_precision_score(y_test, y_pred_trait, average='micro'))
    print("Recall: ", sklearn.metrics.recall_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
    print("Accuracy score: ", sklearn.metrics.accuracy_score(y_test, y_pred_trait))

    print()

    print("GRID SEARCH:")
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(grid_parameter.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [38]:
#MNB
pipe_name , pipeline = pipeline_collection[2]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)


FOR: Pipeline: cExt/classifierNB
it is in predict proba


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

F1:  0.151851851852
Precision:  0.562323942536
Recall:  0.0859538784067
Accuracy score:  0.592526690391

GRID SEARCH:
Best score: 0.583
Best parameters set:
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cNeu/classifierNB
it is in predict proba


<IPython.core.display.Javascript object>

F1:  0.0458365164248
Precision:  0.528177883388
Recall:  0.0237341772152
Accuracy score:  0.629596678529

GRID SEARCH:
Best score: 0.626
Best parameters set:
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cAgr/classifierNB
it is in predict proba


<IPython.core.display.Javascript object>

F1:  0.694474306601
Precision:  0.759305367294
Recall:  0.901730876605
Accuracy score:  0.578588374852

GRID SEARCH:
Best score: 0.566
Best parameters set:
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierNB
it is in predict proba


<IPython.core.display.Javascript object>

F1:  0.332397003745
Precision:  0.594021329965
Recall:  0.229180116204
Accuracy score:  0.577105575326

GRID SEARCH:
Best score: 0.567
Best parameters set:
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cOpn/classifierNB
it is in predict proba


<IPython.core.display.Javascript object>

F1:  0.852961198094
Precision:  0.871810089021
Recall:  1.0
Accuracy score:  0.743772241993

GRID SEARCH:
Best score: 0.743
Best parameters set:
	features__status__tf_idf_vect__ngram_range: (1, 1)


In [7]:
#LinearSVC
pipe_name , pipeline = pipeline_collection[0]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)


FOR: Pipeline: cExt/classifierLinearSVC
F1:  0.486857142857
Precision:  0.60829610237
Recall:  0.446540880503
Accuracy score:  0.600533807829

GRID SEARCH:
Best score: 0.596
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)

FOR: Pipeline: cNeu/classifierLinearSVC
F1:  0.381322957198
Precision:  0.531838157238
Recall:  0.310126582278
Accuracy score:  0.622775800712

GRID SEARCH:
Best score: 0.630
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)

FOR: Pipeline: cAgr/classifierLinearSVC
F1:  0.616858237548
Precision:  0.715555723347
Recall:  0.629257398102
Accuracy score:  0.584816132859

GRID SEARCH:
Best score: 0.575
Best parameters set:
	features__derived_numeric__best: Truncate

In [8]:
#SVC
pipe_name , pipeline = pipeline_collection[1]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)


FOR: Pipeline: cExt/classifierSVC


NameError: name 'plot_roc_curve' is not defined

In [8]:
#BNB
pipe_name , pipeline = pipeline_collection[3]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)


FOR: Pipeline: cExt/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.390697674419
Precision:  0.588733878808
Recall:  0.293501048218
Accuracy score:  0.611506524318

GRID SEARCH:
Best score: 0.605
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cNeu/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.245859872611
Precision:  0.550512243015
Recall:  0.152689873418
Accuracy score:  0.648873072361

GRID SEARCH:
Best score: 0.641
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cAgr/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.664707323047
Precision:  0.739221805451
Recall:  0.757677275265
Accuracy score:  0.594009489917

GRID SEARCH:
Best score: 0.579
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.339568345324
Precision:  0.621685693101
Recall:  0.228534538412
Accuracy score:  0.591637010676

GRID SEARCH:
Best score: 0.592
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)

FOR: Pipeline: cOpn/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.854334854335
Precision:  0.877576297999
Recall:  0.971268954509
Accuracy score:  0.753855278766

GRID SEARCH:
Best score: 0.745
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)


In [11]:
help(csstransformer)

Help on package csstransformer:

NAME
    csstransformer - # helpers

PACKAGE CONTENTS
    aggregator
    averagewordlength
    basetransformer
    functionalwordsidentifier
    lexicaldiversity
    nouns
    numberofcolons
    numberofcommas
    numberofdots
    numberoffunctionalwords
    numberofpronouns
    numberofpropnames
    numberofsemicolons
    numberofwords
    partofspeech
    sentencelength
    smiley
    stemmedwords
    stemmer
    tagging
    tokenizer

FILE
    /home/dust/workspace/CaseSolvingSeminar/src/csstransformer/__init__.py


