In [78]:
import nltk
import numpy
import pandas
import sklearn
import sklearn.pipeline
import sklearn.grid_search

# import matplotlib.pyplot as plt
# %matplotlib inline
# %matplotlib notebook

import sqlite3

import csshelper
import cssfeature
import csspipe
import csstransformer


db_path = './data.sqlite3'
sqlite_connection = sqlite3.connect(db_path)

In [79]:
data_reader_collection = [
        ('cExt', csshelper.CExtReader(sqlite_connection))
    
        , ('cNeu', csshelper.CNeuReader(sqlite_connection))
     
        , ('cAgr', csshelper.CAgrReader(sqlite_connection))
    
        , ('cCon', csshelper.CConReader(sqlite_connection))
    
        , ('cOpn', csshelper.COpnReader(sqlite_connection)) 
]

In [80]:
aggregator = csstransformer.Aggregator([
        csstransformer.PartOfSpeech(),
        csstransformer.SentenceLength(),
        csstransformer.NumberOfWords(),
        csstransformer.NumberOfCommas(),
        csstransformer.NumberOfDots(),
        csstransformer.NumberOfSemicolons(),
        csstransformer.NumberOfColons(),
        csstransformer.LexicalDiversity(),
        csstransformer.AverageWordLength(),
        csstransformer.NumberOfFunctionalWords(),
        csstransformer.NumberOfPronouns(),
        csstransformer.NumberOfPropnames(),
])

data = csshelper.CExtReader(sqlite_connection).get_results()

# aggregator.transform(data.sentence)


In [81]:
pipeline_collection = [
    ('classifierLinearSVC', sklearn.pipeline.Pipeline([
        ('vectorizer_tfidf', sklearn.feature_extraction.text.TfidfVectorizer()),
        ('clf', sklearn.svm.LinearSVC(random_state = 5152))]) # OneVsRestClassifier already implemented, docs: http://scikit-learn.org/stable/modules/svm.html#multi-class-classification
    )
    , ('classifierSVC', sklearn.pipeline.Pipeline([
        ('vectorizer_tfidf', sklearn.feature_extraction.text.TfidfVectorizer()),
        ('clf', sklearn.svm.SVC(cache_size=4096, random_state = 5152, kernel = "rbf", decision_function_shape = "ovr"))]) # "one-against-one", docs: http://scikit-learn.org/stable/modules/svm.html#multi-class-classification
    )
    # 2 Naive Bayes
    , ('classifierNB', sklearn.pipeline.Pipeline([
        ('vectorizer_tfidf', sklearn.feature_extraction.text.TfidfVectorizer()),
        ('nb', sklearn.naive_bayes.MultinomialNB())])
    )

    , ('classifierBNB', sklearn.pipeline.Pipeline([    
        ('vectorizer_tfidf', sklearn.feature_extraction.text.TfidfVectorizer()),
        ('bnb', sklearn.naive_bayes.BernoulliNB())])
    )
]

grid_parameter_collection = {
    'classifierLinearSVC': { 'vectorizer_tfidf__ngram_range': ((1, 1), (1, 2))},
    'classifierSVC': { 'vectorizer_tfidf__ngram_range': ((1, 1), (1, 2))},
    'classifierNB': { 'vectorizer_tfidf__ngram_range': ((1, 1), (1, 2))},
    'classifierBNB': { 'vectorizer_tfidf__ngram_range': ((1, 1), (1, 2))},
}

In [82]:
def plot_roc_curve(test_class, y_score, name, trait):
    # Compute ROC curve and ROC area for each class
    fpr, tpr, roc_auc = dict(), dict(), dict()
    fpr, tpr, _ = sklearn.metrics.roc_curve(test_class, y_score[:, 1], pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC chart with ' + name + " algorithm and on " + trait + " dataset")
    plt.legend(loc="lower right")
    plt.show()

In [83]:
for pipe_name, pipeline in pipeline_collection:
    for reader_description, reader in data_reader_collection:

        data = reader.get_results()
        feature = data.iloc[:,0]
        label = data.iloc[:,1]
        
        grid_parameter = grid_parameter_collection[pipe_name]
        
        split = sklearn.cross_validation.train_test_split(feature, label, train_size=0.66, 
        stratify=label,random_state=5152)
        
        x_train, x_test, y_train, y_test = split
        
        grid_search = sklearn.grid_search.GridSearchCV(pipeline, param_grid=grid_parameter, cv=5, n_jobs=-1, verbose=0)

        grid_search.fit(x_train, y_train)
        y_pred_trait = grid_search.predict(x_test)

#         y_score_trait = pipeline.predict_proba(x_test)
        #plot_roc_curve(y_test, y_score_trait, "Pipeline", "trait_input")

        print("\nFOR: Pipeline: %s/%s" % (reader_description, pipe_name))
#         print(sklearn.metrics.classification_report(y_test, y_pred_trait, labels=[0, 1], target_names=["0", "1"]))
#         print(sklearn.metrics.confusion_matrix(y_test, y_pred_trait, labels=[0, 1]))
        print("F1: ", sklearn.metrics.f1_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
        print("Precision: ", sklearn.metrics.average_precision_score(y_test, y_pred_trait, average='micro'))
        print("Recall: ", sklearn.metrics.recall_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
        print("Accuracy score: ", sklearn.metrics.accuracy_score(y_test, y_pred_trait))
        
        print()
        
        print("GRID SEARCH:")
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(grid_parameter.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))


FOR: Pipeline: cExt/classifierLinearSVC
F1:  0.480608365019
Precision:  0.602853241072
Recall:  0.441649196366
Accuracy score:  0.594899169632

GRID SEARCH:
Best score: 0.599
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 2)

FOR: Pipeline: cNeu/classifierLinearSVC
F1:  0.386200287494
Precision:  0.531919562086
Recall:  0.318829113924
Accuracy score:  0.620106761566

GRID SEARCH:
Best score: 0.628
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 2)

FOR: Pipeline: cAgr/classifierLinearSVC
F1:  0.61811238407
Precision:  0.716005283038
Recall:  0.632607481854
Accuracy score:  0.584816132859

GRID SEARCH:
Best score: 0.576
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierLinearSVC
F1:  0.544406779661
Precision:  0.656397341702
Recall:  0.518398967076
Accuracy score:  0.601423487544

GRID SEARCH:
Best score: 0.588
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 2)

FOR: Pipeline: cOpn/classifierLinearSVC
F1:  0.838240639

  'precision', 'predicted', average, warn_for)



FOR: Pipeline: cNeu/classifierSVC
F1:  0.0
Precision:  0.687425860024
Recall:  0.0
Accuracy score:  0.625148279953

GRID SEARCH:
Best score: 0.625
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)


  'precision', 'predicted', average, warn_for)



FOR: Pipeline: cAgr/classifierSVC
F1:  0.693782684486
Precision:  0.765569395018
Recall:  1.0
Accuracy score:  0.531138790036

GRID SEARCH:
Best score: 0.531
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierSVC
F1:  0.0
Precision:  0.729685646501
Recall:  0.0
Accuracy score:  0.540628706999

GRID SEARCH:
Best score: 0.541
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)


  'precision', 'predicted', average, warn_for)



FOR: Pipeline: cOpn/classifierSVC
F1:  0.852670976523
Precision:  0.871589561091
Recall:  1.0
Accuracy score:  0.743179122183

GRID SEARCH:
Best score: 0.743
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)

FOR: Pipeline: cExt/classifierNB
F1:  0.271685761047
Precision:  0.571970492888
Recall:  0.174004192872
Accuracy score:  0.60409252669

GRID SEARCH:
Best score: 0.606
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)

FOR: Pipeline: cNeu/classifierNB
F1:  0.0946045824095
Precision:  0.562802960554
Recall:  0.0506329113924
Accuracy score:  0.636714116251

GRID SEARCH:
Best score: 0.635
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)

FOR: Pipeline: cAgr/classifierNB
F1:  0.674471992654
Precision:  0.744207006535
Recall:  0.820212171971
Accuracy score:  0.579478054567

GRID SEARCH:
Best score: 0.583
Best parameters set:
	vectorizer_tfidf__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierNB
F1:  0.430369127517
Precision:  0.626394363265
Recall:  0.3

In [None]:
split = sk.cross_validation.train_test_split(neu, neu["cNEU"], train_size = 0.66, stratify = neu["cNEU"],random_state= 5152)
train_feat_neu, test_feat_neu, train_class_neu, test_class_neu  = split
print(train_feat_neu.shape, test_feat_neu.shape,train_class_neu.shape ,test_class_neu.shape)

neu_X = train_feat_neu.append(test_feat_neu)
neu_Y = train_class_neu.append(test_class_neu)

train_feat_ext, test_feat_ext, train_class_ext, test_class_ext = sk.cross_validation.train_test_split(ext, ext["cEXT"], train_size = 0.66, stratify = ext["cEXT"],random_state= 5152)

ext_X = train_feat_ext.append(test_feat_ext)
ext_Y = train_class_ext.append(test_class_ext)

train_feat_agr, test_feat_agr, train_class_agr, test_class_agr = sk.cross_validation.train_test_split(agr, agr["cAGR"], train_size = 0.66, stratify = agr["cAGR"],random_state= 5152)

agr_X = train_feat_agr.append(test_feat_agr)
agr_Y = train_class_agr.append(test_class_agr)

train_feat_con, test_feat_con, train_class_con, test_class_con = sk.cross_validation.train_test_split(con, con["cCON"], train_size = 0.66, stratify = con["cCON"],random_state= 5152)

con_X = train_feat_con.append(test_feat_con)
con_Y = train_class_con.append(test_class_con)

train_feat_opn, test_feat_opn, train_class_opn, test_class_opn = sk.cross_validation.train_test_split(opn, opn["cOPN"], train_size = 0.66, stratify = opn["cOPN"],random_state= 5152)

opn_X = train_feat_opn.append(test_feat_opn)
opn_Y = train_class_opn.append(test_class_opn)

In [5]:
help(csstransformer)

Help on package csstransformer:

NAME
    csstransformer - # helpers

PACKAGE CONTENTS
    aggregator
    basetransformer
    lexicaldiversity
    nountransformer
    numberofdots
    partofspeech
    sentencelength
    smiley
    stemmedwords
    stemmer
    tagging
    tokenizer

FILE
    /home/dust/workspace/CaseSolvingSeminar/src/csstransformer/__init__.py


