In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data.csv', encoding='latin1', header=None)
y = pd.read_csv('labels.csv', encoding='latin1', header=None)

# tf-idf

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
def count_tokens_tfidf(data):
    tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

In [4]:
X, tfidf_vec = count_tokens_tfidf(np.squeeze(data.values))

In [6]:
y = np.squeeze(y.values)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, make_scorer
from sklearn import tree, svm, neighbors, model_selection
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np

In [8]:
seed = 42
np.random.seed(seed)

In [9]:
models = []

logistic_regression_clf = LogisticRegression()
logistic_regression_clf.random_state = seed
models.append(("lr", logistic_regression_clf))

decision_tree_clf = tree.DecisionTreeClassifier()
decision_tree_clf.random_state = seed
models.append(("dt", decision_tree_clf))

mnb_clf = MultinomialNB()
mnb_clf.random_state = seed
models.append(("mnb", mnb_clf))


svm_clf = svm.SVC()
svm_clf.random_state = seed
models.append(("svm", svm_clf))

random_forest_clf = RandomForestClassifier()
random_forest_clf.random_state = seed
models.append(("rf", random_forest_clf))

knn_clf = neighbors.KNeighborsClassifier()
knn_clf.random_state = seed
models.append(("knn", knn_clf))

mlp_clf = MLPClassifier()
mlp_clf.random_state = seed
models.append(("mlp", mlp_clf))


In [11]:
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, pos_label=None, average='weighted'),
           'recall': make_scorer(recall_score, pos_label=None, average='weighted'),
           'f1_score': make_scorer(f1_score, pos_label=None, average='weighted')}


In [12]:
from sklearn import cross_validation
skf = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=seed)
skf

sklearn.cross_validation.StratifiedKFold(labels=[0 0 0 ... 0 0 0], n_folds=10, shuffle=True, random_state=42)

In [23]:
for name, model in models:
    print('modelo: ', name)
    cv_results = model_selection.cross_validate(model, X, y, cv=skf, scoring='accuracy', return_train_score=True)
    #print(cv_results.keys())
    print('fit time: ', cv_results['fit_time'])
    print('train_score: ', cv_results['train_score'])
    print('train_score medio: ', np.mean(cv_results['train_score']))
    print('test_score: ', cv_results['test_score'])
    print('test_score medio: ', np.mean(cv_results['test_score']))
    print('____________')


modelo:  lr
fit time:  [1.35489655 1.38150954 1.48669386 1.35788226 1.30762792 1.32783413
 1.31951022 1.38145828 1.41913867 1.38010859]
train_score:  [0.97645267 0.97650838 0.97617412 0.97660167 0.97606314 0.97645311
 0.97660211 0.97660211 0.97651013 0.97662198]
train_score medio:  0.976458941341992
test_score:  [0.97192982 0.97159566 0.97443609 0.97092246 0.97309492 0.97058824
 0.97242186 0.97075046 0.97057348 0.97123746]
test_score medio:  0.9717550448033396
____________
modelo:  dt
fit time:  [11.7253201  11.46882272 12.66419959 12.48525453 10.94646096 11.83573103
 11.98152852 10.92736363 11.46054602 10.86098433]
train_score:  [0.99935003 0.99931289 0.99933146 0.99931291 0.99929434 0.99927577
 0.99933149 0.99929435 0.99936865 0.99923868]
train_score medio:  0.9993110572105277
test_score:  [0.97025898 0.96775272 0.96892231 0.96958556 0.97242647 0.96925134
 0.97158616 0.96974762 0.97057348 0.96989967]
test_score medio:  0.9700004297942456
____________
modelo:  mnb
fit time:  [0.031257

KeyboardInterrupt: 