In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
import json
import numpy as np
import sklearn.metrics as metrics
import pandas as pd
from data_prep import *
import cPickle

In [2]:
coco_text = read_coco()
rom_text = read_rom()
X_text, y = build_text_dataset(coco_text, rom_text)
Xt_train, Xt_test, y_train, y_test, idx_train, idx_test = get_train_test(X_text, y)



In [3]:
ext = FeatureExtractor(count=False, hashing=True)
ext.fit(Xt_train)

In [6]:
X_train = ext.transform(Xt_train)
X_test = ext.transform(Xt_test)

In [4]:
def run_clf(clf, X_test):
    y_test_pred = clf.predict(X_test)
    y_test_prob = clf.predict_proba(X_test)[:, 1]
    return y_test_pred, y_test_prob

def eval_clf(y_test, y_test_pred, y_test_prob):
    print "Accuracy:", metrics.accuracy_score(y_test, y_test_pred)
    print "AUC:", metrics.roc_auc_score(y_test, y_test_prob)
    print "Precision:", metrics.precision_score(y_test, y_test_pred)
    print "Recall:", metrics.recall_score(y_test, y_test_pred)

In [6]:
nb = BernoulliNB()
nb.fit(X_train, y_train)
y_pred, y_prob = run_clf(nb, X_test)
eval_clf(y_test, y_pred, y_prob)

Accuracy: 0.980248380891
AUC: 0.998209245341
Precision: 0.989751405897
Recall: 0.976223389549


In [7]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV()
lr.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [8]:
y_pred, y_prob = run_clf(lr, X_test)
eval_clf(y_test, y_pred, y_prob)

Accuracy: 0.990209804297
AUC: 0.999255229229
Precision: 0.991840121707
Recall: 0.991360243296


In [11]:
cPickle.dump({"fe":ext, "clf":lr}, open("lr_cap_rom_classifier_bigram.pik", "wb"), protocol=2)

In [28]:
import  scipy.sparse as sparse
print X_train.shape
print X_test.shape
#print sparse.vstack([X_train, X_test])

(893547, 61382)
(99283, 61382)


In [71]:
import xgboost
feature_names = ["f"+str(s) for s in np.arange(X_train.shape[1])]
X_train_dm = xgboost.DMatrix(X_train, y_train, feature_names=feature_names)
xgb = xgboost.train({'objective':'binary:logistic', 'eval_metric':['logloss', 'error', 'auc'], 
                    'max_depth':4, 'gamma':0, 'scale_pos_weight':np.sum(y_train)/float(np.sum(1-y_train))}, X_train_dm)

In [72]:
print X_train.shape
print X_test.shape
print len(feature_names)

(893547, 61382)
(99283, 61382)
61382


In [73]:
print np.ones(X_test.shape[1])[None, :].shape
print X_test.shape
print type(X_test)
fake_X = sparse.vstack([np.ones(X_test.shape[1])[None, :], X_test])
fake_y = np.hstack([np.ones(1), y_test])
X_test_dm = xgboost.DMatrix(fake_X, fake_y, feature_names=feature_names)
xgb.eval(X_test_dm.slice(np.arange(1, X_test_dm.num_col())))
y_prob = xgb.predict(X_test_dm.slice(np.arange(1, X_test_dm.num_row())))
y_pred = y_prob > 0.5
print y_prob.shape, y_prob.shape, y_test.shape
eval_clf(y_test, y_pred, y_prob)

(1, 61382)
(99283, 61382)
<class 'scipy.sparse.csr.csr_matrix'>
(99283,) (99283,) (99283,)
Accuracy: 0.838753865214
AUC: 0.940536628606
Precision: 0.810828470026
Recall: 0.943495991153


In [9]:
X_text_arry = np.array(X_text)
print np.sum(y_test != y_pred)
incorrect = idx_test[y_test != y_pred]
for t, r in zip(X_text_arry[incorrect][:10], y[incorrect]):
    print r, t

1300
1 Probably in that hole-in-the-wall room of his near the train station .
1 was written in purple crayon with an adorable attempt at a soccer ball drawn in black .
1 `` The street is decorated with Christmas lights .
0 A dog deciding he wants to drive the car.
0 I Surfore looks like it has a shark fin on the top of it.
0 some bottles of cleaner in a spot in the wall
1 It 's a baseball game on television at his apartment .
1 Drinking beer and playing basketball on the weekends with his friends .
1 Bedroom with double bed , side tables , small dresser , tan rug , tan walls .
0 This the inside of a hotel room at night.


In [10]:
text_mymethod = read_mymethod()
X_mymethod = ext.transform(text_mymethod)
text_showtell = read_showtell()
X_showtell = ext.transform(text_showtell)

In [11]:
print "My errors:", np.sum(lr.predict(X_mymethod))
print "ShowTell errors:", np.sum(lr.predict(X_showtell))

My errors: 2256
ShowTell errors: 2


In [12]:
out_probas = lr.predict_proba(X_mymethod)[:, 1]
order = np.argsort(-out_probas)
for sent in text_mymethod[order[:10]]:
    print sent

the boat was docked at the pier , but there were no one in the
the boat was docked at the pier , but there were no one in the
he was nt sure how long he d hung out .
the bed was dark and the covers had a book of me .
the airport was small and private planes , but they were parked in the same
the children were close , and i was brushing my teeth and toothbrush .
i was on the beach scene , and i focus on my back on the
the horses were pulling out of the carriage , but people were already out there
i m not a person who was laying on my bed for a few feet
there were people on the boat , and the water was nt surprising .


In [13]:
out_probas = lr.predict_proba(X_showtell)[:, 1]
order = np.argsort(-out_probas)
for sent in text_showtell[order[:20]]:
    print sent

a stone slab with logs and logs
a woman with a black dress and a black tank top
a sign for a campaign to end end
a stone wall with a stone fireplace carved stone
a bowl of cereal and a bowl of soup and a bottle of water
a bottle of beer next to a glass of beer
a bowl of fruit and a glass of milk
a bowl of fruit and a glass of milk
a woman and a man smile hugging each other
a cupcake with pink and white frosting and a star
sailors control a control control being held by a man
a chair and a desk in a room
a cup of coffee sitting next to a coffee cup
a sign that shows the direction of the night
a trolley is going down the street
a cell phone and a lighter on a desk
a large metal pole with a large metal chain hanging from it
a woman and a man smile at the camera
a farmers marked filled with rip and unripe bananas
a man and a woman smile at a pose
