In [67]:
import numpy as np
import pandas as pd
from patsy import dmatrices
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [54]:
# Load dataset
related =  pd.read_csv("./data_vectorised/related.csv")
infection =  pd.read_csv("./data_vectorised/infection.csv")
self =  pd.read_csv("./data_vectorised/self.csv")

## Data Exploration

## Logistic Regression

In [55]:
infection.head()

Unnamed: 0,index,flu,gett,think,sick,feel,bett,hop,got,lik,...,good,cold,work,today,back,well,cough,bad,RESULT,ID
0,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6004550306
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6003455112
2,2,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,6002109706
3,3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,5999019609
4,4,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5990850113


In [56]:
def logistic_regression(params, data):
    y, X = dmatrices(params, data, return_type='dataframe')
    # flatten y into a 1-D array
    y = np.ravel(y)
    # instantiate a logistic regression model, and fit with X and y
    model = LogisticRegression()
    model = model.fit(X,y)

    # check the accuracy on the training set
    model.score(X, y)


In [57]:
related_params = "RESULT ~ flu + gett + shot + think + sick + get + worried + feel + go + h1n1 + lik + got + scared + hop + bett + worry + fear + vaccin + today + one + still + need + cold + really + hom"
infection_params = "RESULT ~ flu + gett + think + sick + feel + bett + hop + got + lik + go + im + get + worried + day + week + still + hom + good + cold + work + today + back + well + cough + bad"
self_params = "RESULT ~ flu + gett + shot + think + sick + feel + get + got + go + lik + im + bett + hop + worried + today + still + day + scared + week + vaccin + good + cold + worry + work + back"

## Related

In [58]:
y, X = dmatrices(related_params, related, return_type='dataframe')
# flatten y into a 1-D array
y = np.ravel(y)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X,y)
# check the accuracy on the training set
model.score(X, y)

0.68929450369155043

## Infection

In [59]:
y2, X2 = dmatrices(infection_params, infection, return_type='dataframe')
# flatten y into a 1-D array
y2 = np.ravel(y2)
# instantiate a logistic regression model, and fit with X and y
model2 = LogisticRegression()
model2 = model2.fit(X2,y2)
# check the accuracy on the training set
model2.score(X2, y2)

0.56774642981539536

## Self

In [60]:
y3, X3 = dmatrices(self_params, self, return_type='dataframe')
# flatten y into a 1-D array
y3 = np.ravel(y3)
# instantiate a logistic regression model, and fit with X and y
model3 = LogisticRegression()
model3 = model3.fit(X3,y3)
# check the accuracy on the training set
model2.score(X3, y3)

0.51110104213864971

## (Related)Model Evaluation Using a Validation Set

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model_test = LogisticRegression()
model_test.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

We now need to predict class labels for the test set. We will also generate the class probabilities, just to take a look.

In [62]:
# predict class labels for the test set
predicted = model_test.predict(X_test)
print predicted

[ 0.  0.  1. ...,  0.  1.  0.]


In [63]:
# generate class probabilities
probs = model_test.predict_proba(X_test)
print probs

[[ 0.58405944  0.41594056]
 [ 0.58405944  0.41594056]
 [ 0.11408488  0.88591512]
 ..., 
 [ 0.53728873  0.46271127]
 [ 0.1600838   0.8399162 ]
 [ 0.681325    0.318675  ]]


As can be seen, the classifier is predicting a 1 any time the probability in the second column is greater than 0.5.

Now let's generate some evaluation metrics.

In [69]:
# generate evaluation metrics
print '------------------ RELATED -------------------------------'
print metrics.accuracy_score(y_test, predicted)
print metrics.roc_auc_score(y_test, probs[:, 1])

------------------ RELATED -------------------------------
0.682843472317
0.736045699838


The accuracy is 68%, which is the same as I experienced when training and predicting on the same data.
We can also see the confusion matrix and a classification report with other metrics

In [65]:
print metrics.confusion_matrix(y_test, predicted)
print metrics.classification_report(y_test, predicted)

[[483 177]
 [287 516]]
             precision    recall  f1-score   support

        0.0       0.63      0.73      0.68       660
        1.0       0.74      0.64      0.69       803

avg / total       0.69      0.68      0.68      1463



## Model Evaluation Using Cross-Validation

Now let's try 10-fold cross-validation, to see if the accuracy holds up more rigorously.

In [68]:
# Evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print scores
print scores.mean()

[ 0.68711656  0.6809816   0.63729508  0.64959016  0.7063655   0.67145791
  0.75154004  0.69199179  0.69815195  0.70431211]
0.687880270726


It's still performing at 68% accuracy'

## Predicting the Probability that a tweet is related to influenza

In [70]:
#X = np.array([1,3,0,1,0,0,0,0,0,1,1])
#X.reshape(-1, 1)
#model.predict_proba(X)

## (Infection)Model Evaluation Using a Validation Set

In [78]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=0)
model_test2 = LogisticRegression()
model_test2.fit(X_train2, y_train2)
# predict class labels for the test set
predicted2 = model_test.predict(X_test2)
# generate class probabilities
probs = model_test.predict_proba(X_test2)
# generate evaluation metrics
print '------------------ RELATED -------------------------------'
print 'Accuracy Score:'
print metrics.accuracy_score(y_test2, predicted2)
print 'roc score:'
print metrics.roc_auc_score(y_test2, probs[:, 1])
print metrics.confusion_matrix(y_test2, predicted2)
print metrics.classification_report(y_test2, predicted2)
# Evaluate the model using 10-fold cross-validation
scores2 = cross_val_score(LogisticRegression(), X2, y2, scoring='accuracy', cv=10)
print scores2
print scores2.mean()

------------------ RELATED -------------------------------
Accuracy Score:
0.452436194896
roc score:
0.45183992138
[[118 364]
 [108 272]]
             precision    recall  f1-score   support

        0.0       0.52      0.24      0.33       482
        1.0       0.43      0.72      0.54       380

avg / total       0.48      0.45      0.42       862

[ 0.55555556  0.48780488  0.57491289  0.58536585  0.57491289  0.57142857
  0.55749129  0.52264808  0.50174216  0.53658537]
0.546844754162


In [None]:
(Self)Model Evaluation Using a Validation Set