In [70]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression

In [71]:
# Read the three vectorised CSV tables from ./data_vectorised into DataFrames.
# NOTE(review): `self` is a legal module-level name, but it reads like a
# method receiver -- consider renaming it wherever it is used.
related = pd.read_csv("./data_vectorised/related.csv")
infection = pd.read_csv("./data_vectorised/infection.csv")
self = pd.read_csv("./data_vectorised/self.csv")

## Data Exploration

In [72]:
# Optional exploration, currently disabled: each line computes the mean of
# every column grouped by the RESULT label for one of the loaded tables.
# Uncomment a single line to display that summary.
#related.groupby('RESULT').mean()
#infection.groupby('RESULT').mean()
#self.groupby('RESULT').mean()

## Logistic Regression

In [73]:
# Preview the first five rows of the `related` table.
related.head(n=5)

Unnamed: 0,index,flu,gett,swin,shot,s,nt,think,bird,sick,get,RESULT,ID
0,0,1,0,0,0,0,0,0,1,0,0,0,6004550306
1,1,1,1,0,0,0,0,0,0,0,0,0,6003455112
2,2,1,0,0,0,0,0,0,1,0,0,0,6002109706
3,3,2,0,1,0,0,0,0,0,0,0,1,5999019609
4,4,1,1,1,0,0,0,0,0,0,0,0,5990850113


In [74]:
# Patsy formula: RESULT is the response, the ten word-count columns are the
# predictors.
formula = "RESULT ~ flu + gett + swin + shot + s + nt + think + bird + sick + get"
y_frame, X = dmatrices(formula, related, return_type='dataframe')

# The response comes back as a one-column DataFrame; flatten it to a 1-D
# array before handing it to the estimator.
y = np.asarray(y_frame).ravel()

In [75]:
# Build a default-parameter logistic regression and fit it on all of X and y
# (`fit` returns the estimator itself, so the two steps chain).
model = LogisticRegression().fit(X, y)

# Mean accuracy measured on the same data the model was trained on.
model.score(X, y)

0.62325676784249384

## Model Evaluation Using a Validation Set

In [76]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0
)

# Fit a second, independent model on the training portion only.
# The fit call is the last expression, so the notebook displays the estimator.
model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

We now need to predict class labels for the test set. We will also generate the class probabilities, just to take a look.

In [79]:
# Predict class labels for the held-out test rows using the model that was
# fitted on the training split only.
predicted = model2.predict(X_test)

# Single-argument parenthesised print: same output on Python 2, and valid
# syntax on Python 3 (the bare `print x` statement is not).
print(predicted)

[ 1.  0.  1. ...,  0.  1.  0.]


In [55]:
# Class probabilities for the held-out test rows: one row per sample,
# one column per class.
probs = model2.predict_proba(X_test)

# Single-argument parenthesised print: same output on Python 2, and valid
# syntax on Python 3 (the bare `print x` statement is not).
print(probs)

[[ 0.47475099  0.52524901]
 [ 0.58802655  0.41197345]
 [ 0.41544288  0.58455712]
 ..., 
 [ 0.58802655  0.41197345]
 [ 0.40815592  0.59184408]
 [ 0.58802655  0.41197345]]


As can be seen, the classifier is predicting a 1 any time the probability in the second column is greater than 0.5.

Now let's generate some evaluation metrics.

In [56]:
# Evaluation on the held-out test set.
# Accuracy compares the hard label predictions against the true labels.
print(metrics.accuracy_score(y_test, predicted))
# ROC AUC is computed from the second probability column (probs[:, 1]).
print(metrics.roc_auc_score(y_test, probs[:, 1]))

0.620642515379
0.66584776784


The accuracy is 62%, the same as when training and predicting on the same data. For context, always predicting the majority class would score about 55% (803 of the 1463 test rows are class 1).
We can also look at the confusion matrix and a classification report with other metrics.

In [60]:
# Confusion matrix of true vs. predicted test labels, followed by the
# per-class precision / recall / f1 report.
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[419 241]
 [314 489]]
             precision    recall  f1-score   support

        0.0       0.57      0.63      0.60       660
        1.0       0.67      0.61      0.64       803

avg / total       0.63      0.62      0.62      1463



## Model Evaluation Using Cross-Validation

Now let's try 10-fold cross-validation, to see if the accuracy holds up more rigorously.

In [61]:
# Evaluate a fresh default logistic regression with 10-fold cross-validation
# on the full X and y, scoring each fold by accuracy.
# `cross_val_score` must be imported alongside `train_test_split` in the
# notebook's import cell; without that import this cell raises NameError on a
# fresh kernel.
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)

# Per-fold accuracies, then their mean.
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(scores)
print(scores.mean())

[ 0.65030675  0.6196319   0.56967213  0.61065574  0.62217659  0.61806982
  0.67761807  0.6036961   0.60164271  0.63244353]
0.620591333641


It's still performing at 62% accuracy.

## Predicting the Probability that a tweet is related to influenza

In [87]:
# One hand-built sample to score with the model fitted on the full dataset.
# It gets its own name so the training design matrix `X` is not overwritten;
# overwriting it would break any earlier cell that is re-run afterwards.
# The 11 values are presumably the Intercept column followed by the ten
# formula terms -- confirm the order against X.columns.
X_new = np.array([1, 3, 0, 1, 0, 0, 0, 0, 0, 1, 1])

# predict_proba expects a 2-D (n_samples, n_features) array. reshape returns a
# new array rather than modifying in place, so the result must be assigned,
# and a single sample is one row: shape (1, -1), not (-1, 1).
X_new = X_new.reshape(1, -1)

model.predict_proba(X_new)



array([[ 0.28514319,  0.71485681]])