## Load already prepared data

In [17]:
import pandas as pd

In [18]:
# Feature matrix: read with pandas' default header handling.
X = pd.read_csv('../valt_sa_data/x.csv')

# Labels: the file is read without a header row, so pandas numbers the
# columns 0, 1, ...; keep only the first column as a Series.
y = pd.read_csv(
    '../valt_sa_data/y.csv',
    header=None,
).iloc[:, 0]

## Split data into training and test sets

Let's perform a train/test split with 80% of the data in the training set and 20% of the data in the test set. We use `random_state=0` so that every execution yields the same result.

In [19]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the legacy module so the notebook still runs on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing. The fixed `random_state` makes the
# shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train a sentiment classifier with k-nearest neighbors

We will now use a k-nearest neighbors classifier (k = 15, uniform weights) to create a sentiment classifier on the training data.

**Note:** This line may take a few minutes.

In [21]:
from sklearn import neighbors

# How many neighbours the classifier consults for each prediction.
n_neighbors = 15

# Build the k-NN estimator with explicit keyword arguments.
clf = neighbors.KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights='uniform',
)

# Fit on the training split; whatever `fit` returns is kept as `model`,
# which is the name the evaluation cell uses.
model = clf.fit(X_train, y_train)

# Evaluate the trained model

We will now use the held-out test set to evaluate our model.

In [22]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Evaluate the fitted classifier on the held-out test split.
#
# The whole test set is scored with ONE `predict` call and both label vectors
# are converted to plain NumPy arrays. That avoids three problems of a
# row-by-row loop:
#   * `X_test[i]` / `y_test[i]` are label-based lookups on pandas objects, so
#     they do not reliably address "row i" of a shuffled split;
#   * passing a single 1-D sample to `predict` is not portable across
#     scikit-learn versions;
#   * one `predict` call per row is needlessly slow.
y_true = np.asarray(y_test)
y_pred = np.asarray(model.predict(X_test))
test_size = len(y_true)

# Single-argument, parenthesised prints emit the same text on Python 2 and 3.
print('Prediction sample %s' % y_pred[0])

# Class 1 is treated as the positive class. The four counts follow the same
# rule as a per-row comparison: a correct prediction is a true positive when
# the actual label is 1 and a true negative otherwise; an incorrect prediction
# is a false positive when the predicted label is 1 and a false negative
# otherwise.
correct = (y_pred == y_true)
actual_positive = (y_true == 1)
predicted_positive = (y_pred == 1)

tp = int(np.sum(correct & actual_positive))
tn = int(np.sum(correct & ~actual_positive))
fp = int(np.sum(~correct & predicted_positive))
fn = int(np.sum(~correct & ~predicted_positive))

# Guard the ratios: if the model never predicts class 1 (tp + fp == 0) or the
# test split contains no positives (tp + fn == 0), report 0.0 instead of
# raising ZeroDivisionError.
precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
f_measure = (2 * (precision * recall) / (precision + recall)
             if (precision + recall) else 0.0)

print('True positives: %d' % tp)
print('False positives: %d' % fp)
print('False negatives: %d' % fn)
print('True negatives: %d' % tn)

print('Precision: %s' % precision)
print('Recall: %s' % recall)
print('F-measure: %s' % f_measure)
print('Accuracy: %s' % (float(tp + tn) / test_size))

# Reuse the predictions computed above instead of running `predict` again.
cm = confusion_matrix(y_true, y_pred)

print('Confusion matrix:')
print(cm)

Prediction sample 1
True positives: 132
False positives: 45
False negatives: 1
True negatives: 3
Precision: 0.745762711864
Recall: 0.992481203008
F-measure: 0.851612903226
Accuracy: 0.745856353591
Confusion matrix:
[[  3  45]
 [  1 132]]
