In [2]:
cd /u/npockrus/NLP/finalProject/venv/src/irony-classifier/

/v/filer4b/v20q001/npockrus/NLP/finalProject/venv/src/irony-classifier


# Bag of Words Classifier

## Load Comments 

In [5]:
import json
import numpy as np
import sklearn

with open('data/comments.json', 'r') as comments_f:
    examples = json.load(comments_f)
examples = [ (x, y) for x, y in examples.items() ]

In [6]:
# Create arrays
xs = np.array([ x for x, y in examples ])
ys = np.array([ y for x, y in examples ])

## Vectorize Comments

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=50000, binary=True, ngram_range=(1,2), stop_words="english")
X = vectorizer.fit_transform(xs)

### Feature Names

In [8]:
vectorizer.get_feature_names()

[u'admitted',
 u'admitted puncxuppercase',
 u'arguments',
 u'arguments shills',
 u'atheist',
 u'away',
 u'benefit',
 u'benefit financially',
 u'bitch',
 u'bitch heat',
 u'blow',
 u'blow away',
 u'butt',
 u'close',
 u'close racist',
 u'consumers',
 u'consumers benefit',
 u'craft',
 u'deceive',
 u'deceive consumers',
 u'democrats',
 u'democrats butt',
 u'does',
 u'does sound',
 u'drying',
 u'drying blow',
 u'exhaust',
 u'exhaust stream',
 u'expands',
 u'expands wider',
 u'far',
 u'far wider',
 u'financially',
 u'fine',
 u'fine institutions',
 u'finish',
 u'finish drying',
 u'fox',
 u'funny',
 u'funny arguments',
 u'going',
 u'going kick',
 u'gop',
 u'gop admitted',
 u'heat',
 u'heat lamp',
 u'high',
 u'high pressure',
 u'higher',
 u'higher look',
 u'http',
 u'http www',
 u'huh',
 u'huh puncxquestion_mark',
 u'insane',
 u'insane like',
 u'institutions',
 u'institutions stoop',
 u'kick',
 u'kick democrats',
 u'lamp',
 u'lamp finish',
 u'law',
 u'law going',
 u'level',
 u'level deceive',
 u

### Vectorized Comments

In [9]:
X.toarray()

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## 5-Fold Test

In [10]:
from sklearn.cross_validation import KFold

kf = KFold(len(xs), n_folds=5, shuffle=True)

In [11]:
for train, test in kf:
    print(train, test)

(array([0, 2, 3, 5, 6, 7, 8, 9]), array([1, 4]))
(array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3]))
(array([1, 2, 3, 4, 6, 7, 8, 9]), array([0, 5]))
(array([0, 1, 2, 3, 4, 5, 6, 9]), array([7, 8]))
(array([0, 1, 2, 3, 4, 5, 7, 8]), array([6, 9]))


In [12]:
for train, test in kf:
    # Get training and test data for this round
    xs_train, xs_test = X[train], X[test]
    ys_train, ys_test = ys[train], ys[test]

    print(xs_train, xs_test)
    print(ys_train, ys_test)

(<8x106 sparse matrix of type '<type 'numpy.int64'>'
	with 95 stored elements in Compressed Sparse Row format>, <2x106 sparse matrix of type '<type 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>)
(array([-1,  1,  1, -1,  1, -1,  1, -1]), array([-1,  1]))
(<8x106 sparse matrix of type '<type 'numpy.int64'>'
	with 71 stored elements in Compressed Sparse Row format>, <2x106 sparse matrix of type '<type 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>)
(array([-1, -1,  1, -1,  1, -1,  1, -1]), array([1, 1]))
(<8x106 sparse matrix of type '<type 'numpy.int64'>'
	with 80 stored elements in Compressed Sparse Row format>, <2x106 sparse matrix of type '<type 'numpy.int64'>'
	with 27 stored elements in Compressed Sparse Row format>)
(array([-1,  1,  1,  1,  1, -1,  1, -1]), array([-1, -1]))
(<8x106 sparse matrix of type '<type 'numpy.int64'>'
	with 81 stored elements in Compressed Sparse Row format>, <2x106 sparse matrix of type '<type 'numpy.i

### Predictions Made by SVM

In [13]:
recalls, precisions, f_measures = [], [], []
for train, test in kf:
    # Get training and test data for this round
    Xs_train, Xs_test = X[train], X[test]
    ys_train, ys_test = ys[train], ys[test]

    # Train svm
    svm = SGDClassifier(loss="hinge", penalty="l2", class_weight="auto", alpha=.01)
    parameters = { 'alpha': [.001, .01,  .1] }
    clf = GridSearchCV(svm, parameters, scoring='f1')
    clf.fit(xs_train, ys_train)

    # Make predictions
    predictions = clf.predict(Xs_test)
    
    for test, vector, prediction, actual in zip(xs[test], xs_test, predictions, ys_test):
        print '========================================================================'
        print
        print 'Comment: {}'.format(test.encode('ascii', 'ignore'))
        print 'Vectorized as: {}'.format(vector)
        print 'Prediction: {}'.format(prediction)
        print 'Was really: {}'.format(actual)
        print

NameError: name 'SGDClassifier' is not defined

## Performance

In [71]:
import sklearn

for train, test in kf:
    # Get training and test data for this round
    Xs_train, Xs_test = X[train], X[test]
    ys_train, ys_test = ys[train], ys[test]

    # Train svm
    svm = SGDClassifier(loss="hinge", penalty="l2", class_weight="auto", alpha=.01)
    parameters = { 'alpha': [.001, .01,  .1] }
    clf = GridSearchCV(svm, parameters, scoring='f1')
    clf.fit(xs_train, ys_train)

    # Make predictions
    predictions = clf.predict(Xs_test)
    
    precision, recall, f_measure, _ = sklearn.metrics.precision_recall_fscore_support(ys_test, predictions, average='binary')
    
    print 'Prediction/Actuals: {}'.format(zip(predictions, ys_test))
    print 'Precision:   {}'.format(precision)
    print 'Recall:      {}'.format(recall)
    print 'F-measure:   {}'.format(f_measure)

Prediction/Actuals: [(1, 1), (-1, -1)]
Precision:   1.0
Recall:      1.0
F-measure:   1.0
Prediction/Actuals: [(1, -1), (-1, 1)]
Precision:   0.0
Recall:      0.0
F-measure:   0.0
Prediction/Actuals: [(1, 1), (-1, 1)]
Precision:   1.0
Recall:      0.5
F-measure:   0.666666666667
Prediction/Actuals: [(-1, 1), (1, -1)]
Precision:   0.0
Recall:      0.0
F-measure:   0.0
Prediction/Actuals: [(-1, -1), (1, -1)]
Precision:   0.0
Recall:      0.0
F-measure:   0.0
