In [45]:
base = '/u/ebanner/Classes/nlp/Project/irony-classifier'

In [None]:
base = '/u/npockrus/NLP/finalProject/venv/src/irony-classifier'

# Load Features

In [46]:
cd /{base}/data/conservative/features/text+sentiment+subreddit+label+progressiviness/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/features/text+sentiment+subreddit+label+progressiviness


In [47]:
import pickle
import scipy
import numpy as np

with open('features.p', 'r') as f:
    data = pickle.load(f)

xs = np.array([ sentence.encode('utf-8') for sentence in sorted(data) ])
punctuations = np.array([data[sentence]['punctuation'] for sentence in sorted(data)])
sentiments = np.array([data[sentence]['sentiment'] for sentence in sorted(data)])
subreddits = np.array([data[sentence]['subreddits'] for sentence in sorted(data)])
ys = np.array([ data[sentence]['label'] for sentence in sorted(data) ])
progressivinesses = [ data[sentence]['progressiviness'] for sentence in sorted(data) ]

# Concatenate Features

In [48]:
features = [ x + ' ' + punct + ' ' + (' SUBREDDITx'.join(subreddit_list)).encode('utf-8') for x, subreddit_list, punct in zip(xs, subreddits, punctuations)]

# Vectorize Features

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=50000, binary=True, stop_words="english")
X = vectorizer.fit_transform(features)

# Tack on Progressiviness Feature

In [50]:
from scipy.sparse import coo_matrix, csr_matrix

progressivinesses = [ [progressiviness] for progressiviness in progressivinesses ]
progressivinesses = coo_matrix(progressivinesses)

X = csr_matrix(scipy.sparse.hstack([X, progressivinesses]))

# 5-Fold Test

In [51]:
from sklearn.cross_validation import KFold

kf = KFold(len(xs), n_folds=5, shuffle=True)

In [52]:
from scipy.sparse import coo_matrix

from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier

recalls, precisions, f_measures = [], [], []
for train, test in kf:
    # Train svm
    svm = SGDClassifier(loss="hinge", class_weight="auto")
    parameters = { 'alpha': [.001, .01,  .1] }
    clf = GridSearchCV(svm, parameters, scoring='f1')
    clf.fit(X[train], ys[train])

    # Make predictions
    predictions = clf.predict(X[test])
    
    for test, vector, prediction, actual in zip(xs[test], X[test], predictions, ys[test]):
        print '========================================================================'
        print
        print 'Comment: {}'.format(test)
        print 'Prediction: {}'.format(prediction)
        print 'Was really: {}'.format(actual)
        print


Comment: 

&gt;Do you believe that it is morally acceptable for all gays to stop speaking to their siblings — one of the worst things a person can do to a sibling and to one’s parents — solely because the sibling believes in the man-woman definition of marriage?
Prediction: -1
Was really: -1


Comment: 

&gt;If it does fail, the effect will be historic.
Prediction: -1
Was really: -1


Comment: 

&gt;Imagine a person who opposes unwed motherhood...
Prediction: -1
Was really: -1


Comment: 

&gt;This is the signature legislative achievement of the Obama presidency, the embodiment of his new entitlement-state liberalism.
Prediction: -1
Was really: -1


Comment: 

*Edit for grammar
Prediction: -1
Was really: -1


Comment: 

11 .
Prediction: -1
Was really: -1


Comment: 

2.
Prediction: -1
Was really: -1


Comment: 

All the more reason to use it of course.
Prediction: -1
Was really: -1


Comment: 

Basically, the father is saying, "You wouldn't shit on Martin Luther King, well you're not 

# Most Indicative Negative Features

In [58]:
sgd_clf = clf.best_estimator_
feat_names = vectorizer.get_feature_names()
coefs_with_fns = sorted(zip(sgd_clf.coef_[0], feat_names))
for coef in coefs_with_fns:
    if coef[1].strip() == 'inthe'

going
think
subredditxscience
gt
subredditxaskreddit
subredditxpolitics
http
year
point
subredditxiama
care
subredditxworldpolitics
does
insurance
people
com
believe
new
said
liberal
person
marriage
article
job
law
public
atheism
exactly
fuck
subredditxgames
means
ve
day
called
country
congress
saying
years
making
health
subredditxbooks
worldnews
likely
shit
end
senate
long
www
claim
problem
video
agree
live
come
subredditxliberal
won
gay
use
subredditxexplainlikeimfive
subredditxbestof
subredditxleagueoflegends
best
conservatives
talking
better
debt
plans
order
edit
probably
question
let
getting
states
remember
seen
business
taxes
used
common
subredditxbattlefield3
matter
truth
family
ll
website
office
reagan
liberals
having
gov
kids
pass
votes
needs
absolutely
bit
based
subredditxprisonarchitect
subredditxsplintercell
subredditxmusic
feel
guilty
fact
certainly
regulations
happens
turn
subredditxpolandball
isn
subredditxpics
subredditxtelevision
doubt
medicare
case
federal
home
state


# Most Indicative Features

In [33]:
coefs_with_fns[-1:-20:-1]

[(1.6959439491304031, u'puncxexclamation_point'),
 (1.2368700278004183, u'yeah'),
 (1.1336414886540376, u'good'),
 (1.0655008835550484, u'news'),
 (1.0150263612595023, u'omg'),
 (1.0068853092763501, u'ones'),
 (0.95592232386181208, u'guys'),
 (0.90365677012997248, u'need'),
 (0.85187967951712718, u'subredditxatheism'),
 (0.84585530104958584, u'sidewalks'),
 (0.84585530104958584, u'shuts'),
 (0.84585530104958495, u'stories'),
 (0.81801290326720144, u'happen'),
 (0.81630328235074079, u'surprise'),
 (0.8064526094511254, u'strawman'),
 (0.80173079930089575, u'racist'),
 (0.75964156054799847, u'puncxquestion_mark'),
 (0.73749789915382102, u'damn'),
 (0.73749789915382102, u'budget')]

# Baseline (Precision=0.19, Recall=0.50, F-Measure=0.24)

In [39]:
import sklearn

precisions, recalls, f_measures = [], [], []
for train, test in kf:
    # Train svm
    svm = SGDClassifier(loss="hinge", penalty="l2", class_weight="auto")
    parameters = { 'alpha': [.001, .01,  .1] }
    clf = GridSearchCV(svm, parameters, scoring='f1')
    clf.fit(X[train], ys[train])

    # Make predictions
    predictions = clf.predict(X[test])
    
    # Record statistics
    precision, recall, f_measure, _ = sklearn.metrics.precision_recall_fscore_support(ys[test], predictions, average='binary')
    precisions.append(precision)
    recalls.append(recall)
    f_measures.append(f_measure)
    
print 'Precisions: {}'.format(precisions)
print 'Recalls: {}'.format(recalls)
print 'F-Measures: {}'.format(f_measures)
print
print 'Mean Precision: {}'.format(np.mean(precisions))
print 'Mean Recall: {}'.format(np.mean(recalls))
print 'Mean F-Measure: {}'.format(np.mean(f_measures))

Precisions: [0.071065989847715741, 0.073891625615763554, 0.04797047970479705, 0.096774193548387094, 0.051094890510948905]
Recalls: [0.41176470588235292, 0.5357142857142857, 0.61904761904761907, 0.47368421052631576, 0.60869565217391308]
F-Measures: [0.12121212121212122, 0.12987012987012989, 0.089041095890410954, 0.16071428571428573, 0.094276094276094263]

Mean Precision: 0.0681594358455
Mean Recall: 0.529781294669
Mean F-Measure: 0.119022745393
