In [1]:
# Root of the irony-classifier checkout, written without a leading slash
# because later cells interpolate it as `cd /{base}/...`
base = 'u/ebanner/Classes/nlp/Project/irony-classifier'

In [None]:
# Alternative checkout root for a different user account. Running this cell
# replaces the `base` assigned in the previous cell, so run only the one that
# matches the machine you are on.
base = 'u/npockrus/NLP/finalProject/venv/src/irony-classifier'

In [3]:
cd /{base}/data/progressive/features/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/progressive/features


# Load Progressive Comments

In [5]:
import pickle
import scipy
import numpy as np

# Pickles are binary data, so the file is opened in 'rb' mode; text mode can
# corrupt binary-protocol pickles and fails outright on Python 3.
# NOTE: pickle.load can execute arbitrary code -- only load feature files
# that were produced by this project.
with open('text-sentiment-label.p', 'rb') as f:
    data = pickle.load(f)

# Load comments, sentiments, and labels. All three arrays iterate the same
# unmodified dict, so they stay index-aligned.
xs = np.array([ comment for comment in data ])
sentiments = np.array([ data[comment]['sentiment'] for comment in data ])
ys = np.array([ data[comment]['label'] for comment in data ])

# Crunch Down the Data Just For Debugging

In [4]:
# Keep only the first ten positive (+1) and first ten negative (-1) examples
plusses, minuses = [], []
for example in zip(xs, sentiments, ys):
    label = example[2]
    if label == 1:
        if len(plusses) < 10:
            plusses.append(example)
    elif label == -1:
        if len(minuses) < 10:
            minuses.append(example)

# Each example is a (comment, sentiment, label) triple; split the columns out
plus_xs = [ example[0] for example in plusses ]
plus_sentiments = [ example[1] for example in plusses ]
plus_ys = [ example[2] for example in plusses ]

minus_xs = [ example[0] for example in minuses ]
minus_sentiments = [ example[1] for example in minuses ]
minus_ys = [ example[2] for example in minuses ]

# Reassemble the arrays: the positives first, then the negatives
xs = np.array(plus_xs + minus_xs)
ys = np.array(plus_ys + minus_ys)
sentiments = np.array(plus_sentiments + minus_sentiments)

# Split into Train and Test Sets

In [6]:
from sklearn.cross_validation import KFold

# Three shuffled folds over the example indices. The seed is fixed so the
# folds, and therefore the scores reported below, are identical on every
# Restart & Run All.
kf = KFold(len(xs), n_folds=3, shuffle=True, random_state=0)

# Build N-Gram Language Model on the Train Data and Train a Classifier

In [7]:
cd /{base}/lib/berkeleylm-1.1.5/examples

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/lib/berkeleylm-1.1.5/examples


In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize

import sklearn

from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier


precisions, recalls, f_measures = [], [], []
for train, test in kf:
#     print train, test
#     # Filter down to just genuine comments and tokenize them
#     genuine_sentences = [ x for x, y in zip(xs[train], ys[train]) if y == -1 ]
#     genuine_sentences = [ ' '.join(word_tokenize(genuine_sentence)) for genuine_sentence in genuine_sentences ]

    genuine_sentences = xs[train]
    
    # Write the genuine tokenized comments to disk so the Berkeley N-Gram Language model can be trained
    with open('genuine-progressive.txt', 'w') as f:
        for genuine_sentence in genuine_sentences:
            f.write(genuine_sentence.encode('utf-8') + '\n')
    
    # Make an arpa straight from text
    !java -ea -mx1000m -server -cp ../src edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText 5 genuine-progressive.arpa genuine-progressive.txt
    print 'Finish building progressive ARPA from text!'
    
    # Make a binary from the arpa
    !java -ea -mx1000m -server -cp ../src edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa genuine-progressive.arpa genuine-progressive.binary
    
    # Extract train probabilities to train a classifier
    probs = [0]*len(xs[train])
    print 'Must get to: {}'.format(len(xs[train]))
    for i, sentence in enumerate(xs[train]):
        print i
        sentence = ' '.join(word_tokenize(sentence))
        out = !echo "{sentence}" | java -ea -mx1000m -server -cp ../src edu.berkeley.nlp.lm.io.ComputeLogProbabilityOfTextStream genuine-progressive.binary 2>&1 | tail -n 1
        print out
        prob = float(out[0].split()[5])
        probs[i] = prob
    print 'Finish extracting conservativiness for training!'
    
    # Train a simple classifier that just uses sentiment*probabilities
    svm = SGDClassifier(loss="hinge", penalty="l2", class_weight="auto")
    parameters = { 'alpha': [.001, .01,  .1] }
    clf = GridSearchCV(svm, parameters, scoring='f1')
    conservativinesses = np.array([ [prob*sentiment] for prob, sentiment in zip(probs, sentiments[train]) ])
    clf.fit(conservativinesses, ys[train])
    print 'Trained the classifier on conservativiness!'
    
    # Extract the probabilities for the test set
    probs = [0]*len(xs[test])
    print 'Must get to: {}'.format(len(xs[test]))
    for i, sentence in enumerate(xs[test]):
        print i
        sentence = ' '.join(word_tokenize(sentence))
        out = !echo "{sentence}" | java -ea -mx1000m -server -cp ../src edu.berkeley.nlp.lm.io.ComputeLogProbabilityOfTextStream progressive.binary 2>&1 | tail -n 1
        prob = float(out[0].split()[5])
        probs[i] = prob
    print 'Finish extracting conservativiness for test!'
    
    # Make predictions
    conservativinesses = np.array([ [prob*sentiment] for prob, sentiment in zip(probs, sentiments[test]) ])
    predictions = clf.predict(conservativinesses)
    
    # Record statistics
    precision, recall, f_measure, _ = sklearn.metrics.precision_recall_fscore_support(ys[test], predictions, average='binary')
    precisions.append(precision)
    recalls.append(recall)
    f_measures.append(f_measure)
    
    print 'Precision: {}'.format(precision)
    print 'Recall: {}'.format(precision)
    print 'F-Measure: {}'.format(precision)
    
print 'Precisions: {}'.format(precisions)
print 'Recalls: {}'.format(recalls)
print 'F-Measures: {}'.format(f_measures)
print
print 'Mean Precision: {}'.format(np.mean(precisions))
print 'Mean Recall: {}'.format(np.mean(recalls))
print 'Mean F-Measure: {}'.format(np.mean(f_measures))

Reading text files [genuine-progressive.txt] and writing to file genuine-progressive.arpa {
	Reading in ngrams from raw text {
		On line 0
	} [0s]
	Writing Kneser-Ney probabilities {
		Counting counts for order 0 {
		} [0s]
		Counting counts for order 1 {
		} [0s]
		Counting counts for order 2 {
		} [0s]
		Counting counts for order 3 {
		} [0s]
		Counting counts for order 4 {
		} [0s]
		Writing ARPA {
			On order 1
			Writing line 1
			On order 2
			Writing line 1
			Writing line 10001
			Writing line 20001
			On order 3
			Writing line 1
			Writing line 10001
			Writing line 20001
			On order 4
			Writing line 1
			Writing line 10001
			Writing line 20001
			On order 5
			Writing line 1
			Writing line 10001
			Writing line 20001
		} [0s]
	} [0s]
Finish building progressive ARPA from text!
Reading Lm File genuine-progressive.arpa . . .  {
	Counting values {
		Parsing ARPA language model file {
			Reading 1-grams {
				Read 0 lines
				6908 1-gram read.
			} [0s]
			Reading 2-grams {
	

In [36]:
# `clf` is whatever the cross-validation loop assigned last, i.e. the grid
# search fitted on the final fold. Take its best estimator and display the
# first row of its coefficients (the model has a single input feature).
sgd_clf = clf.best_estimator_
sgd_clf.coef_[0]

array([ 0.09327809])

# Mean Precision: 0.0534009998485
# Mean Recall: 0.65406162465
# Mean F-Measure: 0.0902528618887