In [1]:
import os

# Project checkout root, written WITHOUT a leading "/" because later cells
# build their paths as /{base}/... .  Set the IRONY_CLASSIFIER_BASE environment
# variable (same form, no leading slash) to point the notebook at a different
# checkout without editing this cell; otherwise the original location is used.
base = os.environ.get('IRONY_CLASSIFIER_BASE',
                      'u/ebanner/Classes/nlp/Project/irony-classifier')

In [None]:
# Alternate checkout root for a different user account.
# NOTE(review): executing this cell replaces the `base` assigned in the
# previous cell, so every later /{base}/... path then points at this checkout.
# In the saved run this cell was not executed (the recorded `cd` output shows
# the ebanner path) -- run only the cell that matches your environment.
base = 'u/npockrus/NLP/finalProject/venv/src/irony-classifier'

In [2]:
# Make data/conservative/clean the working directory so the relative pickle
# filename opened in the next cell resolves there. {base} is filled in from
# the Python variable `base`.
cd /{base}/data/conservative/clean/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/clean


# Load Conservative Comments

In [3]:
import pickle
import scipy
import numpy as np

# A pickle stream is binary data: open the file in binary mode so no newline
# or text decoding is applied to it on any platform or pickle protocol.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('agreement-2+.p', 'rb') as f:
    data = pickle.load(f)

# Split the (comment, label) pairs into two parallel lists.
# Each comment is encoded to a UTF-8 byte string (presumably the pickled
# comments are unicode objects -- TODO confirm); labels are kept as stored.
xs = [ comment.encode('utf-8') for comment, label in data ]
ys = [ label for comment, label in data ]

# Add Punctuation Features

In [4]:
import re

# Raw strings keep every backslash literal, so the escapes below are read by
# the regex engine rather than by the Python string parser.
emoticon_RE_str = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
question_mark_RE_str = r'\?'
exclamation_point_RE_str = r'\!'
# Any combination of multiple exclamation points and question marks
interrobang_RE_str = r'[\?\!]{2,}'

# (compiled pattern, marker token) pairs, listed in the order in which the
# markers are appended to a comment. Compiled once instead of per comment.
_PUNCTUATION_MARKERS = [
    (re.compile(emoticon_RE_str), " PUNCxEMOTICON"),
    (re.compile(exclamation_point_RE_str), " PUNCxEXCLAMATION_POINT"),
    (re.compile(question_mark_RE_str), " PUNCxQUESTION_MARK"),
    (re.compile(interrobang_RE_str), " PUNCxINTERROBANG"),
]


def _punctuation_suffix(text):
    """Return the marker tokens that apply to `text`, ready to be appended.

    Each marker carries its own leading space. A marker is included when its
    pattern occurs at least once in `text`; " PUNCxUPPERCASE" is added last
    when any space-separated token longer than two characters is upper case.
    Returns the empty string when nothing applies.
    """
    suffix = "".join(marker for pattern, marker in _PUNCTUATION_MARKERS
                     if pattern.search(text))
    # token.isupper() (rather than str.isupper(token)) also accepts unicode text.
    if any(len(token) > 2 and token.isupper() for token in text.split(" ")):
        suffix += " PUNCxUPPERCASE"
    return suffix


# NOTE: xs is updated in place, so running this cell a second time appends the
# markers again -- re-run the loading cell first to start from the raw text.
for i, comment in enumerate(xs):
    xs[i] = comment + _punctuation_suffix(comment)

# Vectorize Comments

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams with English stop words removed.
# binary=True records presence/absence instead of counts, and the vocabulary
# is capped at 50000 features.
vectorizer = CountVectorizer(
    ngram_range=(1,2),
    stop_words="english",
    binary=True,
    max_features=50000,
)

# Learn the vocabulary from the augmented comments and vectorize them in one pass.
X = vectorizer.fit_transform(xs)

## Feature Names

In [6]:
# Vocabulary learned by the vectorizer; the bare last expression is what the
# cell displays.
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() -- confirm against the installed version.
feature_names = vectorizer.get_feature_names()
feature_names

[u'000',
 u'000 000',
 u'000 continues',
 u'000 defense',
 u'000 federal',
 u'000 home',
 u'000 just',
 u'000 people',
 u'000 puncxuppercase',
 u'000 year',
 u'02',
 u'02 eye',
 u'03',
 u'04',
 u'04 25',
 u'040',
 u'040 yr',
 u'05',
 u'05 puncxuppercase',
 u'050',
 u'050 yr',
 u'07',
 u'07 20',
 u'08',
 u'08 franciscos',
 u'09',
 u'09 10',
 u'09 id',
 u'10',
 u'10 000',
 u'10 09',
 u'10 28',
 u'10 29',
 u'10 federal',
 u'10 iran',
 u'10 is_obama_creating_a_martial',
 u'10 making',
 u'10 minutes',
 u'10 percent',
 u'10 richest',
 u'10 total',
 u'10 years',
 u'100',
 u'100 53',
 u'100 federal',
 u'100 honest',
 u'100 pay',
 u'100 puncxquestion_mark',
 u'100 yearly',
 u'100k',
 u'100k know',
 u'11',
 u'11 03',
 u'11 12',
 u'11 13',
 u'11 26',
 u'11 420',
 u'11 inside',
 u'118',
 u'118 raise',
 u'11e2',
 u'11e2 9008',
 u'11was',
 u'11was inside',
 u'12',
 u'12 03',
 u'12 05',
 u'120',
 u'120 month',
 u'1200',
 u'1200 month',
 u'1297',
 u'1297 preexisting',
 u'13',
 u'13 05',
 u'13 2013',
 

## Vectorized Comments

In [7]:
# Expand the vectorized comments into a dense array purely for display.
# This allocates one entry per (comment, feature) pair.
dense_bows = X.toarray()
dense_bows

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Pickle

In [8]:
# Make data/conservative/features/bows+punctuation the working directory so
# the pickle files written by the following cells are created there.
# {base} is filled in from the Python variable `base`.
cd /{base}/data/conservative/features/bows+punctuation/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/features/bows+punctuation


## Comments

In [9]:
import pickle

with open('features.p', 'wb') as features_file:
    # Map each augmented comment string to its label and its bag-of-words row.
    # xs, X and ys are walked in lockstep, so row k of X belongs to comment k.
    # Identical comment strings share one key; the later occurrence wins.
    features_by_comment = dict(
        (text, {'label': tag, 'bow': row})
        for text, row, tag in zip(xs, X, ys)
    )
    pickle.dump(features_by_comment, features_file)

## Vectorizer

In [10]:
import pickle

# Persist the fitted vectorizer next to the features so the same vocabulary
# can be reloaded later.
with open('vectorizer.p', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)