In [34]:
cd /v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/clean/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/clean


# Load Conservative Comments

In [35]:
import pickle
import scipy
import numpy as np

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load a file that this project produced itself.
# Open in binary mode: pickle data is bytes, and text mode can corrupt it on
# platforms that translate newlines (and fails outright on Python 3).
with open('conservative_comments.p', 'rb') as f:
    data = pickle.load(f)

# Split the (comment, label) pairs into two parallel lists
xs = [ comment for comment, label in data ]
ys = [ label for comment, label in data ]

# Add Punctuation Features

In [36]:
import re

# Regular expressions for the punctuation cues. Raw strings hand the
# backslash escapes to the regex engine unchanged (the pattern values are
# the same as before, without relying on unrecognised string escapes).
emoticon_RE_str = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
question_mark_RE_str = r'\?'
exclamation_point_RE_str = r'\!'
# Any combination of multiple exclamation points and question marks
interrobang_RE_str = r'[\?\!]{2,}'

# (compiled pattern, token) pairs, listed in the order the tokens are appended.
punctuation_features = [
    (re.compile(emoticon_RE_str), " PUNCxEMOTICON"),
    (re.compile(exclamation_point_RE_str), " PUNCxEXCLAMATION_POINT"),
    (re.compile(question_mark_RE_str), " PUNCxQUESTION_MARK"),
    (re.compile(interrobang_RE_str), " PUNCxINTERROBANG"),
]

# Append a marker token to each comment for every cue it contains.
# NOTE: this rewrites xs in place, so running the cell a second time appends
# the tokens again; reload xs before re-running.
for i, comment in enumerate(xs):
    # Every cue is detected on the original text, so a token appended for one
    # cue can never influence the check for another.
    suffix = "".join(token
                     for pattern, token in punctuation_features
                     if pattern.search(comment))

    # A word of more than two characters written entirely in capitals.
    # s.isupper() (rather than str.isupper(s)) also accepts unicode comments.
    if any(len(s) > 2 and s.isupper() for s in comment.split(" ")):
        suffix += " PUNCxUPPERCASE"

    xs[i] = comment + suffix

# Vectorize Comments

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words over unigrams and bigrams, English stop words removed,
# vocabulary capped at 50000 entries.
vectorizer_options = {
    'max_features': 50000,
    'binary': True,
    'ngram_range': (1, 2),
    'stop_words': "english",
}
vectorizer = CountVectorizer(**vectorizer_options)

# Learn the vocabulary from the augmented comments and encode them in one step.
X = vectorizer.fit_transform(xs)

## Feature Names

In [39]:
# Display the vocabulary terms learned by the fitted vectorizer.
# NOTE(review): this echoes the entire list into the notebook output;
# slicing it (e.g. the first few entries) would keep the output readable.
vectorizer.get_feature_names()

[u'000',
 u'000 000',
 u'000 americans',
 u'000 continues',
 u'000 defense',
 u'000 federal',
 u'000 home',
 u'000 just',
 u'000 people',
 u'000 refuse',
 u'000 school',
 u'000 yay',
 u'000 year',
 u'0060977728',
 u'0060977728 amp',
 u'0060977728 ref',
 u'00pm',
 u'00pm clock',
 u'01',
 u'01 global',
 u'02',
 u'02 eye',
 u'03',
 u'03 01',
 u'03 10',
 u'03 12',
 u'03 15',
 u'03 constitution',
 u'03 embers',
 u'03 filibuster',
 u'03 lamar',
 u'04',
 u'04 25',
 u'04 swiss',
 u'040',
 u'040 yr',
 u'05',
 u'05 11',
 u'05 30',
 u'05 james',
 u'05 john',
 u'05 johnny',
 u'05 let',
 u'05 lindsey',
 u'05 puncxquestion_mark',
 u'05 saxby',
 u'050',
 u'050 yr',
 u'07',
 u'07 07',
 u'07 20',
 u'07 obama',
 u'08',
 u'08 12',
 u'08 franciscos',
 u'09',
 u'09 10',
 u'09 27',
 u'09 id',
 u'10',
 u'10 000',
 u'10 04',
 u'10 09',
 u'10 27',
 u'10 28',
 u'10 29',
 u'10 31',
 u'10 federal',
 u'10 iran',
 u'10 is_obama_creating_a_martial',
 u'10 making',
 u'10 mike',
 u'10 minutes',
 u'10 percent',
 u'10 r

## Vectorized Comments

In [40]:
# Show the encoded comments as a dense array.
# NOTE(review): X is presumably a scipy sparse matrix (it comes from
# fit_transform); toarray() allocates the full dense copy — confirm that it
# fits in memory before running this on a larger corpus.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Pickle

In [41]:
cd /v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/features/bows+punctuation/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/features/bows+punctuation


## Comments

In [42]:
import pickle

# Write a mapping from each comment's text to its label and bag-of-words row.
# Because the mapping is keyed by the comment text, comments with identical
# text share a single entry (the last one seen wins).
with open('features.p', 'wb') as f:
    features_by_comment = dict(
        (text, {'label': tag, 'bow': row})
        for text, row, tag in zip(xs, X, ys)
    )
    pickle.dump(features_by_comment, f)

## Vectorizer

In [43]:
import pickle

# Persist the fitted vectorizer object to 'vectorizer.p' in the current
# working directory (binary mode, as pickle writes bytes).
with open('vectorizer.p', 'wb') as f:
    pickle.dump(vectorizer, f)