In [5]:
import json
# Load the full message dump. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
with open('sent-msgs.json', encoding='utf-8') as msgs_file:
    user_msgs = json.load(msgs_file)

print(len(user_msgs))

38861


In [18]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# Load the hand-labeled sample. The context manager closes the file on exit,
# so no explicit close() is needed afterwards.
with open('sent-classfied2.json', encoding='utf-8') as labeled_file:
    labeled_msgs = json.load(labeled_file)

print('read %d labeled tweets' % len(labeled_msgs))

# Raw annotation symbol -> readable class name.
label_map = {'-': 'neg', '0': 'neutral', '+': 'pos'}
labels = ['neg', 'neutral', 'pos']

for msg in labeled_msgs:
    raw_label = msg['Sent']
    # An empty annotation is treated as neutral. Compare by value: `is ''`
    # tests object identity and only works by accident of string interning.
    if raw_label == '':
        raw_label = '0'
    msg['Sent'] = label_map[raw_label]
# Same final value the hand-incremented counter used to leave behind; it is
# not read by any of the cells visible here.
count = len(labeled_msgs)

# Encode class names as integers (LabelEncoder sorts its classes).
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y = label_encoder.transform([msg['Sent'] for msg in labeled_msgs])
print('Label distribution=%s' % Counter(msg['Sent'] for msg in labeled_msgs).most_common(3))

read 198 labeled tweets
Label distribution=[('neg', 121), ('neutral', 57), ('pos', 20)]


In [19]:
import re
import string
def tokenize(text):
    """Normalize a message and split it into word-bearing tokens.

    Steps, in order:
      1. lowercase the text;
      2. collapse any run of four or more identical characters to one;
      3. replace every digit with '9';
      4. replace 'bank' plus the rest of its whitespace-delimited token with
         the placeholder 'bankabcd';
      5. split on whitespace, peel leading/trailing punctuation runs off each
         token, and keep only the pieces that contain a word character.

    Args:
        text: the raw message string.

    Returns:
        A list of token strings (possibly empty).
    """
    punc_class = '[' + re.escape(string.punctuation) + ']'
    text = text.lower()
    text = re.sub(r'(.)\1\1\1+', r'\1', text)
    text = re.sub(r'[0-9]', '9', text)
    # One greedy pass is enough: `\S+` swallows the rest of the token, so the
    # hashtag ('#bank...') and handle ('twit_hndl_bank...') forms are already
    # rewritten here, with their prefixes left in place.
    text = re.sub(r'bank\S+', 'bankabcd', text)
    toks = []
    for tok in text.split():
        # Insert a space between the token and any punctuation run at its
        # start or end so the inner split() separates them.
        tok = re.sub(r'^(' + punc_class + r'+)', r'\1 ', tok)
        tok = re.sub(r'(' + punc_class + r'+)$', r' \1', tok)
        # Drop pieces made purely of punctuation (no word character).
        toks.extend(piece for piece in tok.split() if re.search(r'\w', piece))
    return toks

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF features over unigrams and bigrams produced by the custom tokenizer.
# min_df=2 keeps only terms seen in at least two documents; max_df=1. places
# no upper limit on document frequency.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    min_df=2,
    max_df=1.,
    use_idf=True,
    binary=False,
    norm='l2',
    decode_error='ignore',
)
X = vectorizer.fit_transform(msg['FullText'] for msg in labeled_msgs)
n_docs, n_terms = X.shape
print('Vectorized %d tweets. Found %d terms.' % (n_docs, n_terms))
# Vocabulary as an array so it can be fancy-indexed by coefficient rank later.
features = np.asarray(vectorizer.get_feature_names())

Vectorized 198 tweets. Found 910 terms.


In [22]:
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_recall_fscore_support
from tabulate import tabulate

def confusion(truths, preds, labels):
    """Render the confusion matrix of `truths` vs `preds` as a text table.

    The first row and the first column of the table carry `labels`; the
    remaining cells are the counts returned by `confusion_matrix`.

    Args:
        truths: true class values.
        preds: predicted class values, aligned with `truths`.
        labels: one display name per row/column of the matrix. Presumably in
            the same order `confusion_matrix` sorts the class values; the
            function does not check this.

    Returns:
        The table as a string formatted by `tabulate`.
    """
    counts = confusion_matrix(truths, preds)
    # Header row on top of the counts; stacking with the names turns every
    # cell into a string.
    with_header = np.vstack((labels, counts))
    # Left-hand column: an empty corner cell followed by the row names.
    row_names = np.array([''] + list(labels)).reshape(-1, 1)
    table = np.hstack((row_names, with_header))
    return tabulate(table.tolist(), headers='firstrow')

def top_coef(clf, vocab, n=10):
    """Print the `n` highest-weighted terms for each class of a linear model.

    Args:
        clf: fitted linear classifier exposing `classes_` and `coef_`.
        vocab: numpy array of term strings, indexable by feature position.
        n: number of terms to print per class.

    For a two-class model `coef_` holds a single row whose positive weights
    favour `classes_[1]`. That row is therefore used as-is for the second
    class and negated for the first one.
    """
    if len(clf.classes_) == 2:
        # Order matters: index 0 -> classes_[0] (negated row),
        # index 1 -> classes_[1] (the row itself).
        coefs = [-clf.coef_[0], clf.coef_[0]]
    else:
        coefs = clf.coef_
    for label, coef in zip(clf.classes_, coefs):
        print('\nCLASS %s' % label)
        # argsort is ascending, so reverse it and keep the first n indices.
        top_coef_ind = np.argsort(coef)[::-1][:n]
        top_terms = vocab[top_coef_ind]
        top_weights = coef[top_coef_ind]
        print('\n'.join('%s\t%.3f' % (term, weight)
                        for term, weight in zip(top_terms, top_weights)))

def do_cv(X, y, labels, nfolds=10):
    """Cross-validate a class-balanced logistic regression and report on it.

    Each fold trains a fresh model on the training indices and predicts the
    held-out indices. The pooled predictions are then scored: accuracy, a
    per-class report and a confusion table are printed.

    Args:
        X: feature matrix, row-indexable by an index array.
        y: encoded target array aligned with the rows of `X`.
        labels: class display names for the report and the confusion table.
        nfolds: number of folds.

    Returns:
        (clf, truths, preds): a model refit on all of `X`/`y`, plus the pooled
        true and predicted values collected in fold order.
    """
    def new_model():
        # Single place for the hyper-parameters shared by the per-fold models
        # and the final model.
        return LogisticRegression(class_weight='balanced', solver='liblinear', intercept_scaling=.3)

    truths, preds = [], []
    # NOTE(review): no shuffle is requested, so random_state presumably has no
    # effect and the folds are contiguous slices -- confirm this is intended.
    folds = KFold(len(y), nfolds, random_state=123456)
    for train_idx, test_idx in folds:
        fold_model = new_model()
        fold_model.fit(X[train_idx], y[train_idx])
        truths.extend(y[test_idx])
        preds.extend(fold_model.predict(X[test_idx]))

    print('accuracy=%.5f' % (accuracy_score(truths, preds)))
    print(classification_report(truths, preds, target_names=labels))
    print(confusion(truths, preds, labels))

    # The returned model is trained on every labeled example.
    final_model = new_model()
    final_model.fit(X, y)
    return final_model, truths, preds


clf, truths, preds = do_cv(X, y, label_encoder.classes_, 10)
top_coef(clf, features, 15)

accuracy=0.62121
             precision    recall  f1-score   support

        neg       0.64      0.95      0.76       121
    neutral       0.57      0.14      0.23        57
        pos       0.00      0.00      0.00        20

avg / total       0.55      0.62      0.53       198

           neg    neutral    pos
-------  -----  ---------  -----
neg        115          4      2
neutral     48          8      1
pos         18          2      0

CLASS 0
of	0.586
twit_hndl_bankabcd	0.394
9	0.374
had	0.349
or	0.349
fuck	0.345
now	0.329
twit_hndl_bankabcd bank	0.327
worst	0.319
ever	0.317
a	0.313
and i	0.306
like	0.289
they	0.288
charging	0.281

CLASS 1
to	0.751
this	0.674
your bankabcd	0.665
in	0.611
be	0.602
im	0.580
to go	0.555
says	0.544
address	0.476
anniversary	0.469
god	0.465
road	0.449
home	0.424
to be	0.407
money	0.401

CLASS 2
thanks	1.113
for bankabcd	0.939
appreciate	0.938
so	0.922
my account	0.861
ass	0.838
good to	0.795
for	0.781
bad	0.767
rebankabcd appreciate	0.758
apprec

In [23]:
def do_cv_thresh(X, y, labels, thresh=.5, nfolds=10):
    """Cross-validate and print a precision/recall/F1 sweep per class.

    Per fold, a class-balanced logistic regression is trained and both its
    hard predictions and its class probabilities on the held-out rows are
    collected. For each class and each cutoff in 0.05, 0.10, ..., 1.00, a
    prediction counts as positive only if the model predicted that class with
    probability at or above the cutoff. The binary precision/recall/F1 of that
    rule is printed. The usual accuracy, classification report and confusion
    table for the unthresholded predictions follow.

    Args:
        X: feature matrix, row-indexable by an index array.
        y: encoded target array aligned with the rows of `X`.
        labels: class display names; position i is compared against encoded
            class value i.
        thresh: accepted for call compatibility but not used. The sweep always
            covers the fixed cutoff grid. (The sweep variable used to shadow
            this parameter, which hid the fact that it was ignored.)
        nfolds: number of folds.

    Returns:
        (clf, truths, preds): a model refit on all of `X`/`y`, plus the pooled
        true and (unthresholded) predicted values.
    """
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    fold_probas = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='balanced')
        clf.fit(X[train], y[train])
        fold_probas.append(clf.predict_proba(X[test]))
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
    # Stack once, in fold order, so row j lines up with preds[j] / truths[j].
    probas = np.vstack(fold_probas)

    # Sweep the cutoff grid for each class and report precision/recall/F1.
    thresholds = np.arange(1, 21) * .05
    print(thresholds)
    for i, l in enumerate(labels):
        print('label=', l)
        # One-vs-rest truth vector; it does not depend on the cutoff.
        newtruths = [1 if t == i else 0 for t in truths]
        for cutoff in thresholds:
            newpreds = [1 if p == i and probas[j][i] >= cutoff else 0
                        for j, p in enumerate(preds)]
            print(cutoff, precision_recall_fscore_support(newtruths, newpreds, average='binary'))
    print('accuracy=%.3f' % (accuracy_score(truths, preds)))
    print(classification_report(truths, preds, target_names=labels))
    print(confusion(truths, preds, labels))
    clf = LogisticRegression(class_weight='balanced')
    clf.fit(X, y)
    return clf, truths, preds

clf, truths, preds = do_cv_thresh(X, y, label_encoder.classes_, .1, 10)

[ 0.05  0.1   0.15  0.2   0.25  0.3   0.35  0.4   0.45  0.5   0.55  0.6
  0.65  0.7   0.75  0.8   0.85  0.9   0.95  1.  ]
label= neg
0.05 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.1 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.15 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.2 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.25 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.3 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.35 (0.625, 0.99173553719008267, 0.7667731629392972, None)
0.4 (0.61827956989247312, 0.95041322314049592, 0.74918566775244311, None)
0.45 (0.65517241379310343, 0.78512396694214881, 0.7142857142857143, None)
0.5 (0.68918918918918914, 0.42148760330578511, 0.52307692307692299, None)
0.55 (0.60869565217391308, 0.11570247933884298, 0.19444444444444445, None)
0.6 (0.5, 0.016528925619834711, 0.032000000000000008, None)
0.65 (0.0, 0.0, 0.0, None)
0.7 (0.0, 0.0, 0.0, None)
0.75 (0.0, 0.0, 0.0, None)
0.8 (0.0, 0.0

  'precision', 'predicted', average, warn_for)


In [24]:
# Vectorize every message with the already-fitted vectorizer (transform only,
# no refit), so the columns match the matrix the classifier was trained on.
user_texts = (msg['FullText'] for msg in user_msgs)
X_raw = vectorizer.transform(user_texts)

In [26]:
# Relabel all unlabeled tweets with the current classifier.
preds_raw = clf.predict(X_raw)
probas_raw = clf.predict_proba(X_raw)

pred_counts = Counter(preds_raw)
print('label distribution on messages: %s' % pred_counts.most_common(3))
print('0 is negative, 1 is neutral, 2 is positive')
# Store the readable class name on each message; the encoded prediction is
# used as an index into `labels`.
for msg, encoded in zip(user_msgs, preds_raw):
    msg['Sent'] = labels[encoded]

label distribution on messages: [(0, 36466), (1, 2144), (2, 251)]
0 is negative, 1 is neutral, 2 is positive


In [27]:
# Show the text of every message labeled neutral. Compare by value: `is`
# tests object identity and only matched here through string interning.
for msg in user_msgs:
    if msg['Sent'] == 'neutral':
        print(msg['FullText'])

j. mp/ 1gfcuma# BankB closed my account for buying from coinbase & circle, is there anything that can be INTERNET
?? money be flying BankB?? literbanke strbankeht out my pockets??
. twit_hndl only employs very attractive people. i am trying to fit in. twit_hndl_BankB sunglasses are helping. INTERNET
. twit_hndl_BankA you have an antiquated online banking system plz fire whoever is working on your online banking system its an embarrassment
.. Name BankA my debit card was able to work to deposit money yesturday. but its declinded now that i need a lil today.... ill see you tommorow mr BankA... till then hppy turkey time yall!
. 500startups: break out your shades. twit_hndl_BankA# Name chief says your outlook is bright.# eyes# voice INTERNET 
. twit_hndl_BankA evidently you people think its acceptable to judge when an adult can access his own money like its my allowance
. twit_hndl_BankA simply says verify address again. wait 7-10 days. again. go get temp card from bank. again.
. twit_hnd

In [50]:
# Write the polar (non-neutral) messages to file.
count = 0  # unused in this cell; kept so the cell leaves the same globals behind
# Membership test compares by value; the previous `is 'pos'` / `is 'neg'`
# checks compared object identity and relied on string interning.
sent_msgs = [msg for msg in user_msgs if msg['Sent'] in ('pos', 'neg')]
# NOTE(review): this overwrites 'sent-msgs.json', the same file the first cell
# loads, so a later Run All starts from the already-filtered data. Confirm
# that replacing the input in place is intended.
with open('sent-msgs.json', 'w', encoding='utf-8') as out_file:
    json.dump(sent_msgs, out_file)