In [24]:
import json
with open('user-tweets.json') as file:
    user_msgs = json.load(file)  

In [41]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder

with open('sent-classified.json') as file:
    labeled_msgs = json.load(file)
file.close()
    
print('read %d labeled tweets' % len(labeled_msgs))

label_map = {'0': 'N/A', '-': 'neg', '+': 'pos'}
labels = ['N/A', 'neg', 'pos']

count = 0
for msg in labeled_msgs:
    if msg['Sent'] is '':
        msg['Sent'] = '0'
    msg['Sent'] = label_map[msg['Sent']]
        
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y = label_encoder.transform([msg['Sent'] for msg in labeled_msgs])             
print('Label distribution=%s' % Counter(msg['Sent'] for msg in labeled_msgs).most_common(3))

read 400 labeled tweets
Label distribution=[('N/A', 207), ('neg', 149), ('pos', 44)]


In [42]:
import re
import string
def tokenize(text):
    punc_re = '[' + re.escape(string.punctuation) + ']'
    text = text.lower()
    text = re.sub(r'(.)\1\1\1+', r'\1', text)
    text = re.sub(r'[0-9]', '9', text)
    text = re.sub('bank(\S+)', 'bankabcd', text)
    text = re.sub('bank(\S+)', 'bankabcd', text)
    text = re.sub('#bank(\S+)', '#bankabcd', text)
    text = re.sub('twit_hndl_bank(\S+)', 'twit_hndl_bankabcd', text)
    toks = []
    for tok in text.split():
        tok = re.sub(r'^(' + punc_re + '+)', r'\1 ', tok)
        tok = re.sub(r'(' + punc_re + '+)$', r' \1', tok)
        for subtok in tok.split():
            if re.search('\w', subtok):
                toks.append(subtok)
    return toks

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(decode_error='ignore', ngram_range=(1, 2), max_df=1., min_df=2,
                             use_idf=True, tokenizer=tokenize, binary=False, norm='l2')
X = vectorizer.fit_transform(msg['FullText'] for msg in labeled_msgs)
print('Vectorized %d tweets. Found %d terms.' % (X.shape[0], X.shape[1]))
features = np.array(vectorizer.get_feature_names())

Vectorized 400 tweets. Found 1564 terms.


In [44]:
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_recall_fscore_support
from tabulate import tabulate

def confusion(truths, preds, labels):
    m = confusion_matrix(truths, preds)
    m = np.vstack((labels, m))
    m = np.hstack((np.matrix([''] + list(labels)).T, m))
    return tabulate(m.tolist(), headers='firstrow')

def top_coef(clf, vocab, n=10):
    if len(clf.classes_) == 2:
        coefs = [clf.coef_[0], -clf.coef_[0]]
    else:
        coefs = clf.coef_
    for li, label in enumerate(clf.classes_):
        print('\nCLASS %s' % label)
        coef = coefs[li]
        top_coef_ind = np.argsort(coef)[::-1][:n]
        top_coef_terms = vocab[top_coef_ind]
        top_coef = coef[top_coef_ind]
        print('\n'.join(['%s\t%.3f' % (term, weight) for term, weight in zip(top_coef_terms, top_coef)]))

def do_cv(X, y, labels, nfolds=10):
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='balanced', solver='liblinear', intercept_scaling=.3)
        clf.fit(X[train], y[train])
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
    print('accuracy=%.5f' % (accuracy_score(truths, preds)))
    print(classification_report(truths, preds, target_names=labels))
    print(confusion(truths, preds, labels))
    clf = LogisticRegression(class_weight='balanced', solver='liblinear', intercept_scaling=.3)
    clf.fit(X, y)
    return clf, truths, preds
clf, truths, preds = do_cv(X, y, label_encoder.classes_, 10)
top_coef(clf, features, 5)

accuracy=0.73500
             precision    recall  f1-score   support

        N/A       0.73      0.88      0.80       207
        neg       0.75      0.70      0.72       149
        pos       0.70      0.16      0.26        44

avg / total       0.73      0.73      0.71       400

       N/A    neg    pos
---  -----  -----  -----
N/A    183     22      2
neg     44    104      1
pos     24     13      7

CLASS 0
name	1.662
internet	1.138
at	1.067
at bankabcd	0.806
for	0.790

CLASS 1
i	1.084
why	0.979
atm	0.881
have	0.863
is	0.855

CLASS 2
card	1.667
thanks	1.448
in bankabcd	1.317
my	1.216
much	1.183


In [45]:
def do_cv_thresh(X, y, labels, thresh=.5, nfolds=10):
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    probas = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='balanced')
        clf.fit(X[train], y[train])
        proba = clf.predict_proba(X[test])
        if len(probas) == 0:
            probas = proba
        else:
            probas = np.vstack((probas, proba))
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
                       
    # Now deterime best threshold for each class to maximize F1.
    thresholds = np.arange(1,21) * .05
    print(thresholds)
    for i, l in enumerate(labels):
        print('label=', l)
        for thresh in thresholds:
            newpreds = [1 if l2==i and probas[j][i] >= thresh else 0 for j, l2 in enumerate(preds)]
            newtruths = [1 if t==i else 0 for t in truths]
            #print Counter(newpreds)
            print(thresh, precision_recall_fscore_support(newtruths, newpreds, average='binary'))
    print('accuracy=%.3f' % (accuracy_score(truths, preds)))
    print(classification_report(truths, preds, target_names=labels))
    print(confusion(truths, preds, labels))
    clf = LogisticRegression(class_weight='balanced')
    clf.fit(X, y)
    return clf, truths, preds

clf, truths, preds = do_cv_thresh(X, y, label_encoder.classes_, .1, 10)

[ 0.05  0.1   0.15  0.2   0.25  0.3   0.35  0.4   0.45  0.5   0.55  0.6
  0.65  0.7   0.75  0.8   0.85  0.9   0.95  1.  ]
label= N/A
0.05 (0.71317829457364346, 0.88888888888888884, 0.79139784946236569, None)
0.1 (0.71317829457364346, 0.88888888888888884, 0.79139784946236569, None)
0.15 (0.71317829457364346, 0.88888888888888884, 0.79139784946236569, None)
0.2 (0.71317829457364346, 0.88888888888888884, 0.79139784946236569, None)
0.25 (0.71317829457364346, 0.88888888888888884, 0.79139784946236569, None)
0.3 (0.71317829457364346, 0.88888888888888884, 0.79139784946236569, None)
0.35 (0.71595330739299612, 0.88888888888888884, 0.79310344827586199, None)
0.4 (0.76086956521739135, 0.84541062801932365, 0.8009153318077803, None)
0.45 (0.80208333333333337, 0.7439613526570048, 0.77192982456140358, None)
0.5 (0.88188976377952755, 0.54106280193236711, 0.6706586826347305, None)
0.55 (0.94252873563218387, 0.39613526570048307, 0.55782312925170063, None)
0.6 (0.95833333333333337, 0.22222222222222221, 0.3

  'precision', 'predicted', average, warn_for)


In [46]:
X_raw = vectorizer.transform(msg['FullText'] for msg in user_msgs)

In [47]:
# Relabel all unlabeled tweets.
probas_raw = clf.predict_proba(X_raw)
preds_raw = clf.predict(X_raw)

print('label distribution on messages: %s' % Counter(preds_raw).most_common(3))
print('0 is N/A, 1 is positive, 2 is negative')
for msg, pred in zip(user_msgs, preds_raw):
    msg['Sent'] = labels[pred]

label distribution on messages: [(0, 60541), (1, 36772), (2, 2089)]
0 is N/A, 1 is positive, 2 is negative


In [50]:
#write to file
count = 0
sent_msgs = []
for msg in user_msgs:
    if msg['Sent'] is 'pos' or msg['Sent'] is 'neg':
        sent_msgs.append(msg)
with open('sent-msgs.json', 'w') as file:
    json.dump(sent_msgs, file) 