In [None]:
#Parses .txt file to python dictionary
import codecs

f = codecs.open('data.txt', 'r', encoding='utf-8', errors='ignore')
line = f.readline()
keys = line.split('|')
#remove '/n/r'
keys[-1] = keys[-1][:-2]
data = []
while True:
    try:
        line = f.readline()
        values = line.split('|')
        msg = dict(zip(keys,values))
        #remove '/n/r'
        msg['FullText'] = msg['FullText'][:-2]
        data.append(msg.copy())
    except KeyError:
        break

import json
with open('wells-fargo-dict.json', 'w') as outfile:
    json.dump(data, outfile)

In [118]:
#read in data
import json
with open('wells-fargo-dict.json') as data_file:    
    data = json.load(data_file)
    
print('Total messages:', len(data))

Total messages: 220377


In [119]:
#include only unique messages

prune1 = []
msgs = set()
#go through every msg
for item in data:
    text = item['FullText']
    if text not in msgs:
        msgs.add(text)
        prune1.append(item)
        
print('Messages Remaining:', len(prune1))

Messages Remaining: 206491


In [120]:
#remove all tweets that cannot be attributed to a bank

bank_words = ['BankA', 'BankB', 'BankC', 'BankD', '#BankA', '#BankB', '#BankC', '#BankD', 'twit_hndl_BankA', 'twit_hndl_BankB', 'twit_hndl_BankC', 'twit_hndl_BankD']
prune2 = []
for item in prune1:
    if any(word in item['FullText'] for word in bank_words):
        prune2.append(item)
        print(item['FullText'])

print('Messages Remaining:', len(prune2))

Messages Remaining: 186498


In [121]:
#remove messages of banks responding to customers
prune3 = []
for item in prune2:
    if 'Name_Resp' not in item['FullText']:
        prune3.append(item)
        
print('Messages Remaining:',len(prune3))

Messages Remaining: 183050


In [122]:
#remove messages from employees and interview candidates
interview_words = ['interview', 'interviews', 'got the job', 'hiring at', 'hired']
prune4 = []
for item in prune3:
    if all(word not in item['FullText'] for word in interview_words):
        prune4.append(item)
print('Messages Remaining:',len(prune4))        

Messages Remaining: 182246


In [123]:
#remove messages about bank sponsored stadiums and arenas
sports_words = ['stadium', 'playoffs', 'arena', 'preseason', 'center for the arts', 'BankA center', 'BankA building', 'game day']
prune5 = []
for item in prune4:
    if all(word not in item['FullText'] for word in sports_words):
        prune5.append(item)
print('Messages Remaining:',len(prune5))

Messages Remaining: 176660


In [124]:
#remove spam messages about 'mission main street'
prune6 = []
for item in prune5:
    if 'mission main street' not in item['FullText']:
        prune6.append(item)
print('Messages Remaining:',len(prune6))

Messages Remaining: 171054


In [125]:
#remove messages for #getcollegeready contest
prune7 = []
for item in prune6:
    if 'getcollegeready' not in item['FullText']:
        prune7.append(item)
print('Messages Remaining:',len(prune7))

Messages Remaining: 166297


In [126]:
percent_remaining = 100-(len(prune7)/len(data))*100
print('Percent of tweets remaining after prune:', percent_remaining)

Percent of tweets remaining after prune: 24.539765946537074


In [127]:
with open('news-classified.json') as file:
    labeled_msgs = json.load(file)

msgs = []
for msg in labeled_msgs:
    msgs.append(msg['FullText'])

In [128]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder

with open('news-classified.json') as file:
    labeled_msgs = json.load(file)
file.close()
    
with open('news-classified2.json') as file2:
    labeled_msgs2 = json.load(file2)
file2.close()

for msg in labeled_msgs2:
    labeled_msgs.append(msg)
    
print(len(labeled_msgs))
    
print('read %d labeled tweets' % len(labeled_msgs))

label_map = {'n': 'not news', 'y': 'news'}
labels = ['news', 'not news']

for msg in labeled_msgs:
    msg['News'] = label_map[msg['News']]
    
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y = label_encoder.transform([msg['News'] for msg in labeled_msgs])             
print('Label distribution=%s' % Counter(msg['News'] for msg in labeled_msgs).most_common(2))

400
read 400 labeled tweets
Label distribution=[('not news', 275), ('news', 125)]


In [129]:
# Tweet tokenizer.
import re
import string
def tokenize(text):
    punc_re = '[' + re.escape(string.punctuation) + ']'
    text = text.lower()
    text = re.sub(r'(.)\1\1\1+', r'\1', text)
    text = re.sub(r'[0-9]', '9', text)
    text = re.sub('bank(\S+)', 'bankabcd', text)
    text = re.sub('bank(\S+)', 'bankabcd', text)
    text = re.sub('#bank(\S+)', '#bankabcd', text)
    text = re.sub('twit_hndl_bank(\S+)', 'twit_hndl_bankabcd', text)
    toks = []
    for tok in text.split():
        tok = re.sub(r'^(' + punc_re + '+)', r'\1 ', tok)
        tok = re.sub(r'(' + punc_re + '+)$', r' \1', tok)
        for subtok in tok.split():
            if re.search('\w', subtok):
                toks.append(subtok)
    return toks

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(decode_error='ignore', ngram_range=(1, 2), max_df=1., min_df=2,
                             use_idf=True, tokenizer=tokenize, binary=False, norm='l2')
X = vectorizer.fit_transform(msg['FullText'] for msg in labeled_msgs)
print('Vectorized %d tweets. Found %d terms.' % (X.shape[0], X.shape[1]))
features = np.array(vectorizer.get_feature_names())

Vectorized 400 tweets. Found 1366 terms.


In [131]:
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_recall_fscore_support
from tabulate import tabulate

def confusion(truths, preds, labels):
    m = confusion_matrix(truths, preds)
    m = np.vstack((labels, m))
    m = np.hstack((np.matrix([''] + list(labels)).T, m))
    return tabulate(m.tolist(), headers='firstrow')

def top_coef(clf, vocab, n=10):
    if len(clf.classes_) == 2:
        coefs = [clf.coef_[0], -clf.coef_[0]]
    else:
        coefs = clf.coef_
    for li, label in enumerate(clf.classes_):
        print('\nCLASS %s' % label)
        coef = coefs[li]
        top_coef_ind = np.argsort(coef)[::-1][:n]
        top_coef_terms = vocab[top_coef_ind]
        top_coef = coef[top_coef_ind]
        print('\n'.join(['%s\t%.3f' % (term, weight) for term, weight in zip(top_coef_terms, top_coef)]))

def do_cv(X, y, labels, nfolds=10):
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='balanced', solver='liblinear', intercept_scaling=.3)
        clf.fit(X[train], y[train])
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
    print('accuracy=%.5f' % (accuracy_score(truths, preds)))
    print(classification_report(truths, preds, target_names=labels))
    print(confusion(truths, preds, labels))
    clf = LogisticRegression(class_weight='balanced', solver='liblinear', intercept_scaling=.3)
    clf.fit(X, y)
    return clf, truths, preds
clf, truths, preds = do_cv(X, y, label_encoder.classes_, 10)
top_coef(clf, features, 5)

accuracy=0.83250
             precision    recall  f1-score   support

       news       0.70      0.82      0.75       125
   not news       0.91      0.84      0.87       275

avg / total       0.84      0.83      0.84       400

            news    not news
--------  ------  ----------
news         103          22
not news      45         230

CLASS 0
i	1.702
my	1.396
you	1.181
twit_hndl_bankabcd	1.181
and	0.818

CLASS 1
internet	1.738
name	1.633
ly	1.394
bankabcd name	1.231
rating	1.182


In [132]:
def do_cv_thresh(X, y, labels, thresh=.5, nfolds=10):
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    probas = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='balanced')
        clf.fit(X[train], y[train])
        proba = clf.predict_proba(X[test])
        if len(probas) == 0:
            probas = proba
        else:
            probas = np.vstack((probas, proba))
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
                       
    # Now deterime best threshold for each class to maximize F1.
    thresholds = np.arange(1,21) * .05
    print(thresholds)
    for i, l in enumerate(labels):
        print('label=', l)
        for thresh in thresholds:
            newpreds = [1 if l2==i and probas[j][i] >= thresh else 0 for j, l2 in enumerate(preds)]
            newtruths = [1 if t==i else 0 for t in truths]
            #print Counter(newpreds)
            print(thresh, precision_recall_fscore_support(newtruths, newpreds, average='binary'))
    print('accuracy=%.3f' % (accuracy_score(truths, preds)))
    print(classification_report(truths, preds, target_names=labels))
    print(confusion(truths, preds, labels))
    clf = LogisticRegression(class_weight='balanced')
    clf.fit(X, y)
    return clf, truths, preds

clf, truths, preds = do_cv_thresh(X, y, label_encoder.classes_, .1, 10)

[ 0.05  0.1   0.15  0.2   0.25  0.3   0.35  0.4   0.45  0.5   0.55  0.6
  0.65  0.7   0.75  0.8   0.85  0.9   0.95  1.  ]
label= news
0.05 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.1 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.15 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.2 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.25 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.3 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.35 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.4 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.45 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.5 (0.71739130434782605, 0.79200000000000004, 0.75285171102661608, None)
0.55 (0.77477477477477474, 0.68799999999999994, 0.72881355932203395, None)
0.6 (0.81052631578947365, 0.61599999999999999,

  'precision', 'predicted', average, warn_for)


In [133]:
final_prune = prune7
X_raw = vectorizer.transform(msg['FullText'] for msg in final_prune)

In [115]:
# Relabel all unlabeled tweets.
probas_raw = clf.predict_proba(X_raw)
preds_raw = clf.predict(X_raw)

print('label distribution on messages: %s' % Counter(preds_raw).most_common(2))
print('1 is news, 0 is not news')
for msg, pred in zip(final_prune, preds_raw):
    msg['News'] = labels[pred]

label distribution on messages: [(1, 99402), (0, 67186)]
1 is news, 0 is not news


In [117]:
#read some news tweets
for msg in final_prune[300:900]:
    if msg['News'] is 'news':
        print(msg['FullText'])

. twit_hndl_BankB twit_hndl joins twit_hndl: ow. ly/ qvyws- Name bank will offer execution and clearing services
. twit_hndl_BankB_community set Name to finance preservation Name of 1,400 public housing units units under Name. f. rad plan bit. ly/ 1uqxzry
. twit_hndl says Name Name note on BankB is the finest trolling its seen on a research note: ow. ly/ rcior
. twit_hndl_BankC and BankD  Name embrace new Name service INTERNET 
. twit_hndl_BankC dangles us$ 100k carrot at Name developers bit. ly/ 1gzm8t5 INTERNET
. twit_hndl_BankC strikes$ 180m settlement with Name over allegedly misleading marketing for two hedge funds ow. ly/ r333l
. twit_hndl stresses the importance of community, leaders & business partnerships# beattheheat twit_hndl twit_hndl_BankD INTERNET
. twit_hndl chooses twit_hndl_BankC for custody: ow. ly/ qvyvq- representing assets of$ 29.1 billion
. twit_hndl hires former BankA Name cto as Name of tech product development. twit_hndl has details bit. ly/ 1njkkqe
. twit_hndl

In [95]:
#write all of the non news tweets to a file
user_tweets = []
for msg in final_prune:
    if msg['News']  is not 'news':
        user_tweets.append(msg)
        
#write to file
with open('user-tweets.json', 'w') as file:
    json.dump(user_tweets, file)  

In [None]:
#only want tweets with sentiment