In [33]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from collections import defaultdict
import csv
import datetime
import html
import itertools
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import casual_tokenize
import numpy as np
import os
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

In [2]:
# Directory containing the per-city tweet CSV files (e.g. `Kohima_tweets.csv`).
# get_features joins it to file names by plain string concatenation, so the
# trailing slash is required.
TWEET_PATH = 'deep_learning/data/city_tweets/'
# Number of monthly buckets. get_features maps January 2014 to index 0, so
# 35 buckets cover January 2014 through November 2016.
NUM_MONTHS = 35

In [3]:
def get_date(d):
    """Build a ``datetime.date`` from a string that starts with ``YYYY?MM?DD``.

    Only characters 0-3 (year), 5-6 (month) and 8-9 (day) are read; the
    separator characters and anything after position 10 are ignored.

    Args:
        d: Timestamp string whose first ten characters hold the date.

    Returns:
        The corresponding ``datetime.date``.

    Raises:
        ValueError: If a field is not an integer or the date is invalid.
    """
    year, month, day = (int(field) for field in (d[:4], d[5:7], d[8:10]))
    return datetime.date(year, month, day)

In [32]:
def get_features(tweet_file, food_name, city_month_features):
    """Accumulate per-month tweet features for one tweet CSV file.

    Every row of the CSV is assigned to a monthly bucket (index 0 is
    January 2014). For each bucket that received at least one tweet, a feature
    vector is stored in ``city_month_features`` under the key
    ``"<prefix>_<month_index>"``, where ``<prefix>`` is the file name up to
    ``"_tweets"``. The vector holds the fraction of tweets mentioning
    ``food_name`` followed by the mean of each VADER polarity score, in the
    order the analyzer reports them.

    Rows whose date falls outside the ``NUM_MONTHS`` window are skipped and
    counted instead of being written to a wrong bucket: a negative index would
    otherwise wrap around to the end of the arrays, and an index past the end
    would abort the whole file.

    Args:
        tweet_file: File name (relative to ``TWEET_PATH``) of a CSV with
            ``postedTime`` and ``tweet`` columns.
        food_name: Substring searched for, case-insensitively, in the cleaned
            tweet text.
        city_month_features: Dict updated in place with the feature vectors.

    Returns:
        None. Results are written into ``city_month_features``.
    """
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = [defaultdict(float) for _ in range(NUM_MONTHS)]
    food_cnt = np.zeros(NUM_MONTHS, dtype=np.float32)
    month_cnt = np.zeros(NUM_MONTHS, dtype=np.int32)

    # str.find returns -1 when the marker is missing, which would silently
    # drop the last character of the name; fall back to stripping the extension.
    marker_pos = tweet_file.find("_tweets")
    if marker_pos != -1:
        city_month = tweet_file[:marker_pos]
    else:
        city_month = os.path.splitext(tweet_file)[0]

    # The tweet text is lowercased before matching, so lowercase the needle too.
    needle = food_name.lower()
    skipped = 0

    print("Processing:", tweet_file)
    with open(TWEET_PATH + tweet_file) as csvfile:
        reader = csv.DictReader(csvfile)
        for cnt, row in enumerate(reader, 1):
            date = get_date(row['postedTime'])
            month_index = (date.year - 2014)*12 + (date.month - 1)
            if 0 <= month_index < NUM_MONTHS:
                # Drop @-mentions and links before scoring sentiment.
                tweet = ' '.join([word for word in casual_tokenize(row['tweet'])
                                  if '@' not in word and 'http' not in word])
                scores = sid.polarity_scores(tweet)
                for k, v in scores.items():
                    sentiment_scores[month_index][k] += v
                if needle in tweet.lower():
                    food_cnt[month_index] += 1
                month_cnt[month_index] += 1
            else:
                skipped += 1
            if (cnt % 200000 == 0):
                print(str(cnt),'tweets processed...')

    if skipped:
        print(str(skipped), 'tweets skipped (outside month window)')

    for i in range(NUM_MONTHS):
        if month_cnt[i] != 0:
            cur_feat = [food_cnt[i]/month_cnt[i]]
            for k in sentiment_scores[i].keys():
                cur_feat.append(sentiment_scores[i][k]/month_cnt[i])
            city_month_features[city_month + "_" + str(i)] = np.array(cur_feat)

In [5]:
def get_labels(path):
    """Collect the leading integer labels from every batch file in ``path``.

    For each directory entry, the key is the file name up to ``"_batch"`` and
    the value is an integer array built from the first (at most) two
    comma-separated fields of the file's first line. Empty files contribute
    no entry. If several files share a key, the one listed last wins.

    Args:
        path: Directory path; it is concatenated directly with each file name,
            so it must end with a path separator.

    Returns:
        Dict mapping key -> ``np.ndarray`` of the parsed integers.
    """
    labels_by_key = {}
    for filename in os.listdir(path):
        key = filename[:filename.find("_batch")]
        with open(path + filename, "r") as batchf:
            # Only the first line matters; None signals an empty file.
            first_line = next(batchf, None)
            if first_line is not None:
                leading_fields = first_line.split(',')[0:2]
                labels_by_key[key] = np.array([int(x) for x in leading_fields])
    return labels_by_key

In [6]:
def write_file(features, labels, filename):
    """Write features and labels to a whitespace-separated text file.

    Each output line holds one row of ``features`` formatted with ten decimal
    places (each value followed by a space), then the two integer labels for
    that row, separated by a space.

    The file is opened with a context manager so the handle is closed even if
    formatting or indexing raises part-way through.

    Args:
        features: 2-D array of shape (rows, n_features).
        labels: Indexable with at least ``rows`` entries of two integers each.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w") as out:
        for i in range(features.shape[0]):
            feature_text = "".join("%.10f " % features[i][j]
                                   for j in range(features.shape[1]))
            out.write(feature_text)
            out.write('%d %d\n' % (labels[i][0], labels[i][1]))

In [7]:
def aggregate_features_and_labels(city_month_features):
    """Pair feature vectors with train/validation labels and save both sets.

    Labels are read with ``get_labels`` from the train and validation batch
    directories. Keys without a feature vector are dropped. The shapes of the
    four resulting arrays are printed (train features, eval features, train
    labels, eval labels), and the sets are written to ``train_baseline.txt``
    and ``eval_baseline.txt`` via ``write_file``.

    Args:
        city_month_features: Dict mapping key -> feature vector.
    """
    def _select(labels_by_key):
        # Keep only keys that also have features, in the label dict's order.
        shared_keys = [key for key in labels_by_key.keys() if key in city_month_features]
        selected_features = np.array([city_month_features[key] for key in shared_keys])
        selected_labels = np.array([labels_by_key[key] for key in shared_keys])
        return selected_features, selected_labels

    city_month_labels_train = get_labels('deep_learning/data/batches_train/')
    city_month_labels_eval = get_labels('deep_learning/data/batches_val/')

    train_features, train_labels = _select(city_month_labels_train)
    eval_features, eval_labels = _select(city_month_labels_eval)

    for arr in (train_features, eval_features, train_labels, eval_labels):
        print(arr.shape)

    write_file(train_features, train_labels, "train_baseline.txt")
    write_file(eval_features, eval_labels, "eval_baseline.txt")

In [8]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current figure and save it as a PNG.

    Args:
        cm: 2-D NumPy array; rows are true classes, columns predicted classes.
        classes: Sequence of tick labels, one per class.
        normalize: If True, each row is divided by its sum before plotting.
            Rows that sum to zero (a class with no true samples) are shown as
            zeros instead of NaN.
        title: Plot title; also used as the output file name (``title + '.png'``).
        cmap: Matplotlib colormap for the image.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row has samples; empty rows keep the zeros in `out`.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # 'd' only accepts integers; raw counts held in a float array would raise,
    # so fall back to the general format for non-integer dtypes.
    if normalize:
        fmt = '.2f'
    elif np.issubdtype(cm.dtype, np.integer):
        fmt = 'd'
    else:
        fmt = 'g'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center", fontsize=16,
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()
    plt.savefig(title + '.png')

In [9]:
def get_conf_matrix(pred, labels, classes, title):
    """Count (true, predicted) pairs and plot the row-normalised matrix.

    A new figure is opened, a ``len(classes)`` square float32 count matrix is
    filled with one increment per sample (row = true label, column =
    prediction), and the result is handed to ``plot_confusion_matrix`` with
    ``normalize=True``.

    Args:
        pred: Integer class predictions, one per sample.
        labels: 1-D integer array of true classes.
        classes: Sequence of class names; its length sets the matrix size.
        title: Title forwarded to ``plot_confusion_matrix``.
    """
    plt.figure()
    n_classes = len(classes)
    counts = np.zeros((n_classes, n_classes), dtype=np.float32)
    n_samples = labels.shape[0]
    # ufunc.at accumulates repeated index pairs, which plain fancy `+=` would not.
    np.add.at(counts, (labels, pred[:n_samples]), 1)
    plot_confusion_matrix(counts,
                          classes=classes,
                          normalize=True,
                          title=title)

In [10]:
# Build the feature table: each file in TWEET_PATH is scanned once, and
# get_features adds one entry per month that has tweets. "onion" is the
# substring whose mention rate becomes the first feature.
city_month_features = {}
for tweets_file in os.listdir(TWEET_PATH):
    get_features(tweets_file, "onion", city_month_features)
# Join the features with the train/validation labels and write
# train_baseline.txt / eval_baseline.txt.
aggregate_features_and_labels(city_month_features)

('Processing:', 'Kohima_tweets.csv')
('Processing:', 'Cuttack_tweets.csv')
('Processing:', 'Lucknow_tweets.csv')
('200000', 'tweets processed...')
('400000', 'tweets processed...')
('Processing:', 'Panchkula_tweets.csv')
('Processing:', 'Siliguri_tweets.csv')
('Processing:', 'Delhi_tweets.csv')
('200000', 'tweets processed...')
('400000', 'tweets processed...')
('600000', 'tweets processed...')
('800000', 'tweets processed...')
('1000000', 'tweets processed...')
('1200000', 'tweets processed...')
('1400000', 'tweets processed...')
('1600000', 'tweets processed...')
('1800000', 'tweets processed...')
('2000000', 'tweets processed...')
('2200000', 'tweets processed...')
('2400000', 'tweets processed...')
('2600000', 'tweets processed...')
('2800000', 'tweets processed...')
('3000000', 'tweets processed...')
('3200000', 'tweets processed...')
('3400000', 'tweets processed...')
('3600000', 'tweets processed...')
('3800000', 'tweets processed...')
('4000000', 'tweets processed...')
('420000

In [15]:
# Dump every feature vector as one tab-separated line: key, then each value
# with ten decimal places. The context manager closes the file even if a
# write fails part-way through.
with open('deep_learning/data/tweet_features.txt', "w") as f:
    for city_month, feat in city_month_features.items():
        f.write(city_month)
        for i in range(feat.shape[0]):
            f.write("\t%.10f" % feat[i])
        f.write("\n")

In [16]:
# Reload the baseline files written by write_file: on every line the last two
# tokens are the integer labels and everything before them is a float feature.
train_features, eval_features = [], []
train_labels, eval_labels = [], []

# Context managers close both files; previously only the eval file was closed.
with open("train_baseline.txt", "r") as train_f:
    for line in train_f:
        fields = line.split()
        train_features.append([float(x) for x in fields[:-2]])
        train_labels.append([int(x) for x in fields[-2:]])
with open("eval_baseline.txt", "r") as eval_f:
    for line in eval_f:
        fields = line.split()
        eval_features.append([float(x) for x in fields[:-2]])
        eval_labels.append([int(x) for x in fields[-2:]])

train_features = np.array(train_features)
eval_features = np.array(eval_features)
train_labels = np.array(train_labels)
eval_labels = np.array(eval_labels)
print(train_features.shape)
print(eval_features.shape)
print(train_labels.shape)
print(eval_labels.shape)

(186, 5)
(70, 5)
(186, 2)
(70, 2)


In [31]:
# Ridge baseline for the three-class price-direction target (label column 0).
# NOTE(review): the `normalize` argument was deprecated and later removed from
# scikit-learn's ridge estimators -- confirm the installed version accepts it.
clf_price_direction = linear_model.RidgeClassifier(alpha=0.01, normalize=True)
train_direction = train_labels[:, 0]
clf_price_direction.fit(train_features, train_direction)

# Class histogram of the training labels.
train_label_dist = np.zeros(3, dtype=np.int32)
for label in train_direction:
    train_label_dist[label] += 1
print(train_label_dist)

# Class histogram of the evaluation labels.
eval_direction = eval_labels[:, 0]
eval_label_dist = np.zeros(3, dtype=np.int32)
for label in eval_direction:
    eval_label_dist[label] += 1
print(eval_label_dist)

print("Train accuracy:", clf_price_direction.score(train_features, train_direction))
print("Evaluation accuracy:", clf_price_direction.score(eval_features, eval_direction))
print("Coefficients:", clf_price_direction.coef_)

# One confusion-matrix figure per split, saved under the given title.
direction_classes = ['decrease', 'no change', 'increase']
get_conf_matrix(clf_price_direction.predict(train_features), train_direction,
                direction_classes, 'Ridge Classifier Price Direction Train')
get_conf_matrix(clf_price_direction.predict(eval_features), eval_direction,
                direction_classes, 'Ridge Classifier Price Direction Validation')

[69 55 62]
[31 18 21]
('Train accuracy:', 0.44086021505376344)
('Evaluation accuracy:', 0.5142857142857142)
('Coefficients:', array([[-257.8800819 ,   29.50878007,   -2.61738445,  -21.09057279,
          14.35929479],
       [-502.11986814,  -25.62925131,    3.90576732,   13.21348329,
          -9.63812288],
       [ 759.99995005,   -3.87952876,   -1.28838287,    7.87708949,
          -4.72117192]]))


In [43]:
# Ridge baseline for the binary price-spike target (label column 1).
# NOTE(review): the `normalize` argument was deprecated and later removed from
# scikit-learn's ridge estimators -- confirm the installed version accepts it.
clf_price_spike = linear_model.RidgeClassifier(alpha=0.01, normalize=True)
train_spike = train_labels[:, 1]
clf_price_spike.fit(train_features, train_spike)

# Class histogram of the training labels.
train_label_dist = np.zeros(2, dtype=np.int32)
for label in train_spike:
    train_label_dist[label] += 1
print(train_label_dist)

# Class histogram of the evaluation labels.
eval_spike = eval_labels[:, 1]
eval_label_dist = np.zeros(2, dtype=np.int32)
for label in eval_spike:
    eval_label_dist[label] += 1
print(eval_label_dist)

print("Train accuracy:", clf_price_spike.score(train_features, train_spike))
print("Evaluation accuracy:", clf_price_spike.score(eval_features, eval_spike))
print("Coefficients:", clf_price_spike.coef_)

# One confusion-matrix figure per split, saved under the given title.
spike_classes = ['no spike', 'spike']
get_conf_matrix(clf_price_spike.predict(train_features), train_spike,
                spike_classes, 'Ridge Classifier Price Spike Train')
get_conf_matrix(clf_price_spike.predict(eval_features), eval_spike,
                spike_classes, 'Ridge Classifier Price Spike Validation')

[104  82]
[30 40]
('Train accuracy:', 0.6236559139784946)
('Evaluation accuracy:', 0.4857142857142857)
('Coefficients:', array([[496.88124087,   4.44311155,  -1.81362203, -11.91238662,
          9.74698041]]))
