In [1]:
# Imports
from collections import defaultdict
import csv
import datetime
import nltk
from nltk.tokenize import casual_tokenize
import numpy as np
import os

In [2]:
# Directory prefix for the pre-trained embedding file; it is concatenated
# directly with the file name, so the trailing slash is required.
EMBEDDINGS_PATH = '../data/'

# Directory prefix for the tweet CSV files (also concatenated with the file name).
TWEET_PATH = '/mnt/mounted_bucket/'

# Lower-case weekday names. The position in this list matches
# datetime.date.weekday() (monday == 0 ... sunday == 6), which is how the
# labels produced later in the notebook index into it.
week_words = [
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday',
]

In [20]:
# Load the GloVe Twitter vectors and record which vocabulary entries are
# "weekday keywords" (purely alphabetic words that contain a weekday name).
EMBEDDING_DIM = 50  # vector length of the glove.twitter.27B.50d file

embedding_keywords = set()  # row indices of the weekday keywords
word_dict = {}              # word -> row index in word_embeddings
word_list = []              # row index -> word
embedding_rows = []         # plain lists; converted to one array after the loop

# GloVe files are distributed as UTF-8 text; pass the encoding explicitly so
# that parsing does not depend on the platform's default locale.
with open(EMBEDDINGS_PATH + "glove.twitter.27B.50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line carries no word; skip it instead of raising IndexError.
            continue
        word = fields[0]
        row_index = len(embedding_rows)
        if word.isalpha() and any(day in word for day in week_words):
            embedding_keywords.add(row_index)
        # If a word occurs more than once, the last occurrence wins in word_dict.
        word_dict[word] = row_index
        word_list.append(word)
        vector = [float(x) for x in fields[1:]]
        # Some lines do not split into exactly EMBEDDING_DIM values; zero-pad
        # short vectors and truncate long ones so every row has the same length.
        vector = (vector + [0.0] * EMBEDDING_DIM)[:EMBEDDING_DIM]
        embedding_rows.append(vector)

word_embeddings = np.array(embedding_rows)
print("Word embeddings shape:", word_embeddings.shape)
print("Number of keywords:", len(embedding_keywords))

# Map each keyword's vocabulary row to a dense feature column. Sorting makes
# the column order follow vocabulary order rather than set iteration order.
keyword_to_feat_index = {
    idx: i for i, idx in enumerate(sorted(embedding_keywords))
}

Word embeddings shape: (1193514, 50)
Number of keywords: 286


In [4]:
def get_date(d):
    """Build a ``datetime.date`` from a string that starts with ``YYYY?MM?DD``.

    Only fixed character positions are read: 0-3 for the year, 5-6 for the
    month and 8-9 for the day. The separator characters at positions 4 and 7
    and anything after position 9 are ignored.

    Raises:
        ValueError: if a slice is not an integer or the date is invalid.
    """
    year_txt, month_txt, day_txt = d[:4], d[5:7], d[8:10]
    return datetime.date(int(year_txt), int(month_txt), int(day_txt))

In [15]:
def get_tweets(tweet_file, progress_every=200000):
    """Read a tweet CSV and turn every tweet into a list of vocabulary indices.

    Each row must provide a ``postedTime`` column (parsed with ``get_date``)
    and a ``tweet`` column. The tweet text is tokenized with
    ``casual_tokenize``; tokens containing ``@`` or ``http`` are dropped,
    ``#`` characters are removed and the text is lower-cased. Every remaining
    whitespace-separated word is replaced by its index in ``word_dict``, or by
    ``-1`` when the word is not in the vocabulary.

    Args:
        tweet_file: File name of the CSV, appended to ``TWEET_PATH``.
        progress_every: Print a progress line after this many rows. A falsy
            value (``0`` or ``None``) disables progress output.

    Returns:
        A ``defaultdict(list)`` mapping each ``datetime.date`` to the list of
        index lists of the tweets posted on that date.
    """
    date_to_tweets = defaultdict(list)
    missing_words = set()
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed as part of the field.
    # NOTE(review): the file encoding is still the platform default -- confirm
    # the export's encoding and pass it explicitly if it is known.
    with open(TWEET_PATH + tweet_file, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for cnt, row in enumerate(reader, start=1):
            date = get_date(row['postedTime'])
            kept_tokens = [
                token for token in casual_tokenize(row['tweet'])
                if '@' not in token and 'http' not in token
            ]
            # Join, clean and re-split so that tokens reduced to nothing by the
            # '#' removal (e.g. a bare '#') disappear.
            tweet = ' '.join(kept_tokens).replace('#', '').lower()
            tweet_embedding = []
            for word in tweet.split():
                if word in word_dict:
                    tweet_embedding.append(word_dict[word])
                else:
                    tweet_embedding.append(-1)
                    missing_words.add(word)
            date_to_tweets[date].append(tweet_embedding)
            if progress_every and cnt % progress_every == 0:
                print(str(cnt), 'tweets processed...')
    print("Number of missing words:", len(missing_words))
    return date_to_tweets

In [25]:
def get_features_and_labels(date_to_tweets):
    """Build one keyword-frequency vector and one weekday label per date.

    For every date, the occurrences of each index in ``embedding_keywords``
    are counted over all of that date's tweets, placed in the column given by
    ``keyword_to_feat_index`` and divided by the number of tweets on that date.

    Args:
        date_to_tweets: Mapping from ``datetime.date`` to a list of tweets,
            each tweet being a list of vocabulary indices.

    Returns:
        ``(features, labels)``: a float32 array with one row per date and one
        column per keyword, and an array of ``date.weekday()`` values.
    """
    n_keywords = len(embedding_keywords)
    rows = []
    weekdays = []
    for day, day_tweets in date_to_tweets.items():
        # Tally with Python ints first, then convert once to float32.
        tallies = [0] * n_keywords
        for token_ids in day_tweets:
            for token_id in token_ids:
                if token_id in embedding_keywords:
                    tallies[keyword_to_feat_index[token_id]] += 1
        per_tweet_rate = np.array(tallies, dtype=np.float32) / len(day_tweets)
        rows.append(per_tweet_rate)
        weekdays.append(day.weekday())
    return np.array(rows), np.array(weekdays)

In [13]:
def analyze_features(features, labels):
    """Compare each day's most frequent weekday keyword with its true weekday.

    For every row of ``features`` the column with the largest value is mapped
    back to its vocabulary word. If that word is a plain weekday name (or the
    name plus ``'s'``), the confusion matrix cell ``[true weekday][named
    weekday]`` is incremented. Any other keyword (e.g. a hashtag-style
    compound) is printed together with the true weekday and its value.

    Rows whose largest value is not positive contain no keyword at all and are
    skipped; otherwise ``argmax`` would return column 0 and that keyword would
    be reported as the winner.

    Args:
        features: 2-D array with one row per day and one column per keyword,
            laid out according to ``keyword_to_feat_index``.
        labels: Sequence of weekday indices (0 == monday), one per row.

    Returns:
        A ``(7, 7)`` int32 array indexed as ``[true weekday][predicted weekday]``.
    """
    conf_matrix = np.zeros((7, 7), dtype=np.int32)

    # Accept both the weekday name and its simple plural ("mondays").
    day_dict = {}
    for day_index, word in enumerate(week_words):
        day_dict[word] = day_index
        day_dict[word + 's'] = day_index

    # Invert the keyword -> column mapping once instead of scanning it per row.
    feat_index_to_keyword = {
        feat_idx: word_idx for word_idx, feat_idx in keyword_to_feat_index.items()
    }

    for i in range(features.shape[0]):
        max_idx = int(np.argmax(features[i]))
        if features[i][max_idx] <= 0:
            # No keyword occurred on this day, so there is nothing to classify.
            continue
        key = feat_index_to_keyword.get(max_idx)
        if key is None:
            # Column has no associated keyword.
            continue
        keyword = word_list[key]
        if keyword not in day_dict:
            print("true:", week_words[labels[i]], "max:", keyword, "count:", features[i][max_idx])
        else:
            conf_matrix[labels[i]][day_dict[keyword]] += 1
    return conf_matrix

In [16]:
# Tokenize and index every tweet in the Delhi CSV, grouped by posting date.
# This reads the whole file and prints a progress line every 200000 tweets.
date_to_tweets = get_tweets('Delhi_tweets.csv')

200000 tweets processed...
400000 tweets processed...
600000 tweets processed...
800000 tweets processed...
1000000 tweets processed...
1200000 tweets processed...
1400000 tweets processed...
1600000 tweets processed...
1800000 tweets processed...
2000000 tweets processed...
2200000 tweets processed...
2400000 tweets processed...
2600000 tweets processed...
2800000 tweets processed...
3000000 tweets processed...
3200000 tweets processed...
3400000 tweets processed...
3600000 tweets processed...
3800000 tweets processed...
4000000 tweets processed...
4200000 tweets processed...
4400000 tweets processed...
4600000 tweets processed...
4800000 tweets processed...
5000000 tweets processed...
5200000 tweets processed...
5400000 tweets processed...
5600000 tweets processed...
5800000 tweets processed...
6000000 tweets processed...
6200000 tweets processed...
Number of missing words: 1044520


In [26]:
# One row per date: per-tweet frequency of each weekday keyword, plus the
# date's weekday index as the label.
features, labels = get_features_and_labels(date_to_tweets)
print('Features shape:', features.shape)
print('Labels shape:', labels.shape)

Features shape: (1045, 286)
Labels shape: (1045,)


In [27]:
# Rows are the true weekday, columns the weekday named by each date's most
# frequent keyword; keywords that are not plain weekday names are printed
# by analyze_features instead of being counted.
conf_matrix = analyze_features(features, labels)
print(conf_matrix)

true: monday max: mondaymotivation count: 0.0028404845
true: monday max: mondaymotivation count: 0.0034078518
true: monday max: mondaymotivation count: 0.004001715
true: saturday max: sexysaturday count: 0.0052533993
true: friday max: fridayfeeling count: 0.010262562
true: wednesday max: winewednesday count: 0.0021008404
true: thursday max: throwbackthursday count: 0.0013708019
true: saturday max: supersaturday count: 0.0031308704
true: thursday max: throwbackthursday count: 0.00080153893
true: monday max: mondaymotivation count: 0.0041007614
true: thursday max: throwbackthursday count: 0.00062578224
true: monday max: mondaymotivation count: 0.0027737226
true: monday max: mondaymotivation count: 0.0043649063
true: thursday max: goodfriday count: 0.0005947071
true: wednesday max: winewednesday count: 0.0016709259
true: monday max: mondaymotivation count: 0.0053823018
true: tuesday max: transformationtuesday count: 0.00063582894
true: monday max: mondaymotivation count: 0.0039770217
true