In [1]:
# Imports
from collections import defaultdict
import csv
import datetime
import nltk
from nltk.tokenize import casual_tokenize
import numpy as np
import os

In [2]:
EMBEDDINGS_PATH = '../data/'
TWEET_PATH = '../data/city_tweets/'
week_words = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

In [3]:
embedding_keywords = set()
word_dict = {}
word_list = []
word_embeddings = []
with open(EMBEDDINGS_PATH + "glove.twitter.27B.50d.txt", "r") as f:
    for line in f:
        word = line.split()[0]
        if word.isalpha():
            for day in week_words:
                if day in word:
                    embedding_keywords.add(len(word_embeddings))
                    break
        word_dict[word] = len(word_embeddings)
        word_list.append(word)
        word_embeddings.append([float(x) for x in line.split()[1:]])
        while (len(word_embeddings[-1]) < 50):
            word_embeddings[-1].append(0.0)
        word_embeddings[-1] = word_embeddings[-1][:50]
word_embeddings = np.array(word_embeddings)
print("Number of keywords:", len(embedding_keywords))
keyword_to_feat_index = {}
for i, idx in enumerate(embedding_keywords):
    keyword_to_feat_index[idx] = i

('Number of keywords:', 282)


In [4]:
def get_date(d):
    year = int(d[:4])
    month = int(d[5:7])
    day = int(d[8:10])
    return datetime.date(year, month, day)

In [5]:
def get_tweets(tweet_file, missing_words):
    print("Processing:", tweet_file)
    date_to_tweets = defaultdict(list)
    with open(TWEET_PATH + tweet_file) as csvfile:
        reader = csv.DictReader(csvfile)
        cnt = 0
        for row in reader:
            date = get_date(row['postedTime'])
            tweet = ' '.join([word for word in casual_tokenize(row['tweet']) if '@' not in word and 'http' not in word])
            tweet = tweet.replace('#', '').lower()
            tweet_embedding = []
            for word in tweet.split():
                if word in word_dict:
                    tweet_embedding.append(word_dict[word])
                else:
                    tweet_embedding.append(-1)
                    missing_words.add(word)
            date_to_tweets[date].append(tweet_embedding)
            cnt += 1
            if (cnt % 200000 == 0): 
                print(str(cnt),'tweets processed...')
    return date_to_tweets

In [6]:
def get_features_and_labels(date_to_tweets):
    features, labels = [], []
    for date in date_to_tweets:
        hist = defaultdict(int)
        for tweet in date_to_tweets[date]:
            for idx in tweet:
                if idx in embedding_keywords:
                    hist[idx] += 1
        feat = np.zeros(len(embedding_keywords), dtype=np.float32)
        for idx, cnt in hist.items():
            feat[keyword_to_feat_index[idx]] = cnt
        features.append(feat/len(date_to_tweets[date]))
        labels.append(date.weekday())
    return features, labels

In [20]:
def analyze_features(features, labels):
    conf_matrix = np.zeros((7, 7), dtype=np.int32)
    for i in range(features.shape[0]):
        max_idx = np.argmax(features[i])
        for key, val in keyword_to_feat_index.items():
            if val == max_idx:
                for j, day in enumerate(week_words):
                    if day in word_list[key]:
                        conf_matrix[labels[i]][j] += 1
                        break
                break
    return conf_matrix

In [11]:
missing_words = set()
features, labels = [], []
for tweets_file in os.listdir(TWEET_PATH):
    date_to_tweets = get_tweets(tweets_file, missing_words)
    cur_features, cur_labels = get_features_and_labels(date_to_tweets)
    features += cur_features
    labels += cur_labels
features = np.array(features)
labels = np.array(labels)
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)
conf_matrix = analyze_features(features, labels)
print(conf_matrix)
print("Number of missing words:", len(missing_words))

('Processing:', 'Kohima_tweets.csv')
('Processing:', 'Cuttack_tweets.csv')
('Processing:', 'Lucknow_tweets.csv')
('200000', 'tweets processed...')
('400000', 'tweets processed...')
('Processing:', 'Panchkula_tweets.csv')
('Processing:', 'Siliguri_tweets.csv')
('Processing:', 'Delhi_tweets.csv')
('200000', 'tweets processed...')
('400000', 'tweets processed...')
('600000', 'tweets processed...')
('800000', 'tweets processed...')
('1000000', 'tweets processed...')
('1200000', 'tweets processed...')
('1400000', 'tweets processed...')
('1600000', 'tweets processed...')
('1800000', 'tweets processed...')
('2000000', 'tweets processed...')
('2200000', 'tweets processed...')
('2400000', 'tweets processed...')
('2600000', 'tweets processed...')
('2800000', 'tweets processed...')
('3000000', 'tweets processed...')
('3200000', 'tweets processed...')
('3400000', 'tweets processed...')
('3600000', 'tweets processed...')
('3800000', 'tweets processed...')
('4000000', 'tweets processed...')
('420000

In [18]:
np.random.seed(325)
train_idx = set(list(np.random.choice(features.shape[0], 35000, replace=False)))
train_file = open(EMBEDDINGS_PATH + "dow_train_feat.txt", "w")
val_file = open(EMBEDDINGS_PATH + "dow_val_feat.txt", "w")
for i in range(features.shape[0]):
    if i in train_idx:
        for j in range(features.shape[1]):
            train_file.write("%.10f " % features[i][j])
        train_file.write('%d\n' % labels[i])
    else:
        for j in range(features.shape[1]):
            val_file.write("%.10f " % features[i][j])
        val_file.write('%d\n' % labels[i])
train_file.close()
val_file.close()

In [29]:
delhi_date_to_tweets = get_tweets("Delhi_tweets.csv", missing_words)
dfeatures, dlabels = get_features_and_labels(delhi_date_to_tweets)
dfeatures = np.array(dfeatures)
dlabels = np.array(dlabels)
print("Features shape:", dfeatures.shape)
print("Labels shape:", dlabels.shape)
dconf_matrix = analyze_features(dfeatures, dlabels)
print(dconf_matrix)
output_file = open(EMBEDDINGS_PATH + "dow_delhi_feat.txt", "w")
for i in range(dfeatures.shape[0]):
    for j in range(dfeatures.shape[1]):
        output_file.write("%.10f " % dfeatures[i][j])
    output_file.write('%d\n' % dlabels[i])
output_file.close()

('Processing:', 'Delhi_tweets.csv')
('200000', 'tweets processed...')
('400000', 'tweets processed...')
('600000', 'tweets processed...')
('800000', 'tweets processed...')
('1000000', 'tweets processed...')
('1200000', 'tweets processed...')
('1400000', 'tweets processed...')
('1600000', 'tweets processed...')
('1800000', 'tweets processed...')
('2000000', 'tweets processed...')
('2200000', 'tweets processed...')
('2400000', 'tweets processed...')
('2600000', 'tweets processed...')
('2800000', 'tweets processed...')
('3000000', 'tweets processed...')
('3200000', 'tweets processed...')
('3400000', 'tweets processed...')
('3600000', 'tweets processed...')
('3800000', 'tweets processed...')
('4000000', 'tweets processed...')
('4200000', 'tweets processed...')
('4400000', 'tweets processed...')
('4600000', 'tweets processed...')
('4800000', 'tweets processed...')
('5000000', 'tweets processed...')
('5200000', 'tweets processed...')
('5400000', 'tweets processed...')
('5600000', 'tweets pro