In [1]:
# Imports
from collections import defaultdict
from contains_city import *
import csv
import dateutil.parser
import HTMLParser
from nearest_words import read_file, word_to_embedding
import nltk
from nltk.tokenize import casual_tokenize
import numpy as np
import os

In [102]:
# Constants
PATH = '../data/'
TWEET_PATH = '../../../data_utils/city_tweets/'
EMBEDDINGS_FILE = 'glove.twitter.27B.50d.txt'
MISSING_WORDS_FILE = 'missing_words.txt'
WORDS_FILE = 'words.txt'
TWEET_CNTS_FILE = 'tweet_counts.txt'
MIN_DIST = 4.5
NUM_MONTHS = 35
MIN_TWEETS = 200 # minimum number of tweets required per city-month
K = 50 # number of tweets per batch
NUM_RESAMPLES = 5 # number of times to resample from city-month tweets to generate batches

In [3]:
# Global variables
word_to_idx = dict() # word to index in glove embeddings file
city_to_changes = defaultdict(lambda: defaultdict(lambda: [0, 0])) # city to month to [trend, spike]

In [4]:
def create_words_file():
    print 'Begin creating words file...'
    with open(WORDS_FILE, 'w') as wf:
        with open(PATH + EMBEDDINGS_FILE, 'r') as ef:
            idx = 0
            for line in ef:
                word = line.split()[0]
                wf.write(word + '\n')
                word_to_idx[word] = idx
                idx += 1
    print 'Finished creating words file!'

In [5]:
create_words_file()

Begin creating words file...
Finished creating words file!


In [None]:
def output_tweet_counts(month_to_tweets):
    with open(TWEET_CNTS_FILE, 'w') as f:
        f.write(','.join([str(len(month_to_tweets[month])) for month in month_to_tweets]))

In [None]:
def output_missing_words(missing_words):
    with open(MISSING_WORDS_FILE, 'w') as f:
        for word in missing_words:
            f.write(word + '\n')

In [31]:
def output_embeddings(month_to_tweets, city):
    print 'Begin outputting word embeddings...'
    for month in month_to_tweets:
        if (len(month_to_tweets[month]) >= MIN_TWEETS):
            with open('%s/embeddings/%s_%d_embeddings.csv' % (PATH[:-1], city, month), 'w') as output:
                for tweet in month_to_tweets[month]:
                    output.write(','.join([str(num) for num in tweet]) + '\n')
    print 'Finished outputting word embeddings!'

In [74]:
def get_word_embeddings():
    print 'Begin getting word embeddings...'
    tweets_file = 'Delhi_tweets.csv'
    title = 'delhi'
    cnt = 0
    with open(PATH + tweets_file) as csvfile:
        month_to_tweets = defaultdict(list)
        reader = csv.DictReader(csvfile)
        missing_words = set()
        missing_words_cnt = 0
        for row in reader:
            if (cnt % 100000 == 0): print str(cnt) + ' tweets processed...'
            cnt += 1
            date = dateutil.parser.parse(row['postedTime'])
            month_idx = (date.year - 2014)*12 + (date.month - 1)
            tweet = ' '.join([word for word in casual_tokenize(row['tweet']) 
                              if '@' not in word and 'http' not in word and '#' not in word])
            tweet = tweet.lower()
            tweet_embedding = []
            for word in tweet.split():
                if (word in word_to_idx):
                    tweet_embedding.append(word_to_idx[word])
                else:
                    missing_words_cnt += 1
                    missing_words.add(word)
            if (len(tweet_embedding) > 0): month_to_tweets[month_idx].append(tweet_embedding)
            output_embeddings(month_to_tweets, title)
            output_tweet_counts(month_to_tweets)
            output_missing_words(missing_words)
    print 'Finished getting word embeddings!'
    print 'Number of missing words: ' + str(missing_words_cnt)

In [1]:
get_word_embeddings()

NameError: name 'get_word_embeddings' is not defined

In [2]:
def get_deviations():
    print 'Getting deviations...'
    with open(PATH + 'India_Onion_Prices_Vector.csv') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        for row in reader:
            city = row[0]
            trends = row[2:NUM_MONTHS + 2]
            spikes = row[NUM_MONTHS + 3:2*NUM_MONTHS + 3]
            for i in range(NUM_MONTHS):
                trend = int(trends[i]) + 1 if (trends[i] != 'NA') else -1
                spike = int(spikes[i]) if (spikes[i] != 'NA') else -1
                city_to_changes[city][i] = [trend, spike]
    print ' -done.'

In [6]:
def is_relevant(tweet, keyword_embedding):
    for word in tweet.split():
        if (word in word_to_embedding):
            embedding = np.array(word_to_embedding[word])
            dist = np.linalg.norm(embedding - keyword_embedding)
            if (dist <= MIN_DIST):
                return True
    return False

In [75]:
def output_batches(month_to_tweets, city):
    num_batches = 0
    for month in month_to_tweets:
        tweet_count = len(month_to_tweets[month])
        trend = city_to_changes[city][month][0]
        spike = city_to_changes[city][month][1]
        if (tweet_count >= MIN_TWEETS and trend != -1 and spike != -1):
            num_batches += create_batches(tweet_count, city, month, trend, spike)
    return num_batches

In [91]:
def create_batches(tweet_count, city_name, month_idx, trend, spike):
    np.random.seed(10)
    
    n = tweet_count
    rand_seq = []
    for i in range(NUM_RESAMPLES):
        cur = [str(num) for num in np.random.choice(n, n, replace=False)]
        remaining = K - (n % K) # used to make the length of rand_seq a multiple of K
        cur += [str(num) for num in np.random.choice(n, remaining, replace=False)]
        rand_seq += cur

    num_batches = len(rand_seq) / K
    folder = 'batches2'
    output_file = '%s%s/%s_%s_batch.txt' % (PATH, folder, city_name, str(month_idx))
    with open(output_file, 'w') as output:
        output.write('%d,%d\n' % (trend, spike))
        for i in range(num_batches):
            suffix = '\n'
            if (i == num_batches - 1): suffix = ''
            output.write('\t'.join(rand_seq[i * K : (i + 1) * K]) + suffix)
    
    with open(PATH + 'batch_counts.txt', 'a') as f:
        f.write('%s,%d,%d\n' % (city_name, month_idx, num_batches))
        
    return num_batches

In [63]:
def get_relevant_word_embeddings(keyword):
    keyword_embedding = np.array(word_to_embedding[keyword])
    total_batches = 0
    
    for tweets_file in os.listdir(PATH):
        if ('.csv' not in tweets_file) or ('India_Onion_Prices' in tweets_file): continue
        if ('Mumbai' not in tweets_file): continue
        print 'Reading ' + tweets_file + '...'
        city = tweets_file.split('_')[0]
        tweets_processed = 0
        month_to_tweets = defaultdict(list)

        with open(PATH + tweets_file) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if (tweets_processed % 1e5 == 0): print str(tweets_processed) + ' tweets processed...'
                tweets_processed += 1
                date = dateutil.parser.parse(row['postedTime'])
                month_idx = (date.year - 2014)*12 + (date.month - 1)
                tweet = ' '.join([word for word in casual_tokenize(row['tweet']) 
                                  if '@' not in word and 'http' not in word and '#' not in word])
                tweet = tweet.lower()
                tweet_embedding = []
                if (is_relevant(tweet, keyword_embedding)):
                    for word in tweet.split():
                        if (word in word_to_idx):
                            tweet_embedding.append(word_to_idx[word])
                if (len(tweet_embedding) > 0): 
                    month_to_tweets[month_idx].append(tweet_embedding)
                
        output_embeddings(month_to_tweets, city)
        
#         print 'Outputting batches...'
#         batches_created = output_batches(month_to_tweets, city)
        
#         total_batches += batches_created
#         print 'Batches so far: %d' % total_batches
#         print ' -done.'
    
#     print 'Total batches: %d' % total_batches

In [7]:
# Reads embeddings files and creates word_to_embedding map
read_file(PATH + EMBEDDINGS_FILE)

NameError: name 'read_file' is not defined

In [12]:
# Stores all price changes and spikes in city_to_changes
get_deviations()

Getting deviations...
 -done.


In [6]:
# Creates all tweet embeddings related to given parameter and outputs batches
get_relevant_word_embeddings('onion')

NameError: name 'get_relevant_word_embeddings' is not defined

In [96]:
city_to_tweets = defaultdict(lambda: [0 for i in range(NUM_MONTHS)])

# Read embeddings files
def read_embeddings_files():
    path = '../data/embeddings/'
    total = 0
    for embeddings_file in os.listdir(path):
        filename = embeddings_file.split('_')
        city = filename[0]
        month = int(filename[1])
        with open(path + embeddings_file, 'r') as csvfile:
            num_tweets = sum(1 for row in csv.reader(csvfile))
            city_to_tweets[city][month] += num_tweets
            total += num_tweets
    print 'Total number of tweets: %d' % total
            
# Outputs batches by reading embeddings files to get tweet counts
def output_batches_1():
    num_batches = 0
    for city in city_to_tweets:
        city_batches = 0
        for month in range(NUM_MONTHS):
            tweet_count = city_to_tweets[city][month]
            trend = city_to_changes[city][month][0]
            spike = city_to_changes[city][month][1]
            if (tweet_count >= MIN_TWEETS and trend != -1 and spike != -1):
                batches = create_batches(tweet_count, city, month, trend, spike)
                city_batches += batches
        
        num_batches += city_batches
        print 'city: %s, batches: %d' % (city, city_batches)
        
    print 'Total number of batches: %d' % num_batches

read_embeddings_files()
output_batches_1()

Total number of tweets: 638606
city: Chandigarh, batches: 1328
city: Gurgaon, batches: 4040
city: Chennai, batches: 1992
city: Lucknow, batches: 1376
city: Hyderabad, batches: 4128
city: Delhi, batches: 24560
city: Mumbai, batches: 38520
city: Bengaluru, batches: 18424
city: Jaipur, batches: 1264
city: Kolkata, batches: 6952
Total number of batches: 102584


In [76]:
def create_weekly_batches(city_name, tweet_counts, tweet_idxes, trends, label):
    np.random.seed(10)
    
    time_len = len(tweet_counts)
    weekly_rand_seqs = []
    
    for week in range(time_len):
        n = tweet_counts[week]
        idxes = tweet_idxes[week]
        rand_seq = []
        for i in range(NUM_RESAMPLES):
            cur = [str(num) for num in np.random.choice(idxes, n, replace=False)]
            remaining = K - (n % K) # used to make the length of rand_seq a multiple of K
            cur += [str(num) for num in np.random.choice(idxes, remaining, replace=False)]
            rand_seq += cur
        weekly_rand_seqs.append(rand_seq)
    
    num_batches = min([len(weekly_rand_seqs[i]) for i in range(time_len)]) / K
    folder = 'batches'
    output_file = '%s%s/%s_weekly_batch.txt' % (PATH, folder, city_name)
    with open(output_file, 'a') as output:
        for i in range(num_batches):
            suffix = '\n'
            if (i == num_batches - 1): suffix = ''
            batch = ''
            for week in range(time_len):
                batch += '\t'.join(weekly_rand_seqs[week][i * K : (i + 1) * K]) + '\t'
            trendstr = '\t'.join([str(x) for x in trends])
            batch = batch + trendstr + '\t' + str(label)
            output.write(batch + suffix)
            assert(len(batch.split('\t')) == 154)
        
    return num_batches

In [8]:
def output_embeddings_by_city(tweets, city):
    print 'Begin outputting word embeddings...'
    with open('%s/embeddings/%s_embeddings.csv' % (PATH[:-1], city), 'w') as output:
        for tweet in tweets:
            output.write(','.join([str(num) for num in tweet]) + '\n')
    print 'Finished outputting word embeddings!'

In [43]:
# Returns map from city to (valid date (of fourth week), trends (of previous three weeks), label (of current week))
def get_valid_city_weeks():
    city_to_weekly_prices = get_city_to_weeks()
    city_to_valid_weeks = defaultdict(lambda: defaultdict(list))
    for city in city_to_weekly_prices:  
        all_weeks = city_to_weekly_prices[city]
        for week in range(len(all_weeks)):
            if (week < 4): continue
            cur = all_weeks[week]
            cur_date = date(cur[0], cur[1], cur[2])
            is_valid = True
            for i in range(3):
                prev_week = all_weeks[week - i - 1]
                if cur_date - date(prev_week[0], prev_week[1], prev_week[2]) > timedelta(days = (i + 1)*7):
                    is_valid = False
                    break
            if (is_valid):
                prev_trends = [all_weeks[i][3] for i in range(week - 3, week)]
                city_to_valid_weeks[city][cur_date] = [prev_trends, cur[3]]
    return city_to_valid_weeks

In [101]:
# Outputs all relevant word embeddings in a city to one city file: 'city_embeddings.csv'
def get_relevant_word_embeddings_by_city(keyword):
    city_to_valid_weeks = get_valid_city_weeks()
    
    keyword_embedding = np.array(word_to_embedding[keyword])
    total_batches = 0     
    
    for tweets_file in os.listdir(TWEET_PATH):
        if ('.csv' not in tweets_file) or ('India_Onion_Prices' in tweets_file): continue
        print 'Reading ' + tweets_file + '...'
        city = tweets_file.split('_')[0].lower()
        if city not in city_to_valid_weeks:
            continue
        valid_weeks = city_to_valid_weeks[city]
        tweet_idx = 0
        tweets_processed = 0
        city_tweets = []
        date_to_valid_tweets = defaultdict(lambda: [[0, 0, 0], [[], [], []]]) # date to list of [tweet cnts, tweet idxes]
        
        with open(TWEET_PATH + tweets_file) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if (tweets_processed % 1e5 == 0): print str(tweets_processed) + ' tweets processed...'
                tweets_processed += 1
                tweet_date = date(int(row['postedTime'][0:4]), int(row['postedTime'][5:7]), int(row['postedTime'][8:10]))
                    
                tweet = ' '.join([word for word in casual_tokenize(row['tweet']) 
                                  if '@' not in word and 'http' not in word and '#' not in word])
                tweet = tweet.lower()
                tweet_embedding = []
                if (is_relevant(tweet, keyword_embedding)):
                    for word in tweet.split():
                        if (word in word_to_idx):
                            tweet_embedding.append(word_to_idx[word])
                if (len(tweet_embedding) > 0): 
                    city_tweets.append(tweet_embedding) 
                    
                    for valid_date in valid_weeks:
                        prev_week = None
                        if (valid_date - tweet_date <= timedelta(days = 7)):
                            prev_week = 0
                        elif(valid_date - tweet_date <= timedelta(days = 14)):
                            prev_week = 1
                        elif(valid_date - tweet_date <= timedelta(days = 21)):
                            prev_week = 2

                        if prev_week == None: continue
                        date_to_valid_tweets[valid_date][0][prev_week] += 1
                        date_to_valid_tweets[valid_date][1][prev_week].append(tweet_idx)  
                    
                    tweet_idx += 1
                
        output_embeddings_by_city(city_tweets, city)
        
        total_batches = 0
        for valid_date in date_to_valid_tweets:
            tweet_info = date_to_valid_tweets[valid_date]
            tweet_counts = tweet_info[0]
            tweet_idxes = tweet_info[1]
            trends = valid_weeks[valid_date][0]
            label = valid_weeks[valid_date][1]
            
            valid_batch = True
            for cnt in tweet_counts:
                if cnt < MIN_TWEETS:
                    valid_batch = False
                    break        
            if (valid_batch):
                total_batches += create_weekly_batches(city, tweet_counts, tweet_idxes, trends, label)
        print city, total_batches

In [18]:
read_file(PATH + EMBEDDINGS_FILE)

Reading file...
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
 -done


In [100]:
get_relevant_word_embeddings_by_city('onion')

Reading Kohima_tweets.csv...
0 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
kohima 0
Reading Cuttack_tweets.csv...
0 tweets processed...
100000 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
cuttack 0
Reading Lucknow_tweets.csv...
0 tweets processed...
100000 tweets processed...
200000 tweets processed...
300000 tweets processed...
400000 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
lucknow 9220
Reading Panchkula_tweets.csv...
0 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
panchkula 50
Reading Siliguri_tweets.csv...
0 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
siliguri 0
Reading Delhi_tweets.csv...
0 tweets processed...
100000 tweets processed...
200000 tweets processed...
300000 tweets processed...
400000 tweets processed...
500000 tweets processed.

100000 tweets processed...
200000 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
bhopal 3945
Reading Jammu_tweets.csv...
0 tweets processed...
100000 tweets processed...
200000 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
jammu 5495
Reading Jaipur_tweets.csv...
0 tweets processed...
100000 tweets processed...
200000 tweets processed...
300000 tweets processed...
400000 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
jaipur 5630
Reading Srinagar_tweets.csv...
0 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
srinagar 0
Reading Rourkela_tweets.csv...
0 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
rourkela 0
Reading Puducherry_tweets.csv...
0 tweets processed...
Begin outputting word embeddings...
Finished outputting word embeddings!
puducherry 0
Reading Ludhia