In [45]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt
from functools import reduce
import math
from collections import Counter
import csv
from datetime import datetime
import re
import copy
import json
from datetime import timedelta
%matplotlib inline

In [46]:
def now():
    """Return the current wall-clock time as an ``HH:MM:SS`` string."""
    return datetime.now().strftime('%H:%M:%S')
def pr(strToPrint):
    """Print ``strToPrint`` prefixed with the timestamp returned by ``now()``.

    ``strToPrint`` has to be a string because it is concatenated to the timestamp.
    """
    stamped_message = now() + ' ' + strToPrint
    print(stamped_message)

# Audio notification support: `Audio` and `sound_file` are only referenced by the
# commented-out `Audio(url=sound_file, autoplay=True)` call after the file load.
from IPython.display import Audio
sound_file = 'beep.wav'
# Display option kept for reference (currently disabled):
# pd.set_option('display.max_colwidth', -1)

## Importing data

Importing a sample of the dataset:

In [47]:
# Load a pickled sample of the tweets (presumably the first 100k rows, judging by the
# file name -- TODO confirm).
# NOTE: `tw` is reassigned by the `pd.read_csv` cell that loads the whole dataset, so the
# sample is only kept when that cell is skipped.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
pickle_filename = os.path.join('data','head_100k_pickle.pkl')
tw = pd.read_pickle(pickle_filename)

Importing the whole dataset (this replaces the sample loaded above, since both cells assign to `tw`):

In [48]:
# Column names for the header-less TSV export, in file order.
columns_header = [
    'id', 'userId', 'createdAt', 'text', 'longitude', 'latitude', 'placeId',
    'inReplyTo', 'source', 'truncated', 'placeLatitude', 'placeLongitude',
    'sourceName', 'sourceUrl', 'userName', 'screenName', 'followersCount',
    'friendsCount', 'statusesCount', 'userLocation',
]

filename = os.path.join('data', 'twex.tsv')  # alternative input: 'sample.tsv'
pr('Starting to read file... (3 min)')
# Tab-separated, no quoting, backslash as escape character, 'N' marks missing values.
tw = pd.read_csv(
    filename,
    names=columns_header,
    header=None,
    sep='\t',
    encoding='utf-8',
    escapechar='\\',
    quoting=csv.QUOTE_NONE,
    na_values='N',
)

pr('File is loaded!')
# Audio(url=sound_file, autoplay=True)

11:35:07 Starting to read file... (3 min)


  interactivity=interactivity, compiler=compiler, result=result)


11:37:34 File is loaded!


In [49]:
# Preview the first two rows of the loaded tweets.
tw.head(2)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation
0,9514097914,17341000.0,2010-02-23 05:55:51,Guuuuten Morgen! :-),7.43926,46.9489,,,197,,,,TwitBird,http://www.nibirutech.com,Tilman Jentzsch,blickwechsel,586,508.0,9016.0,"Bern, Switzerland"
1,9514846412,7198280.0,2010-02-23 06:22:40,Still the best coffee in town — at La Stanza h...,8.53781,47.3678,,,550,,,,Gowalla,http://gowalla.com/,Nico Luchsinger,halbluchs,1820,703.0,4687.0,"Zurich, Switzerland"


## Extracting hashtags

In [50]:
def extract_hashtags(text):
    '''
    Return the hashtags contained in `text`, lower-cased and without the leading '#'.

    A hashtag is a '#' immediately followed by one or more word characters (regex `\\w+`).
    The hashtags are returned in order of appearance; duplicates are kept.
    Returns an empty list when `text` contains no hashtag.

    `text` must be a string (callers convert with `str()` beforehand).
    '''
    # `#(\w+)` can only capture non-empty strings, so no additional filtering is needed
    # (the previous `ht != []` filter compared strings to a list and never removed anything).
    return [ht.lower() for ht in re.findall(r"#(\w+)", text)]

In [51]:
pr('Making hashtags... (2 min)')
# One list of lower-cased hashtags per tweet. str() makes non-string text values
# (e.g. missing values) safe to pass to the regex.
tw['hashtag'] = tw['text'].apply(lambda x: extract_hashtags(str(x)))
# Keep only the tweets with at least one hashtag.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
twh = tw.loc[tw['hashtag'].apply(len) != 0]
pr('Done.')

11:37:34 Making hashtags... (2 min)
11:39:25 Done.


In [52]:
# Preview the first two tweets that contain at least one hashtag.
twh.head(2)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,...,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,hashtag
8,9519737890,14657900.0,2010-02-23 09:59:41,"Magic spells run off after midnight, I guess s...",6.1387,46.175,,,1,,...,,Twitter for iPhone,http://twitter.com/#!/download/iphone,Javier Belmonte,vichango,167,277.0,2885.0,"Geneva, Switzerland",[fb]
10,9521789689,9962020.0,2010-02-23 11:28:27,"Limitas of public transportation! No taxi, rai...",6.33641,46.4631,,,550,,...,,Gowalla,http://gowalla.com/,Thomas Winter,thwinter,1070,1359.0,3349.0,Hettlingen CH / SanJose Ca,[yam]


## Cleaning data and making date index

In [53]:
# Drop the tweets that miss either their text or their creation date.
required_columns = ['text', 'createdAt']
tw1 = twh.dropna(subset=required_columns)
print('The data have been reduced from {} tweets to {} tweets.'.format(len(twh), len(tw1)))

The data have been reduced from 3875280 tweets to 3875280 tweets.


In [54]:
pr('Removing bad dates...')
# Keep only timestamps that are exactly 19 characters long,
# like '2010-02-23 05:55:51' in the preview above.
has_full_timestamp = tw1['createdAt'].str.len() == 19
twhCleanDate = tw1[has_full_timestamp]
pr('Finished.')

11:39:28 Removing bad dates...
11:39:31 Finished.


In [55]:
pr('Starting to examine dates...')
# `pd.to_datetime(..., errors='coerce')` replaces `Series.convert_objects`, which was
# removed in pandas 1.0. Unparseable values become NaT, so counting the nulls tells how
# many dates cannot be converted. No deprecation warning is raised any more, so the
# global warning filters no longer need to be switched off and on around the call.
datetime_serie = pd.to_datetime(twhCleanDate['createdAt'], errors='coerce')
dateNotConvertible = datetime_serie[datetime_serie.isnull()]
pr('There are {} dates that cannot be transformed.'.format(len(dateNotConvertible)))

11:39:31 Starting to examine dates...
11:39:33 There are 0 dates that cannot be transformed.


In [56]:
# Work on a copy so the filtered frame is left untouched by the conversions below.
pr('Starting copy...')
tw5 = twhCleanDate.copy()
pr('Converting to datetime...')
parsed_created_at = pd.to_datetime(twhCleanDate['createdAt'])
tw5['createdAt'] = parsed_created_at
pr('Setting up new indices...')
# The parsed creation dates become the index (named 'createdAt' after the series).
tw5.index = parsed_created_at
pr('Deleting old "createdAt" column...')
del tw5['createdAt']
pr('Done!')
tw5.head(2)

11:39:33 Starting copy...
11:39:34 Converting to datetime...
11:39:37 Setting up new indices...
11:39:37 Deleting old "createdAt" column...
11:39:37 Done!


Unnamed: 0_level_0,id,userId,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,hashtag
createdAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-02-23 09:59:41,9519737890,14657900.0,"Magic spells run off after midnight, I guess s...",6.1387,46.175,,,1,,,,Twitter for iPhone,http://twitter.com/#!/download/iphone,Javier Belmonte,vichango,167,277.0,2885.0,"Geneva, Switzerland",[fb]
2010-02-23 11:28:27,9521789689,9962020.0,"Limitas of public transportation! No taxi, rai...",6.33641,46.4631,,,550,,,,Gowalla,http://gowalla.com/,Thomas Winter,thwinter,1070,1359.0,3349.0,Hettlingen CH / SanJose Ca,[yam]


In [57]:
# First six hashtag lists, indexed by creation date.
tw5['hashtag'].head(6)

createdAt
2010-02-23 09:59:41               [fb]
2010-02-23 11:28:27              [yam]
2010-02-23 17:47:11          [24, vfb]
2010-02-23 18:19:03    [iphoneography]
2010-02-23 18:31:46     [partnermonth]
2010-02-24 06:09:23      [insider, fb]
Name: hashtag, dtype: object

## Let's put one hashtag per row

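We will build a dataframe in which each row holds exactly one hashtag. To do so, we go through the dataframe once: each row keeps its first hashtag, and for every additional hashtag a copy of the row (with that single hashtag) is collected in a list. The collected rows are then turned into a dataframe and appended to the original one.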

In [58]:
addedHashtagsRowsList = []
def multiplyHashtagRows(row, columns):
    '''
    Examine one row and return its first hashtag (which replaces the list of hashtags
    in the df). For every following hashtag, a copy of the row is appended, as a dictionary,
    to the module-level addedHashtagsRowsList, with 'hashtag' set to that single hashtag and
    'createdAt' set to the row label (the df index).
    The list of dictionaries can in the end be transformed in a DF and added to the original DF.
    (The speed is increased a lot by doing it this way!)

    Parameters:
        row: a dataframe row; `row.hashtag` must be a non-empty list of hashtags.
        columns: the column labels to copy from the row into each added dictionary.

    Raises IndexError when `row.hashtag` is empty.
    '''
    htList = row.hashtag
    if len(htList) > 1:
        ## Making the dictionary shared by all the extra hashtags of this row
        rowAsDict = {'createdAt': row.name} #the df index
        for col in columns:
            rowAsDict[col] = row[col]
        ## One shallow copy per extra hashtag: only the 'hashtag' entry differs between
        ## the copies and it is overwritten right away, so a deep copy is not needed.
        for extraHashtag in htList[1:]:
            extraRow = dict(rowAsDict)
            extraRow['hashtag'] = extraHashtag
            addedHashtagsRowsList.append(extraRow)
    return htList[0] # return the first hashtag

In [59]:
# Reset the accumulator that multiplyHashtagRows fills as a side effect.
addedHashtagsRowsList = []
tw5_1 = tw5.copy()
pr('Multiplying the hashtag rows... (around 10 min)')
# Each row keeps its first hashtag; the extra ones end up in addedHashtagsRowsList.
first_hashtags = tw5.apply(multiplyHashtagRows, axis=1, args=(tw5.columns,))
tw5_1['hashtag'] = first_hashtags
pr('Finished! {} rows will be added to the dataframe!'.format(len(addedHashtagsRowsList)))

11:39:38 Multiplying the hashtag rows... (around 10 min)
11:48:37 Finished! 3350600 rows will be added to the dataframe!


In [60]:
pr('Starting to make the new dataframe with additionnal rows..')
# The extra rows carry their creation date in the 'createdAt' key; use it as index so
# both frames are indexed the same way.
addedHashtagsDf = pd.DataFrame(addedHashtagsRowsList).set_index('createdAt')
pr('Starting to append the two df... Old df size = {}'.format(len(tw5_1)))
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
tw6 = pd.concat([tw5_1, addedHashtagsDf])
pr('Done! New df size = {}'.format(len(tw6)))

11:48:37 Starting to make the new dataframe with additionnal rows..
11:48:50 Starting to append the two df... Old df size = 3875280
11:48:56 Done! New df size = 7225880


In [61]:
# Show a few rows for the hashtag of the first added row.
example_hashtag = addedHashtagsRowsList[0]['hashtag']
print('Example hahshtag:')
tw6.loc[tw6['hashtag'] == example_hashtag].head(3)

Example hahshtag:


Unnamed: 0_level_0,followersCount,friendsCount,hashtag,id,inReplyTo,latitude,longitude,placeId,placeLatitude,placeLongitude,screenName,source,sourceName,sourceUrl,statusesCount,text,truncated,userId,userLocation,userName
createdAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2011-07-24 16:03:13,1121,763.0,vfb,95162125584039936,9.501397e+16,,,e401fb8eb4e7595a,47.5356,9.14004,eLd0raDo,14,Tweetbot for iOS,http://tapbots.com/tweetbot,6735.0,"@mikstweed Wie lange ist es her, dass der #vfb...",,921241.0,"Kreuzlingen, TG, Switzerland",Markus Tressl
2012-07-29 16:15:54,1008,767.0,vfb,229611214802669570,,47.6683,9.48077,7e00050f2f9230bc,47.6684,9.47092,jens_nagler,1,Twitter for iPhone,http://twitter.com/#!/download/iphone,1697.0,Man sollte ja nicht zu viel reininterpretieren...,,57138500.0,Esslingen,Jens Nagler
2012-07-29 16:21:45,1008,767.0,vfb,229612686525231104,2.296124e+17,47.6682,9.48068,7e00050f2f9230bc,47.6684,9.47092,jens_nagler,1,Twitter for iPhone,http://twitter.com/#!/download/iphone,1697.0,"@LLcurly Das Besondere daran ist, dass nur zwe...",,57138500.0,Esslingen,Jens Nagler


In [62]:
# Each row now holds a single hashtag string instead of a list.
tw6['hashtag'].head()

createdAt
2010-02-23 09:59:41               fb
2010-02-23 11:28:27              yam
2010-02-23 17:47:11               24
2010-02-23 18:19:03    iphoneography
2010-02-23 18:31:46     partnermonth
Name: hashtag, dtype: object

## Grouping per hashtags per day

In [63]:
# Keep only the rows that have both coordinates (a row is dropped if either one is missing).
tw6.dropna(subset=['longitude', 'latitude'], inplace=True)

In [64]:
# Make sure both coordinates are stored as floats.
tw6['latitude'] = tw6['latitude'].astype(float)
tw6['longitude'] = tw6['longitude'].astype(float)

In [65]:
# Separator placed between the tweets that are merged into a single string.
delimiter = '_$$$_'
def str_join(x):
    '''Concatenate the strings of the iterable `x`, separated by `delimiter`.'''
    return delimiter.join(x)

The following function groups the rows of a dataframe by day and aggregates the content of each day:

In [66]:
def aggDate(df):
    '''
    Group the rows of `df` by calendar day and aggregate each day into a single row.

    `df` must be indexed by a DatetimeIndex and contain the columns aggregated below.
    Per day, the result holds:
      - text: all the texts joined with `str_join`
      - longitude / latitude: the median coordinate
      - hashtag: the hashtag of the first row of the day
      - numberOfTweets: the number of rows of the day
    The result is indexed by `datetime.date` objects.
    '''
    # `DatetimeIndex.date` yields one `datetime.date` per row. The previous
    # `df.index.map(lambda x: x.date)` never called the `date` method, so it only worked
    # when `map` happened to apply the function to the whole index at once.
    days = df.index.date
    groupedDf = df.groupby(days).agg({'text' : str_join,
                                      'longitude' : np.median,
                                      'latitude' : np.median,
                                      'hashtag' : lambda x: x.iloc[0], ## the first occurence
                                      'numberOfTweets' : 'count'})
    return groupedDf

In [67]:
pr('Making column number of tweets')
# Every row counts for one tweet; aggDate counts these per day.
tw6['numberOfTweets'] = 1
pr('Starting group by hastag...')
gp = tw6.groupby('hashtag')
pr('Starting to put hashtag in dictionary... (around 50 min)')

lengp = len(gp)
# Progress is reported roughly every 10%. max(1, ...) prevents a modulo by zero
# (ZeroDivisionError) when there are fewer than 10 hashtags, e.g. on a small sample.
printingValue = max(1, lengp // 10)
# Maps each hashtag to its per-day aggregated dataframe.
dictionary = {}
for count, (hashtag, df) in enumerate(gp, start=1):
    dictionary[hashtag] = aggDate(df)
    if count % printingValue == 0:
        pr("{:.0f}%".format(count/lengp*100))
pr('Finished operations! Dictionary with {} different hashtags.'.format(len(dictionary)))

11:49:11 Making column number of tweets
11:49:11 Starting group by hastag...
11:49:11 Starting to put hashtag in dictionary... (around 50 min)
11:54:20 10%
11:58:57 20%
12:03:30 30%
12:08:06 40%
12:12:42 50%
12:17:17 60%
12:21:49 70%
12:26:22 80%
12:30:55 90%
12:35:24 100%
12:35:24 Finished operations! Dictionary with 607601 different hashtags.


In [68]:
# Other ways to inspect a single hashtag:
# aggDate(gp.get_group('twibisg')).head(3)
# dictionary['twibisg'].head(3)
print('Dictionary with hashtags dataframes:')
# Arbitrary example: the sixth hashtag of the dictionary.
sample_hashtag = list(dictionary.keys())[5]
dictionary[sample_hashtag].head(4)

Dictionary with hashtags dataframes:


Unnamed: 0,longitude,numberOfTweets,text,latitude,hashtag
2016-08-02,7.33389,11,Le service du vin chez #ilcortile #mulhouse #c...,47.7475,ilcortile


## Starting event detection

Parameters that define events:

In [69]:
## Parameters of an event:
## NOTE: the detection code compares with strict inequalities (`>` / `<`), so the bounds below are exclusive.
MIN_TOT_NB_TWEETS = 20 ## The hashtag must occur strictly more than this number of times in total to be considered.
MIN_NB_DAYS_WITH_HASHTAGS = 3 ## The hashtag must appear on strictly more than this number of different days to be considered.
MIN_NB_TWEETS_DURING_EVENT = 5 ## To be considered an event, the hashtag must occur strictly more than this nb of times during the day.
THRESHOLD_ANOMALY_FACTOR = 2 ## The occurence of a hashtag during a single day must be above the mean plus this FACTOR
                             ## multiplied by the std to be considered as an anomaly.
MAX_DURATION_OF_EVENT = timedelta(days=30) ## Two detections closer together than this are treated as the same event.
MIN_DURATION_BEFORE_NEW_EVENT = timedelta(days=304) ## (= 10 months) The min time that should pass before an event can happen
                                                    ## again and still be considered as event (ie. Christmas is an event
                                                    ## each year). Shorter gaps between distinct events are "suspect".

Helper functions to detect recurrent events that should be removed:

In [70]:
def isSpecificEventListIllegal(detectedEventDateList, maxDurationOfEvent=None, minDurationBeforeNewEvent=None):
    '''
    Return True if the list of dates contains an illegal triple of events, i.e. if the event
    is recurrent, which would mean it is not a real event.

    Parameters:
        detectedEventDateList: indexable sequence of dates (it must support len(), integer
            indexing, and subtraction between its elements).
        maxDurationOfEvent: two dates closer than this are considered to belong to the same
            event. Defaults to the global MAX_DURATION_OF_EVENT.
        minDurationBeforeNewEvent: two distinct events closer than this are "suspect".
            Defaults to the global MIN_DURATION_BEFORE_NEW_EVENT.

    A list with fewer than 3 dates is never illegal.
    '''
    if maxDurationOfEvent is None:
        maxDurationOfEvent = MAX_DURATION_OF_EVENT
    if minDurationBeforeNewEvent is None:
        minDurationBeforeNewEvent = MIN_DURATION_BEFORE_NEW_EVENT

    def datesAreIllegal(date1, date2, date3):
        '''
        Return True if the 3 dates are not to be considered as regular events.
        '''
        diffs = [abs(date1 - date2), abs(date2 - date3), abs(date3 - date1)]

        ## A difference is too small: at least two of the dates belong to the same event,
        ## so this triple says nothing about recurrence.
        if any(diff < maxDurationOfEvent for diff in diffs):
            return False

        ## If at least 2 out of the 3 differences are too short for an annual event,
        ## then the dates are illegal.
        nbSuspectDiffs = sum(1 for diff in diffs if diff < minDurationBeforeNewEvent)
        return nbSuspectDiffs >= 2

    ## MAIN FUNCTION : ##
    # Go through the list of events and try all triples of distinct positions (i < j < k) to
    # see if there is any illegal triple. The indices start at i + 1 and j + 1: triples that
    # repeat a position have a zero difference and can never be illegal, so testing them
    # was wasted work.
    # Code complexity below is in O(k^3), with k being the size of the list. We will apply
    # this function to n lists so we will have an overall complexity in O(n*k^3). We can
    # consider however that each list will be small so k can be considered as constant and
    # therefore the overall complexity will be in O(n).
    nbDates = len(detectedEventDateList)
    for i in range(nbDates - 2):
        for j in range(i + 1, nbDates - 1):
            for k in range(j + 1, nbDates):
                if datesAreIllegal(detectedEventDateList[i], detectedEventDateList[j], detectedEventDateList[k]):
                    return True
    return False

In [71]:
pr('Starting to compute {} dict items to detect event. (4 min)'.format(len(dictionary)))
nbOfEventDetected = 0
nbOfHashtags = len(dictionary)
# Progress is reported roughly every 10%. max(1, ...) prevents a modulo by zero
# (ZeroDivisionError) when the dictionary holds fewer than 10 hashtags.
printingValue = max(1, nbOfHashtags // 10)
for count, (h, df) in enumerate(dictionary.items(), start=1):
    if count % printingValue == 0:
        pr("{:.0f}%".format(count/nbOfHashtags*100))
    # Adds (or resets) the 'event' flag in place on the per-hashtag dataframe.
    df['event'] = False
    if len(df) > MIN_NB_DAYS_WITH_HASHTAGS and df['numberOfTweets'].sum() > MIN_TOT_NB_TWEETS:
        dailyCounts = df['numberOfTweets']
        threshold = dailyCounts.mean() + THRESHOLD_ANOMALY_FACTOR * dailyCounts.std()
        # A day is an event when its count is above both the anomaly threshold and the
        # absolute minimum (vectorized instead of a row-wise apply).
        df['event'] = (dailyCounts > threshold) & (dailyCounts > MIN_NB_TWEETS_DURING_EVENT)

        ## Remove recurrent events:
        detectedEventDf = df[df['event']]
        # NOTE(review): a triple already exists with exactly 3 detected events, but `> 3`
        # skips the recurrence check in that case. Confirm whether `>= 3` was intended;
        # the original behavior is kept here.
        if len(detectedEventDf) > 3 and isSpecificEventListIllegal(detectedEventDf.index):
            df['event'] = False
        nbOfEventDetected += int(df['event'].sum())
pr('Finished! Number of events detected = {}'.format(nbOfEventDetected))

12:35:24 Starting to compute 607601 dict items to detect event. (4 min)
12:35:56 10%
12:36:19 20%
12:36:41 30%
12:37:04 40%
12:37:26 50%
12:37:49 60%
12:38:11 70%
12:38:34 80%
12:38:56 90%
12:39:19 100%
12:39:19 Finished! Number of events detected = 9611


## Making single event dataframe

In [72]:
eventRowsList = []
def applyToMakeEventDf(row):
    '''
    Append `row` to the module-level eventRowsList, as a dictionary, when its `event`
    flag is set; rows that are not events are ignored. The row label is stored under 'date'.
    '''
    if not row.event:
        return
    eventRowsList.append({
        'date': row.name,
        'hashtag': row.hashtag,
        'text': row.text,
        'longitude': row.longitude,
        'latitude': row.latitude,
        'numberOfTweets': row.numberOfTweets,
    })

In [73]:
# Reset the accumulator that applyToMakeEventDf fills as a side effect.
eventRowsList = []
nbOfHashtags = len(dictionary)
# Progress is reported roughly every 10%. max(1, ...) prevents a modulo by zero
# (ZeroDivisionError) when the dictionary holds fewer than 10 hashtags.
printingValue = max(1, nbOfHashtags // 10)

pr('Starting to make event df with {} dataframes. (around 6 min)'.format(len(dictionary)))
for count, (h, df) in enumerate(dictionary.items(), start=1):
    if count % printingValue == 0:
        pr("{:.0f}%".format(count/nbOfHashtags*100))

    # TODO: change df to merge close events.

    # Only the rows flagged as events are visited, with an explicit loop: the callback
    # only exists for its side effect, and `DataFrame.apply` does not guarantee that the
    # function is called exactly once per row.
    for _, eventRow in df[df['event']].iterrows():
        applyToMakeEventDf(eventRow)

pr('Making new dataframe.')
new_events = pd.DataFrame(eventRowsList).set_index('date')
pr('Finished! Dataframe with {} rows'.format(len(new_events)))

12:39:19 Starting to make event df with 607601 dataframes. (around 6 min)
12:39:52 10%
12:40:24 20%
12:40:57 30%
12:41:29 40%
12:42:02 50%
12:42:35 60%
12:43:08 70%
12:43:41 80%
12:44:14 90%
12:44:47 100%
12:44:47 Making new dataframe.
12:44:47 Finished! Dataframe with 9611 rows


In [74]:
print('Events dataframe:')
# Display the detected events: one row per hashtag and day flagged as an event.
new_events

Events dataframe:


Unnamed: 0_level_0,hashtag,latitude,longitude,numberOfTweets,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-05-04,starwarsday,46.19295,6.471575,12,C'est vêtu de mon T-Shirt SW que je vous souha...
2016-07-21,スイス情報,46.96320,7.466670,28,#ベルナーオーバーライド #スイスアルプス #ヒマワリ と共に。#アイガー #メンヒ #ユン...
2013-02-10,winterwonderland,47.03460,8.303840,11,#winterwonderland @ Schneestern http://t.co/Ji...
2013-12-27,winterwonderland,46.49730,8.138090,7,#WinterWonderland - oder nicht?\n#Davos @ Golf...
2014-12-27,winterwonderland,46.48330,7.561030,9,#winterwonderland @ Berg (Dägerlen) http://t.c...
2014-12-28,winterwonderland,47.31875,8.544700,16,Im lovin it!!! #winterwonderland http://t.co/f...
2014-12-29,winterwonderland,46.97250,7.747200,15,Todays #winterwonderland at http://t.co/HiBFem...
2014-12-30,winterwonderland,47.17580,8.225200,9,On the way to work today #WinterWonderland #Zu...
2015-11-25,winterwonderland,46.98860,8.248910,7,Snowy lake view from the train station this mo...
2016-01-04,winterwonderland,46.32450,7.511010,8,Last day in #Winterwonderland ❄️❄️ @gstaadpala...


In [75]:
print('Linked dataframe of all days:')
# All the days of the hashtag behind the first detected event.
first_event_hashtag = new_events.iloc[0].hashtag
dictionary[first_event_hashtag].head(10)

Linked dataframe of all days:


Unnamed: 0,longitude,numberOfTweets,text,latitude,hashtag,event
2012-05-04,9.14467,5,May the 4th be with you. #StarWarsDay_$$$_Obli...,47.0388,starwarsday,False
2013-05-04,6.927115,4,May the fourth be with you !\n\n#starwarsday h...,46.46465,starwarsday,False
2013-05-25,8.77879,1,May the force be with you! #starwarsweekend #s...,47.3933,starwarsday,False
2014-04-28,6.22669,1,Un petit coucou mes Padawans pour vous souhait...,46.1929,starwarsday,False
2014-05-03,7.15569,1,May the fourth be with you!\n#StarWarsDay,46.7954,starwarsday,False
2014-05-04,6.471575,12,C'est vêtu de mon T-Shirt SW que je vous souha...,46.19295,starwarsday,True
2014-05-05,7.9101,1,Good morning Twitter people! After successfull...,47.3473,starwarsday,False
2015-05-04,6.63486,3,May the 4th be with you. #StarWarsDay #Lausann...,46.5225,starwarsday,False
2015-11-20,8.55167,1,Arbeiten ist schön!!! #srfdgst #theforceawaken...,47.4114,starwarsday,False
2016-05-04,6.88294,1,May the Fourth be with you! Happy #starwarsday...,46.4429,starwarsday,False


## Exporting data

In [76]:
# Export frame: the event date becomes a regular column and rows are numbered from 0.
e_df = new_events.copy()
e_df['date'] = e_df.index
e_df.index = list(range(len(e_df)))
e_df.head(1)

Unnamed: 0,hashtag,latitude,longitude,numberOfTweets,text,date
0,starwarsday,46.19295,6.471575,12,C'est vêtu de mon T-Shirt SW que je vous souha...,2014-05-04


We are going to generate the timestamps (in milliseconds since 1970-01-01) that are written to the JSON export:

In [77]:
# Reference point of the exported timestamps.
epoch_dt = datetime(1970, 1, 1)
def to_utc(date):
    '''
    Return the number of milliseconds between 1970-01-01 00:00 and midnight of `date`.

    Only the year, month and day of `date` are used; no timezone conversion is applied.
    '''
    midnight = datetime(date.year, date.month, date.day)
    elapsed = midnight - epoch_dt
    return int(elapsed.total_seconds() * 1000)

In [78]:
def convert_to_unix_time(record):
    '''
    Return the timestamp, in milliseconds since 1970-01-01 00:00, of the first day of the
    month described by `record`.

    `record` is any mapping-like object (dict, dataframe row, ...) with integer-valued
    'year' and 'month' entries. The result is a numpy int64, as before.
    '''
    first_of_month = datetime(int(record['year']), int(record['month']), 1)
    # Plain epoch arithmetic: no one-element DatetimeIndex is built for every row, and the
    # result no longer depends on the index being stored with nanosecond resolution
    # (the former `// 10**6` scaling was only right for nanoseconds).
    millis = (first_of_month - datetime(1970, 1, 1)) // timedelta(milliseconds=1)
    return np.int64(millis)

In [79]:
pr('Converting dates...')
event_dates = e_df['date']
e_df['year'] = event_dates.map(lambda d: d.year)
e_df['month'] = event_dates.map(lambda d: d.month)
# Day-level timestamp of the event, in milliseconds.
e_df['utc_date'] = event_dates.map(to_utc)
# Timestamp of the first day of the event's month, used to group the events by month.
e_df['unix_time'] = e_df.apply(convert_to_unix_time, axis=1)
pr('Done.')
e_df.head(1)

12:44:47 Converting dates...
12:44:50 Done.


Unnamed: 0,hashtag,latitude,longitude,numberOfTweets,text,date,year,month,utc_date,unix_time
0,starwarsday,46.19295,6.471575,12,C'est vêtu de mon T-Shirt SW que je vous souha...,2014-05-04,2014,5,1399161600000,1398902400000


Grouping by months and generating the dictionary for the json.

In [80]:
# One group per month, keyed by the timestamp of the first day of the month.
e_gb_month = e_df.groupby('unix_time')

In [81]:
pr('Making event list...')
months = []
for month, month_df in e_gb_month:
    # One entry per event of the month; the joined tweets are split back into a list.
    days = [
        {
            'name': row['hashtag'],
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'tweets': row['text'].split(delimiter),
            'number_of_tweets': str(row['numberOfTweets']),
            'date': int(row['utc_date']),
        }
        for _, row in month_df.iterrows()
    ]
    months.append({'date': int(month), 'data': days})

final_events = {'events': months}
pr('Done.')

12:44:50 Making event list...
12:44:59 Done.


In [82]:
# The export file name carries the current date and time, so earlier exports are kept.
export_stamp = datetime.now().strftime("%Y-%m-%d_%Hh%Mmin%S")
exportFilename = 'export_twitter_events_' + export_stamp + '.json'
exportPath = os.path.join('data', exportFilename)

pr('Exporting to json...')
with open(exportPath, 'w') as export_file:
    json.dump(final_events, export_file)
pr('Export done. File "{}" has been created.'.format(exportFilename))

12:44:59 Exporting to json...
12:45:00 Export done. File "export_twitter_events_2017-01-30_12h44min59.json" has been created.
