In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt
from functools import reduce
import math
from collections import Counter
import csv
from datetime import datetime
import re
import copy
import json
%matplotlib inline

In [2]:
def now():
    return str(datetime.now().time())[:8]
def pr(strToPrint):
    print(now() + ' '+ strToPrint)

from IPython.display import Audio
sound_file = 'beep.wav'
# pd.set_option('display.max_colwidth', -1)

## Importing data

Importing a sample of the dataset:

In [3]:
pickle_filename = os.path.join('data','head_100k_pickle.pkl')
tw = pd.read_pickle(pickle_filename)

Importing the whole dataset

In [4]:
columns_header = ['id', 'userId', 'createdAt', 'text', 'longitude', 'latitude', 'placeId',
                  'inReplyTo', 'source', 'truncated', 'placeLatitude', 'placeLongitude', 'sourceName', 'sourceUrl',
                 'userName', 'screenName', 'followersCount', 'friendsCount', 'statusesCount',
                 'userLocation']

filename = os.path.join('data','twex.tsv') # 'sample.tsv')
pr('Starting to read file...')
# tw = pd.read_csv(filename, sep='\t', encoding='utf-8', escapechar='\\', names=columns_header,
#                       quoting=csv.QUOTE_NONE, na_values='N', header=None)

pr('File is loaded!')
# Audio(url=sound_file, autoplay=True)

16:30:31 Starting to read file...
16:30:31 File is loaded!


In [5]:
tw.head(2)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation
0,9514097914,17341000.0,2010-02-23 05:55:51,Guuuuten Morgen! :-),7.43926,46.9489,,,197,,,,TwitBird,http://www.nibirutech.com,Tilman Jentzsch,blickwechsel,586,508.0,9016.0,"Bern, Switzerland"
1,9514846412,7198280.0,2010-02-23 06:22:40,Still the best coffee in town — at La Stanza h...,8.53781,47.3678,,,550,,,,Gowalla,http://gowalla.com/,Nico Luchsinger,halbluchs,1820,703.0,4687.0,"Zurich, Switzerland"


## Extrating hashtags

In [6]:
def extract_hashtags(text):
    ht_list = re.findall(r"#(\w+)", text)
    non_empty_hts = list(filter((lambda ht: ht != []), ht_list))
    lowerCharList = [ht.lower() for ht in non_empty_hts]
    return lowerCharList

In [7]:
pr('Making hashtags...')
tw['hashtag'] = np.nan
tw.hashtag = tw.text.apply(lambda x: extract_hashtags(str(x)))
twh = tw.ix[tw.hashtag.apply(lambda x: len(x) != 0)]
pr('Done.')

16:30:31 Making hashtags...
16:30:31 Done.


In [8]:
twh.head(2)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,...,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,hashtag
8,9519737890,14657900.0,2010-02-23 09:59:41,"Magic spells run off after midnight, I guess s...",6.1387,46.175,,,1,,...,,Twitter for iPhone,http://twitter.com/#!/download/iphone,Javier Belmonte,vichango,167,277.0,2885.0,"Geneva, Switzerland",[fb]
10,9521789689,9962020.0,2010-02-23 11:28:27,"Limitas of public transportation! No taxi, rai...",6.33641,46.4631,,,550,,...,,Gowalla,http://gowalla.com/,Thomas Winter,thwinter,1070,1359.0,3349.0,Hettlingen CH / SanJose Ca,[yam]


## Cleaning data and making date index

In [9]:
tw1 = twh.dropna(axis=0, how='any', subset=['text', 'createdAt'])
print('The data have been reduced from {} tweets to {} tweets.'.format(len(twh), len(tw1)))

The data have been reduced from 19719 tweets to 19719 tweets.


In [10]:
pr('Removing bad dates...')
twhCleanDate = tw1[tw1['createdAt'].str.len() == 19]
pr('Finished.')

16:30:31 Removing bad dates...
16:30:32 Finished.


In [11]:
pr('Starting to examine dates...')
import warnings
warnings.filterwarnings('ignore')
datetime_serie = twhCleanDate['createdAt'].convert_objects(convert_dates='coerce')
dateNotConvertible = datetime_serie[pd.isnull(datetime_serie)]
warnings.filterwarnings('default')
pr('There are {} dates that cannot be transformed.'.format(len(dateNotConvertible)))

16:30:32 Starting to examine dates...
16:30:32 There are 0 dates that cannot be transformed.


In [12]:
pr('Starting copy...') # (to avoid transformation problems)
tw5 = twhCleanDate.copy()
pr('Converting to datetime...')
tw5['createdAt'] = pd.to_datetime(twhCleanDate['createdAt'])
pr('Setting up new indices...')
tw5.index = tw5['createdAt']
pr('Deleting old "createdAt" column...')
del tw5['createdAt']
pr('Done!')
tw5.head(2)

16:30:32 Starting copy...
16:30:32 Converting to datetime...
16:30:32 Setting up new indices...
16:30:32 Deleting old "createdAt" column...
16:30:32 Done!


Unnamed: 0_level_0,id,userId,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,hashtag
createdAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-02-23 09:59:41,9519737890,14657900.0,"Magic spells run off after midnight, I guess s...",6.1387,46.175,,,1,,,,Twitter for iPhone,http://twitter.com/#!/download/iphone,Javier Belmonte,vichango,167,277.0,2885.0,"Geneva, Switzerland",[fb]
2010-02-23 11:28:27,9521789689,9962020.0,"Limitas of public transportation! No taxi, rai...",6.33641,46.4631,,,550,,,,Gowalla,http://gowalla.com/,Thomas Winter,thwinter,1070,1359.0,3349.0,Hettlingen CH / SanJose Ca,[yam]


In [13]:
tw5['hashtag'][:6]

createdAt
2010-02-23 09:59:41               [fb]
2010-02-23 11:28:27              [yam]
2010-02-23 17:47:11          [24, vfb]
2010-02-23 18:19:03    [iphoneography]
2010-02-23 18:31:46     [partnermonth]
2010-02-24 06:09:23      [insider, fb]
Name: hashtag, dtype: object

## Let's put one hashtag per row

We will make a dataframe with one row = one hashtag. This will be done by going through the dataframe, and making in parallel a list of rows (with 1 hashtag per row) that needs to be added to the old dataframe.

In [14]:
addedHashtagsRowsList = []
def multiplyHashtagRows(row, columns):
    '''
    Examine each row. If there are multiple hashtags, it will return the first one.
    (so the first one will replace the list of hashtags in the df). Then for all the next ones,
    it will make a copy of the row in the addedHashtagsRowsList, (in a dictionary format).
    So this dictionary can in the end be transformed in a DF and added to the original DF.
    (The speed is increased a lot by doing it this way!)
    '''
    htList = row.hashtag
    if len(htList) > 1:
        ## Making the dictionary
        addedHashtag = {}
        addedHashtag['createdAt'] = row.name #the df index
        for col in columns:
            addedHashtag[col] = row[col]
        ## Copying the dict for each hashtag
        i = 1
        while i < len(htList) :
            deepCopy = copy.deepcopy(addedHashtag)
            deepCopy['hashtag'] = htList[i]
            addedHashtagsRowsList.append(deepCopy)
            i+=1
    return htList[0] # return the first hashtag

In [15]:
addedHashtagsRowsList = []
tw5_1 = tw5.copy()
pr('Multiplying the hashtag rows... (around 30min)')
tw5_1['hashtag'] = tw5.apply(multiplyHashtagRows, args=[tw5.columns,], axis=1)
pr('Finished! {} rows will be added to the dataframe!'.format(len(addedHashtagsRowsList)))

16:30:32 Multiplying the hashtag rows... (around 30min)
16:30:37 Finished! 8887 rows will be added to the dataframe!


In [16]:
pr('Starting to make the new dataframe with additionnal rows..')
addedHashtagsDf = pd.DataFrame(addedHashtagsRowsList)
addedHashtagsDf.set_index(['createdAt'], inplace=True)
pr('Starting to append the two df... Old df size = {}'.format(len(tw5_1)))
tw6 = tw5_1.append(addedHashtagsDf)
pr('Done! New df size = {}'.format(len(tw6)))

16:30:37 Starting to make the new dataframe with additionnal rows..
16:30:38 Starting to append the two df... Old df size = 19719
16:30:38 Done! New df size = 28606


In [17]:
print('Example hahshtag:')
tw6[tw6['hashtag'] == addedHashtagsRowsList[0]['hashtag']].head(3)

Example hahshtag:


Unnamed: 0_level_0,followersCount,friendsCount,hashtag,id,inReplyTo,latitude,longitude,placeId,placeLatitude,placeLongitude,screenName,source,sourceName,sourceUrl,statusesCount,text,truncated,userId,userLocation,userName
createdAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2011-07-24 16:03:13,1121,763.0,vfb,95162125584039936,9.501397e+16,,,e401fb8eb4e7595a,47.5356,9.14004,eLd0raDo,14,Tweetbot for iOS,http://tapbots.com/tweetbot,6735.0,"@mikstweed Wie lange ist es her, dass der #vfb...",,921241,"Kreuzlingen, TG, Switzerland",Markus Tressl
2010-02-23 17:47:11,1121,763.0,vfb,9535390586,,47.6463,9.1657,,,,eLd0raDo,550,Gowalla,http://gowalla.com/,6735.0,"So, Feierabend. Jetzt #24 und später #VfB. — a...",,921241,"Kreuzlingen, TG, Switzerland",Markus Tressl
2010-12-01 17:25:56,1317,830.0,vfb,10021795398684672,,46.9499,7.47071,e38a1a641d02f8db,46.9543,7.39491,chm,1,Twitter for iPhone,http://twitter.com/#!/download/iphone,13552.0,"So, und jetzt Flachmann suchen und dann ab ins...",,120433,"Bern, Switzerland",chm


In [18]:
tw6.hashtag.head()

createdAt
2010-02-23 09:59:41               fb
2010-02-23 11:28:27              yam
2010-02-23 17:47:11               24
2010-02-23 18:19:03    iphoneography
2010-02-23 18:31:46     partnermonth
Name: hashtag, dtype: object

## Aggregation of rows by date

In [19]:
tw6.dropna(subset=['longitude'], inplace=True)
tw6.dropna(subset=['latitude'], inplace=True)

In [20]:
tw6.latitude = tw6.latitude.apply(float)
tw6.longitude = tw6.longitude.apply(float)

In [21]:
delimiter = '_$$$_'
str_join = lambda x: delimiter.join(x)

Function that applies to a dataframe will group each row by day and aggregate all its content:

In [22]:
def aggDate(df):
    groupedDf = df.groupby(df.index.map(lambda x: x.date)).agg({'text' : str_join,
                                                          'longitude' : np.median,
                                                          'latitude' : np.median,
                                                          'hashtag' : lambda x: x.iloc[0], ## the first occurence
                                                          'numberOfTweets' : 'count'})
    return groupedDf

In [23]:
pr('Making column number of tweets')
tw6['numberOfTweets'] = 1
pr('Starting group by hastag...')
gp = tw6.groupby('hashtag')
pr('Starting to put hashtag in dictionary... (around 6.5 hours?!?)')
dictionary = {hashtag : aggDate(df) for hashtag, df in gp}
pr('Finished operations! Dictionary with {} different hashtags.'.format(len(dictionary)))

16:30:38 Making column number of tweets
16:30:38 Starting group by hastag...
16:30:38 Starting to put hashtag in dictionary... (around 6.5 hours?!?)
16:31:40 Finished operations! Dictionary with 9410 different hashtags.


In [48]:
# aggDate(gp.get_group('twibisg')).head(3)
# dictionary['twibisg'].head(3)
print('Dictionary with hashtags dataframes:')
dictionary[list(dictionary.keys())[5]].head(4)

Dictionary with hashtag:


Unnamed: 0,text,hashtag,numberOfTweets,longitude,latitude,event
2011-09-02,#ebc11 #OBA (@ OLMA-Messen) http://t.co/LC8wcR...,ebc11,5,9.38398,47.4317,False
2011-09-03,Nach der gestrigen #ebc11 heute also Vortrag z...,ebc11,1,9.38398,47.432,False


## Starting event detection

In [25]:
## Parameters of an event:
MIN_TOT_NB_TWEETS = 20 ## The hashtag must have happened at least this number of times to be considered.
MIN_NB_DAYS_WITH_HASHTAGS = 3 ## The hashtags must appear at least this number of different days to be considered.
MIN_NB_TWEETS_DURING_EVENT = 5 ## To be considered an event, the hashtag must happen at least this nb of times during the day.
THRESHOLD_ANOMALY_FACTOR = 2 ## The occurence of a hashtag during a single day must be above the mean by this FACTOR
                             ## multiplied by the std to be considered as an anomaly.

In [26]:
pr('Starting to compute {} dict items to detect event.'.format(len(dictionary)))
nbOfEventDetected = 0
count = 0
printingValue = int(len(dictionary) / 10)
for [h,df] in dictionary.items():
    count += 1
    if count % printingValue == 0:
        pr("{:.0f}%".format(count/len(dictionary)*100))
    df['event'] = False
    if len(df) > MIN_NB_DAYS_WITH_HASHTAGS:
        if df['numberOfTweets'].sum() > MIN_TOT_NB_TWEETS:
            threshold = df['numberOfTweets'].mean() + THRESHOLD_ANOMALY_FACTOR * df['numberOfTweets'].std()
            df['event'] = df.numberOfTweets.apply(lambda x: x > threshold and x > MIN_NB_TWEETS_DURING_EVENT)
            nbOfEventDetected += len(df[df['event']])
pr('Finished! Number of events detected = {}'.format(nbOfEventDetected))

16:31:40 Starting to compute 9410 dict items to detect event.
16:31:41 10%
16:31:41 20%
16:31:42 30%
16:31:42 40%
16:31:43 50%
16:31:43 60%
16:31:44 70%
16:31:44 80%
16:31:45 90%
16:31:46 100%
16:31:46 Finished! Number of events detected = 43


## Making single event dataframe

In [31]:
eventRowsList = []
def applyToMakeEventDf(row):
    if row.event:
        rowToAdd = {'date': row.name, 'hashtag': row.hashtag, 'text': row.text,
                    'longitude': row.longitude, 'latitude':row.latitude, 'numberOfTweets': row.numberOfTweets, }
        eventRowsList.append(rowToAdd)

In [32]:
eventRowsList = []
count = 0
printingValue = int(len(dictionary) / 10)

pr('Starting to make event df with {} dataframes. (around 30 min)'.format(len(dictionary)))
for h, df in dictionary.items():
    count += 1
    if count % printingValue == 0:
        pr("{:.0f}%".format(count/len(dictionary)*100))
    df.apply(applyToMakeEventDf, axis=1)

pr('Making new dataframe.')
new_events = pd.DataFrame(eventRowsList)
new_events.set_index(['date'], inplace=True)
pr('Finished! Dataframe with {} rows'.format(len(new_events)))

16:31:46 Starting to make event df with 9410  df. (around 30 min)
16:31:47 10%
16:31:47 20%
16:31:48 30%
16:31:49 40%
16:31:50 50%
16:31:50 60%
16:31:51 70%
16:31:52 80%
16:31:52 90%
16:31:53 100%
16:31:53 Making new dataframe.
16:31:53 Finished! Dataframe with 43 rows


In [51]:
print('Events dataframe:')
new_events.head(10)

Events dataframe:


Unnamed: 0_level_0,hashtag,latitude,longitude,numberOfTweets,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-05-27,velo,47.1438,8.89606,13,20 % progress …Pause! #Velo #Heimfahrt_$$$_25 ...
2010-07-30,iphone4,47.3754,8.5388,10,"Ok, here's the queue. 05:56 CET #iPhone4 #aszh..."
2011-08-09,twibisg,47.4841,9.54905,6,genau die gleiche frage wollte ich auch an @va...
2011-03-25,ipad2,47.3751,8.53896,41,@greezer der @ThBenkoe hat's schon #iPad2 ht...
2011-10-23,ew11,47.4077,8.68466,14,bin jetzt schon genervt #ew11_$$$_ich lass mir...
2011-08-02,android,47.5,8.75,6,@AniseSmith oh no! But that means I have to b...
2010-07-07,ger,47.4769,8.303375,8,Bin so langsam doch etwas nervös... #ger :-)_$...
2010-12-08,switzerland,47.3638,8.53493,9,India's cell-phone marketing: 545 million hand...
2011-09-07,smgzh,47.3649,8.54687,13,"Läck! Ist das früh heute... #smgzh_$$$_#smgzh,..."
2010-07-03,twittboat,47.3756,8.542245,12,Jetzt höre ich schon NRJ um mich mental auf al...


In [56]:
print('Linked dataframe of all days:')
dictionary[new_events.iloc[0].hashtag].head(10)

Linked dataframe of all days:


Unnamed: 0,text,hashtag,numberOfTweets,longitude,latitude,event
2010-03-04,Ich habe wohl ein Rad ab\n#Velo #Reparatur #al...,velo,1,8.6468,46.8578,False
2010-05-28,Fahr doch am Strassenrad du blöde Kuh! #velo #...,velo,1,7.44723,46.9544,False
2010-06-28,"so scheissen alle auf ein ""richterliches Verbo...",velo,1,7.49171,46.9237,False
2010-11-11,ach es regnet.. wie nett.. #velo,velo,1,8.54675,47.4003,False
2011-05-27,20 % progress …Pause! #Velo #Heimfahrt_$$$_25 ...,velo,13,8.89606,47.1438,True
2011-05-29,nach Hase mit dem #Velo nr. 2,velo,1,8.95836,47.1854,False
2011-06-12,PowerBar-Ergänzungsnahrung @ Andermatt #velo ...,velo,1,8.59601,46.6348,False
2011-07-03,.@ursbucher genau... und jetzt schon am walens...,velo,1,9.25076,47.1131,False
2011-07-09,hüpft zur abwechslung wieder mal aufs #velo h...,velo,1,9.45728,46.8507,False
2011-07-15,Schaltung gerissen - öfters mal was neues... #...,velo,1,8.74485,47.5002,False


## Exporting data

In [57]:
e_df = new_events.copy()
e_df['date'] = e_df.index
e_df.index = [i for i in range (len(e_df))]
e_df.head(1)

Unnamed: 0,hashtag,latitude,longitude,numberOfTweets,text,date
0,velo,47.1438,8.89606,13,20 % progress …Pause! #Velo #Heimfahrt_$$$_25 ...,2011-05-27


In [38]:
e_gb = e_df.groupby(e_df.date)

In [39]:
'latitude:{0}, longitude:{1}, tweets: [{2}], number_of_tweets : {3} '.format('a', 'b', 'c', 'd')

'latitude:a, longitude:b, tweets: [c], number_of_tweets : d '

In [40]:
tweet_unit_template = 'name:{0}, latitude : {1}, longitude : {2}, \
            tweets: [{3}], number_of_tweets : {4}'

data_template = 'data: [{0}]' # put 'tweet_unit, ...'
event_template = 'date:{0}, {1}' #put data to {1}

In [60]:
events = []
pr('Making event list...')
for date, df in e_gb:
        units = []
        for i in range (len(df)):
            ht = df.iloc[i]['hashtag'].split(delimiter)[0]
            lat = df.iloc[i]['latitude']
            lon = df.iloc[i]['longitude']
            t_num = df.iloc[i]['numberOfTweets']
            tweets = df.iloc[i]['text'].replace(',', '').split(delimiter)
            
            data_unit = { 'name': ht
                         , 'latitude' : lat
                         , 'longitude' : lon
                         , 'tweets' : tweets
                         , 'number_of_tweets' : t_num }
            units.append(data_unit)
        
        date_dict = {}
        #date_json = 'date: {0}'.format(date)
        #data_json = data_template.format(','.join(units))
        curr_event = {'date': date.isoformat(), 'data': units}
        events.append(curr_event)

final_events = {'events' : events}
pr('Finished.')

16:44:16 Making event list...
16:44:16 Finished.


In [59]:
final_events

{'events': [{'data': [{'latitude': 46.9377,
     'longitude': 7.4191899999999995,
     'name': 'fb',
     'number_of_tweets': 6,
     'tweets': ['watching a classic 1979\'s Alien: "In space no one can hear you scream" .. right after breakfast!! [=o)] #fb',
      'On my way to Fotobörse at Kursaal Bern and then meeting with potential client. #fb',
      'Very good and inspiring client meeting this afternoon. Have to plan some shootings now over the next weeks. #fb',
      'Hmm hat jemand von euch "das weisse Band" gesehen ... und verstanden? #fb',
      "also going to bed now.. it doesn't get less original [=oP] #fb",
      'Erste Folge der zweiten Staffel von Fringe gesehen! ziemlich interessant und spannend!! #fringe #serienjunkie #fb']}],
   'date': '2010-03-07'},
  {'data': [{'latitude': 47.378799999999998,
     'longitude': 8.5380300000000009,
     'name': 'sui',
     'number_of_tweets': 25,
     'tweets': ['#sui wonderful',
      "en plus du goal j'aurai mis rouge ... Hop #sui !!!

In [61]:
pr('Exporting to json...')
with open('data/export.json', 'w') as f:
     json.dump(final_events, f)
pr('Export done.')

16:44:20 Exporting to json...


TypeError: 6 is not JSON serializable