In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt
from functools import reduce
import math
from collections import Counter
import csv
from datetime import datetime
%matplotlib inline

In [2]:
def now():
    return str(datetime.now().time())[:8]
def pr(strToPrint):
    print(now() + ' '+ strToPrint)

from IPython.display import Audio
sound_file = 'beep.wav'
# Used by other groups: pd.read_csv(data_path, sep="\t",encoding='utf-8',  escapechar='\\', quoting=csv.QUOTE_NONE, header=None, na_values='N')

In [3]:
pickle_filename = os.path.join('data','head_100k_pickle.pkl')
tw = pd.read_pickle(pickle_filename)

In [4]:
tw.shape

(1000000, 20)

In [5]:
def extract_hashtags(text):
    if '#' not in text:
        return None
    return [tag.strip("#") for tag in text.split() if tag.startswith("#")]

In [6]:
tw['hashtag'] = np.nan
tw.head()
tw.hashtag = tw.text.apply(lambda x: extract_hashtags(str(x)))
twh = tw[pd.notnull(tw['hashtag'])]
len(twh)

79537

In [7]:
tw1 = twh.dropna(axis=0, how='any', subset=['text', 'createdAt'])
print('The data have been reduced from {} tweets to {} tweets.'.format(len(twh), len(tw1)))

The data have been reduced from 79537 tweets to 79537 tweets.


In [8]:
pr('Removing bad dates...')
twhCleanDate = tw1[tw1['createdAt'].str.len() == 19]
pr('Finished.')

10:33:13 Removing bad dates...
10:33:13 Finished.


In [9]:
pr('Starting to examine dates...')
import warnings
warnings.filterwarnings('ignore')
datetime_serie = twhCleanDate['createdAt'].convert_objects(convert_dates='coerce')
dateNotConvertible = datetime_serie[pd.isnull(datetime_serie)]
warnings.filterwarnings('default')
pr('There are {} dates that cannot be transformed.'.format(len(dateNotConvertible)))

10:33:21 Starting to examine dates...
10:33:21 There are 0 dates that cannot be transformed.


In [10]:
pr('Starting copy...') # (to avoid transformation problems)
tw5 = twhCleanDate.copy()
pr('Converting to datetime...')
tw5['createdAt'] = pd.to_datetime(twhCleanDate['createdAt'])
pr('Setting up new indices...')
tw5.index = tw5['createdAt']
pr('Deleting old "createdAt" column...')
del tw5['createdAt']
pr('Done!')
tw5.head(2)

10:33:21 Starting copy...
10:33:21 Converting to datetime...
10:33:21 Setting up new indices...
10:33:21 Deleting old "createdAt" column...
10:33:21 Done!


Unnamed: 0_level_0,id,userId,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,hashtag
createdAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-02-23 09:59:41,9519737890,14657884.0,"Magic spells run off after midnight, I guess s...",6.1387,46.175,,,1.0,,,,,,,,,,,,[fb]
2010-02-23 11:28:27,9521789689,9962022.0,"Limitas of public transportation! No taxi, rai...",6.33641,46.4631,,,550.0,,,,,,,,,,,,[yam]


In [11]:
twt = tw5.head()
twt.hashtag

createdAt
2010-02-23 09:59:41               [fb]
2010-02-23 11:28:27              [yam]
2010-02-23 17:47:11         [24, VfB.]
2010-02-23 18:19:03    [iPhoneography]
2010-02-23 18:31:46     [partnermonth]
Name: hashtag, dtype: object

In [12]:
for [i, h] in [twt.index, twt.hashtag]:
    print(i)

ValueError: too many values to unpack (expected 2)

In [13]:
tw5.hashtag = tw5.hashtag.apply(lambda x: ', '.join(x))
tw5.hashtag.head()

createdAt
2010-02-23 09:59:41               fb
2010-02-23 11:28:27              yam
2010-02-23 17:47:11         24, VfB.
2010-02-23 18:19:03    iPhoneography
2010-02-23 18:31:46     partnermonth
Name: hashtag, dtype: object

In [16]:
dictionary = dict()
gp = tw5.groupby('hashtag')
gp.get_group('Hollande')

Unnamed: 0_level_0,id,userId,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,hashtag
createdAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2012-01-23 07:59:09,1.6135728719213363e+17,16558056.0,"@ParisMatch comme il se doit ?? Pourquoi, Fran...",6.63335,46.5213,6c07f3233c333f95,1.611614e+17,1.0,,,,,,,,,,,,Hollande
2012-04-20 11:29:39,1.933e+17,83416937.0,"“@Marion_Z : ""Les françaises sont belles mais ...",6.8658,46.5243,5a54953440569858,,24.0,,,,,,,,,,,,Hollande
2012-04-22 18:31:54,1.9413142924283085e+17,465903153.0,Mtn imagine Hollande il va au Pays-Bas. A l'aé...,6.91641,46.6871,05e230dbb91aa51f,,5.0,,,,,,,,,,,,Hollande
2012-04-22 19:24:09,1.9414458142255923e+17,551068304.0,je pense que c'est génial que #Hollande ne par...,,,c3a6437e1b1a726d,,2.0,,,,,,,,,,,,Hollande
2012-04-26 20:14:49,1.9560688327552205e+17,50337597.0,@SARKOZY_2012 président franchement il est pl...,6.60217,46.5868,fdeba41ff7f777ff,,1.0,,,,,,,,,,,,Hollande
2012-05-01 12:44:56,1.9730560367946138e+17,83416937.0,"@nk_m traité sournoisement #Hollande de : ""Ren...",6.86625,46.5242,5a54953440569858,,24.0,,,,,,,,,,,,Hollande
2012-05-01 12:47:08,1.9730615847263846e+17,83416937.0,"@nk_m traite sournoisement #Hollande de : ""Ren...",6.86582,46.5242,5a54953440569858,,24.0,,,,,,,,,,,,Hollande
2012-05-02 10:02:56,1.9762722538481664e+17,326162754.0,#Hollande sait que s'il remonte la TVA sur la ...,6.16987,47.6283,e5695aeb3c96effd,,20.0,,,,,,,,,,,,Hollande
2012-05-02 11:34:47,1.976503382018621e+17,326162754.0,"""@zebulon577: La secrétaire de Francois #Holl...",6.43175,47.6458,9b6e8fa88c5750bf,,20.0,,,,,,,,,,,,Hollande
2012-05-02 17:40:45,1.977424364900311e+17,326162754.0,#Hollande ne veut pas toucher a la taxe sur le...,6.13542,47.6222,6aaabd05b778b2f6,,20.0,,,,,,,,,,,,Hollande


In [15]:
for hashtag in tw5.hashtag:
    if len(gp.get_group(hashtag)) > 5:
        dictionary[hashtag] = gp.get_group(hashtag)

In [62]:
len(dictionary)

1319

In [63]:
t = 0
for [k,ts] in dictionary.items():
    t = t + 1
    if (t%1000 == 0):
        print(t)
    for [i,v] in ts['id'].resample('D').count().iteritems():
        limit = ts['id'].resample('D').count().mean() + 3 * ts['id'].resample('D').count().std()
        if v >= limit:
            print("For", k, ":")
            print("    event at " , i)

For  :
    event at  2012-04-19 00:00:00
For  :
    event at  2012-04-20 00:00:00
For  :
    event at  2012-04-21 00:00:00
For  :
    event at  2012-04-22 00:00:00
For  :
    event at  2012-04-23 00:00:00
For  :
    event at  2012-04-24 00:00:00
For  :
    event at  2012-04-25 00:00:00
For  :
    event at  2012-05-02 00:00:00
For  :
    event at  2012-05-05 00:00:00
For  :
    event at  2012-05-06 00:00:00
For  :
    event at  2012-05-07 00:00:00
For  :
    event at  2012-05-08 00:00:00
For  :
    event at  2012-05-09 00:00:00
For  :
    event at  2012-05-10 00:00:00
For  :
    event at  2012-05-11 00:00:00
For  :
    event at  2012-05-13 00:00:00
For  :
    event at  2012-05-15 00:00:00
For  :
    event at  2012-05-16 00:00:00
For  :
    event at  2012-05-17 00:00:00
For  :
    event at  2012-05-19 00:00:00
For  :
    event at  2012-05-20 00:00:00
For argh :
    event at  2012-05-02 00:00:00
For CL :
    event at  2010-03-10 00:00:00
For CL :
    event at  2012-03-13 00:00:00
For CL :

KeyboardInterrupt: 