We use <b> chi squared score </b> to measure the trend in the twitter. 
The formula , use E for Expectation, and O for observation<br>
$$
\chi^2 \text{score}= \left\{\begin{array}{lr}
        \frac{(O-E)^2}{E}, & \text{if } O > E\\
        0, & \text{otherwise }
        \end{array}
        \right.
$$

Zero division error will araise if E = 0, so we subsititude E with 1 in this case. <br>

As some low frequency terms may have very large chi squared score, a threshold of 15 is used to
filter these hashtags.

In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import json
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import time
import pickle
import collections
import os
import re

In [3]:
with open("data/twitter-swisscom/twitter/df1_reduced.pickle", "rb") as f:
    df1 = pickle.load(f)

In [4]:
with open("data/twitter-swisscom/twitter/df2_reduced.pickle", "rb") as f:
    df2 = pickle.load(f)

In [5]:
with open("data/twitter-swisscom/twitter/df3_reduced.pickle", "rb") as f:
    df3 = pickle.load(f)

In [6]:
with open("data/twitter-swisscom/twitter/df4_reduced.pickle", "rb") as f:
    df4 = pickle.load(f)

In [7]:
with open("data/twitter-swisscom/user/user_total.pickle", "rb") as f:
    user = pickle.load(f)

In [8]:
df4['day'] = df4['createdAt'].apply(lambda x: x[:10])

## Base and Tags

Use df1, df2, df3 (date before 2015) as base, the df4 is investigate to detect changes.

In [1]:
def count_hash_tags(series):
    """ Count the number of hashtags (in lower cases)"""
    d = collections.defaultdict(int)
    
    def count_a_list(l):
        for i in l:
            d[i.lower()] += 1
            
    series.apply(count_a_list)
    return d

In [10]:
d1 = count_hash_tags(df1['hashtags'])
d2 = count_hash_tags(df2['hashtags'])
d3 = count_hash_tags(df3['hashtags'])
d4 = count_hash_tags(df4['hashtags'])

Merge dictionaries

In [11]:
num_base_twitters = df1.shape[0] + df2.shape[0] + df3.shape[0]

In [12]:
base_dict = collections.defaultdict(int, {k: d1[k] + d2[k] + d3[k] for k in set(d1) | set(d2)| set(d3)})

In [2]:
def sort_dict_values(d, reverse=False):
    """sort dictionary by their values"""
    return sorted(d.items(), key=lambda x: x[1], reverse=reverse)

# Detect Topic in Dict

In [14]:
from scipy.stats import chisquare

In [4]:
def topic_detection_by_chi_square(past, future, N, future_length, n=20):
    """ Detect topics/events using chi squared score
    
    First compute the expected number from population `past`. If the expected
    value is 0, we use 1 instead. If the expected number of a topic is smaller than
    the one we observed (future), a potential event appears. Then we compute the 
    chi squared scores and sort them in the descending order.
    """
    all_keys = set(future.keys()) | set(past.keys())
    chi_squared_score = collections.defaultdict(float)

    for k,v in future.items():
        if N == 0:
            expected = past[k] * future_length or 1
        else:        
            expected = past[k] / N * future_length or 1
            
        if v > expected and v > n:
            # -0.5 is the Yates' correction to filter the low frequencies terms.
            chi_squared_score[k] = (np.abs(expected - v)-0.5) ** 2 / expected
        
    return sort_dict_values(chi_squared_score, True)

# Overal Topic Trend

In [16]:
total_N = num_base_twitters

In [17]:
total_base_dict = base_dict.copy()

In [5]:
def overal_topic(date_df, total_base_dict, total_N, n=20):
    """compute all the chi squared score from base datasets to a day's dataset"""
    num_twitters_in_a_day = date_df.shape[0]
    future = count_hash_tags(date_df.hashtags)
    
    chi_squared_dict = topic_detection_by_chi_square(total_base_dict, future, total_N, num_twitters_in_a_day, n)
    
    for k,v in future.items():
        total_base_dict[k] += v

    return chi_squared_dict[:5], total_N + num_twitters_in_a_day

In [21]:
days = df4.day.unique()

In [20]:
v = []
for i, day in zip(range(10000), days):
    df = df4[df4.day == day]
    a = [day]
    result, total_N = overal_topic(df, total_base_dict, total_N)
    a.extend(result)
    v.append(a)

KeyboardInterrupt: 

In [344]:
overall_topics = pd.DataFrame(v)

In [700]:
overall_topics.head()

Unnamed: 0,date,1,2,3,4,5
0,2014-12-28,"(davidebraccialetti, 123261065.614)","(braccialettirossi, 1090408.47591)","(switzerlandneedsotrat, 3840.32787285)","(atamadafelsefeyiunutma, 1332.25)","(switzerland, 14.3109812746)"
1,2014-12-29,"(switzerlandneedsotratour, 33731885.168)","(wewantotrainitaly2015, 59691.9480146)","(yeniturkiyei̇nsasindafelsefe, 5852.25)","(switzerlandneedsotrat, 1447.49075187)","(nashschristmasskit, 1447.30046921)"
2,2014-12-30,"(switzerlandneeds1dotrat, 4787810.32671)","(20thingsiwantfor2015, 1301800.37258)","(switzerlandneedsotratour, 469414.364522)","(neolieberswag, 17131.6748847)","(hollywoodmusicawards, 12773.2957978)"
3,2014-12-31,"(weneedotrainitaly2015, 825372.25)","(openrafahborder, 805625.184103)","(2015, 103923.947519)","(happynewyear, 35881.7318057)","(20thingsiwantfor2015, 29640.7442074)"
4,2015-01-01,"(ilovesuperjunior, 924482.25)","(ilovebts, 170625.650474)","(dmmedagi, 117581.106526)","(2015, 77109.7463785)","(happynewyear, 14038.5748095)"


In [349]:
overall_topics.to_csv("web/Analysis/overall_topics.csv", index=False)

In [28]:
l = []
for day in days:
    sub_df = df4[df4.day == day]
    candidate = df.loc[day][1]
    print(day)
    if not isinstance(candidate, str):
        l.append([day, None, None])
        continue

    pair = eval(df.loc[day][1])
    hashtag_name = pair[0]
    sub_df1 = sub_df['hashtags'].apply(lambda x: hashtag_name in x)
    a = sub_df[sub_df1]['placeId'].value_counts()
    
    l.append([day, a.index[0], a[0]])

2014-12-28
2014-12-29
2014-12-30
2014-12-31
2015-01-01
2015-01-02
2015-01-03
2015-01-04
2015-01-05
2015-01-06
2015-01-07
2015-01-08
2015-01-09
2015-01-10
2015-01-11
2015-01-12
2015-01-13
2015-01-14
2015-01-15
2015-01-16
2015-01-17
2015-01-18
2015-01-19
2015-01-20
2015-01-21
2015-01-22
2015-01-23
2015-01-24
2015-01-25
2015-01-26
2015-01-27
2015-01-28
2015-01-29
2015-01-30
2015-02-08
2015-02-09
2015-02-10
2015-02-11
2015-02-12
2015-02-13
2015-02-14
2015-02-15
2015-02-16
2015-02-17
2015-02-18
2015-02-19
2015-02-20
2015-02-21
2015-02-22
2015-02-23
2015-02-24
2015-02-25
2015-02-26
2015-02-27
2015-02-28
2015-03-01
2015-03-02
2015-03-03
2015-03-04
2015-03-05
2015-03-06
2015-03-07
2015-03-08
2015-03-09
2015-04-22
2015-04-23
2015-04-24
2015-04-25
2015-04-26
2015-04-27
2015-04-28
2015-04-29
2015-04-30
2015-05-01
2015-05-02
2015-05-03
2015-05-04
2015-05-05
2015-05-06
2015-05-07
2015-05-08
2015-05-09
2015-05-10
2015-05-11
2015-05-12
2015-05-13
2015-05-14
2015-05-15
2015-05-16
2015-05-17
2015-05-18

In [22]:
df = pd.read_csv("web/Analysis/overall_topics.csv")

In [23]:
df.set_index('date', inplace=True)

In [24]:
df['place'] = df['1'].apply(lambda x: '')

In [25]:
df['has_event'] = df['1'].apply(lambda x: True)

In [26]:
df4['hashtags'] = df4['hashtags'].apply(lambda x: [i.lower() for i in x])

In [27]:
df.head()

Unnamed: 0_level_0,1,2,3,4,5,place,has_event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-12-28,"('davidebraccialetti', 123261065.61367945)","('braccialettirossi', 1090408.4759098201)","('switzerlandneedsotrat', 3840.3278728453806)","('atamadafelsefeyiunutma', 1332.25)","('switzerland', 14.310981274630031)",,True
2014-12-29,"('switzerlandneedsotratour', 33731885.168040439)","('wewantotrainitaly2015', 59691.948014615453)","('yeniturkiyei̇nsasindafelsefe', 5852.25)","('switzerlandneedsotrat', 1447.4907518659709)","('nashschristmasskit', 1447.300469211865)",,True
2014-12-30,"('switzerlandneeds1dotrat', 4787810.3267061366)","('20thingsiwantfor2015', 1301800.3725766602)","('switzerlandneedsotratour', 469414.36452214647)","('neolieberswag', 17131.67488468088)","('hollywoodmusicawards', 12773.295797828736)",,True
2014-12-31,"('weneedotrainitaly2015', 825372.25)","('openrafahborder', 805625.18410325912)","('2015', 103923.94751935355)","('happynewyear', 35881.731805671974)","('20thingsiwantfor2015', 29640.744207429976)",,True
2015-01-01,"('ilovesuperjunior', 924482.25)","('ilovebts', 170625.6504743275)","('dmmedagi', 117581.10652550224)","('2015', 77109.746378452517)","('happynewyear', 14038.574809500005)",,True


In [30]:
topic_geo_infomation = pd.DataFrame(l, columns=['date', 'placeId', 'place_value'])

In [31]:
with open("data/twitter-swisscom/geo/geo_info_total.pickle", "rb") as h:
    geo = pickle.load(h)

In [32]:
geo.head()

1,placeId,placeLatitude,placeLongitude
0,000831c517105356,16.9866,103.94
1,000a93ad12003aaa,46.8911,7.51217
2,000b5d1aada9dcaa,46.9525,7.4459
3,0010c7694b04e371,47.4261,8.50569
4,0013241b7342de79,47.0462,8.30735


In [33]:
trend = pd.merge(df.reset_index(), topic_geo_infomation, on='date')

In [34]:
d = pd.merge(trend, geo, on='placeId', how='left').set_index('date')
d['has_event'] = d['1'].apply(lambda x: type(x) == str)

In [35]:
export = d[['1', 'has_event', 'place_value', 'placeLatitude', 'placeLongitude']]

In [36]:
export1 = export[export.has_event].copy()

In [37]:
export1.head()

Unnamed: 0_level_0,1,has_event,place_value,placeLatitude,placeLongitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-12-28,"('davidebraccialetti', 123261065.61367945)",True,192.0,46.3293,9.4055
2014-12-29,"('switzerlandneedsotratour', 33731885.168040439)",True,22.0,46.2504,8.27646
2014-12-30,"('switzerlandneeds1dotrat', 4787810.3267061366)",True,10.0,46.2219,6.18194
2014-12-31,"('weneedotrainitaly2015', 825372.25)",True,34.0,47.3774,8.53676
2015-01-01,"('ilovesuperjunior', 924482.25)",True,70.0,47.7218,7.28274


In [38]:
export1['topic'] = export1['1'].apply(lambda x: eval(x)[0])

In [39]:
export1['chi_squared'] = export1['1'].apply(lambda x: eval(x)[1])

In [42]:
export1[['place_value', 'placeLatitude', 'placeLongitude', 'topic', 'chi_squared']].to_csv("event_detection.csv")

# Topic Trend in different place

#### First create a dictionary of placeId to their all their hashtags

In [21]:
base1 = df1[['placeId', 'hashtags']].groupby('placeId').aggregate(lambda x: [i for l in x for i in l ])

In [22]:
base2 = df2[['placeId', 'hashtags']].groupby('placeId').aggregate(lambda x: [i for l in x for i in l ])

In [23]:
base3 = df3[['placeId', 'hashtags']].groupby('placeId').aggregate(lambda x: [i for l in x for i in l ])

In [24]:
twtter_count1 = df1.groupby('placeId').count()[['id']]
twtter_count2 = df2.groupby('placeId').count()[['id']]
twtter_count3 = df3.groupby('placeId').count()[['id']]

In [25]:
twtter_count1.columns = ['base_twitter_count']
twtter_count2.columns = ['base_twitter_count']
twtter_count3.columns = ['base_twitter_count']

In [26]:
twitter_count = pd.concat([twtter_count1, twtter_count2, twtter_count3], axis=1).fillna(0).apply(lambda x: np.sum(x), axis=1)

In [27]:
twitter_count = pd.DataFrame(twitter_count, columns=['base_twitter_count'])

#### Merge three part of data

In [28]:
c = pd.concat([base1, base2, base3], axis=1)

drop na

In [29]:
c.head()

Unnamed: 0,hashtags,hashtags.1,hashtags.2
000831c517105356,,,[]
000a93ad12003aaa,"[Zurich, drs3, wikipedia, gdi, fb, Basel, FCB,...","[Kreis11, Zürich, myclimate, Abflug, Amsterdam...","[GenevaMotorShow, PoloStMoritz2014, polo, resi..."
000b5d1aada9dcaa,"[jäten, creativemornings, SCB, HCFG, telezüri,...",,
0010c7694b04e371,[],,
0013241b7342de79,[],[],


In [30]:
c = c.applymap(lambda x: x if isinstance(x, list) else [])

In [31]:
base_dict_by_place = pd.DataFrame(c.apply(lambda x: [i for l in x for i in l], axis=1), columns=['hashtags'])

In [32]:
base_dict_by_place.head()

Unnamed: 0,hashtags
000831c517105356,[]
000a93ad12003aaa,"[Zurich, drs3, wikipedia, gdi, fb, Basel, FCB,..."
000b5d1aada9dcaa,"[jäten, creativemornings, SCB, HCFG, telezüri,..."
0010c7694b04e371,[]
0013241b7342de79,[]


In [33]:
base_dict_by_place['dict'] = base_dict_by_place['hashtags'].apply(lambda x: count_hash_tags(pd.Series([x])))

In [34]:
base_dict_by_place['count'] = base_dict_by_place['hashtags'].apply(lambda x: len(x))

In [35]:
base_dict_by_place = pd.concat([twitter_count, base_dict_by_place], axis=1)

In [36]:
base_dict_by_place.head()

Unnamed: 0,base_twitter_count,hashtags,dict,count
000831c517105356,1.0,[],{},0
000a93ad12003aaa,2580.0,"[Zurich, drs3, wikipedia, gdi, fb, Basel, FCB,...","{'g07f': 1, 'windowsphone81': 1, 'g07f7': 1, '...",779
000b5d1aada9dcaa,46.0,"[jäten, creativemornings, SCB, HCFG, telezüri,...","{'itunes': 1, 'flgi': 1, 'talktäglich': 1, 'ta...",32
0010c7694b04e371,2.0,[],{},0
0013241b7342de79,7.0,[],{},0
