# Analysis for calculating score for Alerts


## Getting Data

This section loads sample data for 2015-05-21 records of activities, rules and actors.

In [46]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
import math
import time
from datetime import datetime
from time import gmtime, strftime

### Activities

In [3]:
activities_cols_names = ["id","body", "country", "country_code", "place_type",
                       "sub_region", "actor_id", "source", "share_count",
                       "in_reply_to_native_id", "created_at", "updated_at",
                       "klout", "native_id", "verb", "latitude", "longitude",
                       "sharing_activity_native_id", "region", "favorites_count",
                       "replies_count", "in_reply_to_screen_name", "link"]

In [4]:
activities_relevant_cols = ["actor_id", "source", "share_count",
                           "in_reply_to_native_id", "klout", "verb", "favorites_count",
                           "replies_count"]

In [7]:
activities = pd.read_csv('../../s3/2015-05-21-01-00-00-activities.csv', 
                         header=None, parse_dates=True,
                         names=activities_cols_names, index_col="id")

In [8]:
activities = activities[activities_relevant_cols] 

In [9]:
activities.head()

Unnamed: 0_level_0,actor_id,source,share_count,in_reply_to_native_id,klout,verb,favorites_count,replies_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
443640355408254904,2419819735,twitter,0,,48,post,0,0
443640355408254905,2405587887,twitter,0,,54,post,0,0
443640355408254907,2425857238,twitter,0,,31,post,0,0
443640355408254908,2422152815,twitter,9,,19,share,14,0
443640355408254909,2406991214,twitter,0,,35,post,0,0


In [10]:
activities.describe()

Unnamed: 0,share_count,klout,favorites_count
count,1280768.0,1255925.0,1280865.0
mean,1878510000000.0,48364120000000.0,3338.222079
std,1062963000000000.0,5392960000000000.0,13146.340361
min,0.0,10.0,0.0
25%,1.0,29.0,0.0
50%,60.0,40.0,54.0
75%,709.0,45.0,782.0
max,6.015506e+17,6.015513e+17,462900.0


### Association Activities-Rules

In [11]:
activities_rules_cols_names = ["id", "rule_id", "activity_id", "created_at", "updated_at", "ignored"]

In [12]:
activities_rules = pd.read_csv('../../s3/2015-05-21-01-00-00-activities-rules.csv', 
                               parse_dates=True, names=activities_rules_cols_names, index_col="id")

In [13]:
activities_rules_relevant_cols = ["rule_id", "activity_id", "ignored"]

In [14]:
activities_rules = activities_rules[activities_rules_relevant_cols]

In [15]:
activities_rules.head()

Unnamed: 0_level_0,rule_id,activity_id,ignored
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
108642970,5772,4.436404e+17,0
108642971,5052,4.436404e+17,0
108642972,5428,4.436404e+17,0
108642973,1506,4.436404e+17,0
108642974,5077,4.436404e+17,0


### Rules

In [16]:
rules = pd.read_csv('../../s3/rule.csv', index_col="id")

In [17]:
rules_relevant_cols = ["business_id","segment","volume", "category", "source", "type", "direct"]

In [18]:
rules = rules[rules_relevant_cols]

In [19]:
rules.head()

Unnamed: 0_level_0,business_id,segment,volume,category,source,type,direct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,144,1,0,2,twitter,TwitterRule,False
2,144,1,0,2,twitter,TwitterRule,False
3,144,1,0,2,twitter,TwitterRule,False
4,144,1,0,2,twitter,TwitterRule,False
5,144,1,0,2,twitter,TwitterRule,False


In [20]:
rules.describe()

Unnamed: 0,business_id,segment,volume,category,direct
count,4786.0,4786.0,4464.0,4786.0,4786
mean,314.314668,0.376097,232681.2,3.243627,0.0215211
std,125.258811,0.484455,8564815.0,1.819087,0.1451287
min,1.0,0.0,0.0,0.0,False
25%,262.0,0.0,16.0,2.0,0
50%,354.0,0.0,297.0,3.0,0
75%,412.0,1.0,3140.25,5.0,0
max,460.0,1.0,475071400.0,6.0,True


### Actors

In [21]:
actors = pd.read_csv('../../s3/actors.csv', index_col="id")

In [22]:
actors.head()

Unnamed: 0_level_0,lang,favourites_count,statuses_count,friends_count,followers_count,listed_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,en,,16636,1450,2984845,24425
767,en,24198.0,69163,2983,90600,4884
785,en,,23948,846,14979,631
1952,en,94.0,41700,998,4043,226
2172,en,76.0,53560,2033,582423,12127


In [23]:
actors.describe()

Unnamed: 0,favourites_count,statuses_count,friends_count,followers_count,listed_count
count,72205.0,734833.0,734834.0,734834.0,734611.0
mean,2063.706627,17198.639341,989.087618,3778.709121,32.09336
std,6821.23179,34693.571539,6001.745585,155812.787824,1135.065026
min,0.0,0.0,-438.0,0.0,0.0
25%,36.0,1385.0,173.0,133.0,0.0
50%,343.0,5884.0,358.0,343.0,2.0
75%,1703.0,18786.0,779.0,853.0,7.0
max,492244.0,2051862.0,1593446.0,64199466.0,821163.0


### Alerts

In [24]:
alerts = pd.read_csv('../../s3/alerts.csv', index_col="id")

In [25]:
alerts.head()

Unnamed: 0_level_0,business_id,options,created_at,updated_at,alert_type_id,type,rule_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
173559,397,"{""klout_topics""=>""Baseball, College Baseball, ...",2015-05-21 00:00:15 -0400,2015-05-21 00:00:15 -0400,129,,3454
173560,435,"{""klout_topics""=>""Sacramento, Twitter, San Jos...",2015-05-21 00:04:30 -0400,2015-05-21 00:04:30 -0400,129,,4561
173561,439,"{""klout_topics""=>""Architecture, Interior Desig...",2015-05-21 00:09:26 -0400,2015-05-21 00:09:26 -0400,129,,5183
173562,421,"{""hashtags""=>[""usocwithbdpc""], ""topic""=>""\""son...",2015-05-21 00:12:43 -0400,2015-05-21 00:12:43 -0400,118,HashtagAlert,4268
173563,369,"{""mean""=>""28.6667"", ""topic""=>""@pgatour"", ""vali...",2015-05-21 00:12:44 -0400,2015-05-21 00:12:44 -0400,142,PopularTopicAlert,2872


In [26]:
alerts.describe()

Unnamed: 0,business_id,alert_type_id,rule_id
count,620.0,620.0,620.0
mean,307.483871,133.143548,3257.156452
std,155.33353,6.910251,1998.946054
min,1.0,113.0,20.0
25%,224.0,129.0,1450.75
50%,369.0,129.0,3616.0
75%,443.0,141.0,5269.0
max,453.0,146.0,5897.0


### Activities for URLs

In [102]:
activities_urls = pd.read_csv('../../s3/activities_urls.csv', index_col="id")
activities_urls = activities_urls[activities_relevant_cols +["created_at","activities_count","expanded_url"]]
activities_urls = activities_urls.sort("activities_count", ascending=False)
activities_urls['created_at'] = pd.to_datetime(activities_urls['created_at'], format="%Y-%m-%d %H:%M:%S")

In [103]:
activities_urls.head()

Unnamed: 0_level_0,actor_id,source,share_count,in_reply_to_native_id,klout,verb,favorites_count,replies_count,created_at,activities_count,expanded_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
443640355408832882,2424761055,twitter,671,,17,share,468,0,2015-05-21 09:43:58,701,http://twitter.com/footy_jokes/status/60133160...
443640355408700115,2424174321,twitter,400,,25,share,246,0,2015-05-21 07:02:32,701,http://twitter.com/footy_jokes/status/60133160...
443640355408701789,2424330443,twitter,407,,31,share,251,0,2015-05-21 07:04:55,701,http://twitter.com/footy_jokes/status/60133160...
443640355408701608,2408455207,twitter,406,,34,share,251,0,2015-05-21 07:04:38,701,http://twitter.com/footy_jokes/status/60133160...
443640355408701432,2426154688,twitter,405,,29,share,249,0,2015-05-21 07:04:24,701,http://twitter.com/footy_jokes/status/60133160...


In [104]:
activities_urls.describe()

Unnamed: 0,actor_id,share_count,in_reply_to_native_id,klout,favorites_count,replies_count,activities_count
count,1909.0,1909.0,0.0,1893.0,1909.0,1909.0,1909.0
mean,2178678000.0,259.558931,,34.38299,189.161865,0.02043,432.473023
std,670626800.0,228.450609,,11.030357,166.03546,0.523155,233.626166
min,1819261.0,0.0,,10.0,0.0,0.0,12.0
25%,2407298000.0,79.0,,27.0,35.0,0.0,194.0
50%,2415558000.0,191.0,,34.0,159.0,0.0,446.0
75%,2423687000.0,390.0,,42.0,311.0,0.0,701.0
max,2426304000.0,975.0,,90.0,615.0,21.0,701.0


## Computing Scores

This section implements the functions for computing and the scores and performs some tests for subsequent validation.

### Alert type - Influencer


- Legend:
    - k = klout
    - f = followers
    - l = listed count
    - v = verb
    - kw = klout weight
    - fw = followers count weight
    - lw = listed count weight

- Values:
    - kw = 30
    - fw = 40
    - lw = 30
    - v = (tweet=1, retweet=0.9)

**influencer_score** = (2-v) \* kw \* (k^2) / 10000 +
                        v \* fw \* log(f) / 20 +
                        v \* lw \* log(l) / 15


In [32]:
def generate_influencer_score(activity_id, k=None, f=None, l=None, verb="post"):
    # Weights for klout, followers and lists
    kw = 30.0
    fw = 40.0
    lw = 30.0
    
    # Default behavior when activity_id is provided
    if activity_id is not None:
        activity = activities.loc[str(activity_id)]
        actor = actors.loc[int(activity.actor_id)]
        if not activity.klout.is_integer():
            k = 0.0
    
        # Obtain values from activity unless parameterized
        if k is None: k = float(activity.klout)
        if f is None: f = float(actor.followers_count + 1.0)
        if l is None: l = float(actor.listed_count + 1.0)
    
    # Coefficient for tweet(verb=post) or retweet(verb=share)
    v = 1.0 if verb=="post" else 0.9
    
    # Score calculation and return
    return int((2-v)*kw * float(k ** 2) / 10000 + \
                v * fw * math.log(f) / 20 + \
                v * lw * math.log(l) / 15)

In [33]:
for index, row in activities.head(30).iterrows():
    print "Generated score for activity {0}: {1}".format(index,generate_influencer_score(index))

Generated score for activity 443640355408254904: 29
Generated score for activity 443640355408254905: 22
Generated score for activity 443640355408254907: 20
Generated score for activity 443640355408254908: 7
Generated score for activity 443640355408254909: 18
Generated score for activity 443640355408254910: 14
Generated score for activity 443640355408254912: 28
Generated score for activity 443640355408254913: 7
Generated score for activity 443640355408254914: 30
Generated score for activity 443640355408254916: 23
Generated score for activity 443640355408254917: 12
Generated score for activity 443640355408254920: 30
Generated score for activity 443640355408254922: 16
Generated score for activity 443640355408254923: 22
Generated score for activity 443640355408254926: 12
Generated score for activity 443640355408254927: 47
Generated score for activity 443640355408254928: 16
Generated score for activity 443640355408254934: 57
Generated score for activity 443640355408254915: 42
Generated scor

Now testing with predefined values for evaluating limits:

In [34]:
generate_influencer_score(activity_id=None, k=99, f=50000000, l=50000000, verb='post')

100

In [35]:
generate_influencer_score(activity_id=None, k=90, f=50000000, l=50000000, verb='post')

95

In [36]:
generate_influencer_score(activity_id=None, k=99, f=50000000, l=50000000, verb='share')

96

In [37]:
generate_influencer_score(activity_id=None, k=90, f=50000000, l=50000000, verb='share')

90

In [38]:
generate_influencer_score(activity_id=None, k=70, f=5000000, l=5000000, verb='post')

76

In [39]:
generate_influencer_score(activity_id=None, k=70, f=5000000, l=5000000, verb='share')

71

In [40]:
generate_influencer_score(activity_id=None, k=50, f=5000000, l=5000000, verb='post')

69

In [41]:
generate_influencer_score(activity_id=None, k=50, f=5000000, l=5000000, verb='share')

63

In [42]:
generate_influencer_score(activity_id=None, k=50, f=1000000, l=1000000, verb='post')

62

In [43]:
generate_influencer_score(activity_id=None, k=50, f=1000000, l=1000000, verb='share')

57

### Alert type - Trends
* Reach of tweets
* segment: marketing (categories: range from 1 to 10) vs business(15% more important than Marketing)

**trend_score** = reach + segment

* Notes: Think about Potential, Virality




In [108]:
url_param = "http://twitter.com/footy_jokes/status/601331607371390976/photo/1"

time_list = activities_urls[activities_urls["expanded_url"]==url_param]["created_at"].tolist()

# print time_list

# time_list = map(lambda s: time.strptime(s, "%Y-%m-%d %H:%M:%S"), time_list)

numeric_time_list = map(lambda t: float(time.strftime("%H", t)) + float(time.strftime("%M", t))/60, time_list)
hour_list = map(lambda t: int(time.strftime("%H", t)), time_list)

plt.hist(numeric_time_list, 100)
mpl.rc("figure", figsize=(16, 4))
plt.show()
# sns.set_palette("hls")
# mpl.rc("figure", figsize=(8, 4))
# sns.distplot(hour_list);

TypeError: argument must be 9-item sequence, not Timestamp

### Alert type - Popular Tweet
* Engagement of tweet
* Potential of engagement

**popular_score** = engagement + potential


In [None]:
# TODO