# Analysis for calculating score for Alerts

**y_influencer** = 3\*((activity.klout^2)/10000) + 4\*(log(actor.followers_count)/20) + log(actor.listed_count) + verb(post=1, share=0.7)

**y_trend** = reach + segment

**y_popular** = engagement + potential

### Alert type - Influencer
* klout (exp)
* followers (log)
* listed count (log)
* Rule volume (inverse weight for previous variables)
* verb: share/retweet (0.7), post/tweet (1)
* (+) previous_alert: there has already been another alert about this same influencer
* (++) relev_direct_reply + relev_3rd_reply: relevance of a reply addressed to the brand, or of a reply from a third party mentioning the brand

### Alert type - Trends
* Reach of the tweets
* segment: marketing (categories: accepts only a number from 1 to 10) vs. business (15% more important than marketing)

### Alert type - Popular Tweet
* Engagement the tweet has received
* Engagement potential


## Getting Data

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime

### Activities

In [6]:
# Labels for every column of the activities table, in positional order.
# They are handed to pd.read_csv through `names=` when the CSV is loaded.
activities_cols_names = [
    "id",
    "body",
    "country",
    "country_code",
    "place_type",
    "sub_region",
    "actor_id",
    "source",
    "share_count",
    "in_reply_to_native_id",
    "created_at",
    "updated_at",
    "klout",
    "native_id",
    "verb",
    "latitude",
    "longitude",
    "sharing_activity_native_id",
    "region",
    "favorites_count",
    "replies_count",
    "in_reply_to_screen_name",
    "link",
]

In [8]:
# Subset of activity columns kept for the analysis; everything else is
# dropped when this list is used to index the DataFrame.
activities_relevant_cols = [
    "actor_id",
    "source",
    "share_count",
    "in_reply_to_native_id",
    "klout",
    "verb",
    "favorites_count",
    "replies_count",
]

In [11]:
# Load the activities dump into a DataFrame indexed by the "id" column.
activities = pd.read_csv(
    '../../s3/2015-05-21-01-00-00-activities.csv',
    # No row of the file is used as a header; labels come from `names`.
    names=activities_cols_names,
    header=None,
    index_col="id",
    # parse_dates=True asks pandas to try parsing the *index* as dates.
    # NOTE(review): the index here is "id"; "created_at"/"updated_at" are
    # not targeted by this flag -- confirm whether that was the intent.
    parse_dates=True,
)

In [12]:
# Keep every row but only the columns listed in activities_relevant_cols.
activities = activities.loc[:, activities_relevant_cols]

In [63]:
# Preview the first rows of the trimmed activities table.
activities.head()

Unnamed: 0_level_0,actor_id,source,share_count,in_reply_to_native_id,klout,verb,favorites_count,replies_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
443640355408254904,2419819735,twitter,0,,48,post,0,0
443640355408254905,2405587887,twitter,0,,54,post,0,0
443640355408254907,2425857238,twitter,0,,31,post,0,0
443640355408254908,2422152815,twitter,9,,19,share,14,0
443640355408254909,2406991214,twitter,0,,35,post,0,0


In [14]:
# Summary statistics for the columns pandas treats as numeric.
# NOTE(review): in the recorded output, share_count and klout have a max of
# ~6.0e17 and means of ~1e12-1e13, which look like ids rather than counts or
# scores -- presumably some rows are column-shifted; verify the CSV parsing.
# NOTE(review): replies_count does not appear in the output, which suggests
# it was not parsed as a numeric column -- confirm its dtype.
activities.describe()

Unnamed: 0,share_count,klout,favorites_count
count,1280768.0,1255925.0,1280865.0
mean,1878510000000.0,48364120000000.0,3338.222079
std,1062963000000000.0,5392960000000000.0,13146.340361
min,0.0,10.0,0.0
25%,1.0,29.0,0.0
50%,60.0,40.0,54.0
75%,709.0,45.0,782.0
max,6.015506e+17,6.015513e+17,462900.0


### Association Activities-Rules

In [15]:
# Labels for the columns of the activities<->rules association table,
# supplied to pd.read_csv through `names=`.
activities_rules_cols_names = [
    "id",
    "rule_id",
    "activity_id",
    "created_at",
    "updated_at",
    "ignored",
]

In [17]:
# Load the activity/rule association dump, indexed by its own "id" column.
activities_rules = pd.read_csv(
    '../../s3/2015-05-21-01-00-00-activities-rules.csv',
    # Passing `names` without `header` makes pandas treat the file as
    # having no header row (documented read_csv behaviour).
    names=activities_rules_cols_names,
    index_col="id",
    # parse_dates=True only asks pandas to try parsing the index as dates.
    # NOTE(review): the index is "id", so this is probably not what was
    # intended for "created_at"/"updated_at" -- confirm.
    parse_dates=True,
)
# NOTE(review): the recorded head() output shows activity_id as 4.436404e+17,
# i.e. a float column. float64 cannot represent 18-digit ids exactly, so
# verify this before joining activity_id against the activities index.

In [18]:
# Association-table columns retained for the analysis.
activities_rules_relevant_cols = [
    "rule_id",
    "activity_id",
    "ignored",
]

In [19]:
# Keep every row but only the columns in activities_rules_relevant_cols.
activities_rules = activities_rules.loc[:, activities_rules_relevant_cols]

In [20]:
# Preview the first rows of the trimmed association table.
# NOTE(review): activity_id is displayed in float notation in the recorded
# output -- check that the ids keep full precision.
activities_rules.head()

Unnamed: 0_level_0,rule_id,activity_id,ignored
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
108642970,5772,4.436404e+17,0
108642971,5052,4.436404e+17,0
108642972,5428,4.436404e+17,0
108642973,1506,4.436404e+17,0
108642974,5077,4.436404e+17,0


### Rules

In [22]:
# Load the rules table; the column labels are taken from the file itself
# (no `names`/`header` override) and "id" becomes the index.
rules = pd.read_csv(
    '../../s3/rule.csv',
    index_col="id",
)

In [23]:
# Rule columns retained for the analysis.
rules_relevant_cols = [
    "business_id",
    "segment",
    "volume",
    "category",
    "source",
    "type",
    "direct",
]

In [24]:
# Keep every row but only the columns listed in rules_relevant_cols.
rules = rules.loc[:, rules_relevant_cols]

In [25]:
# Preview the first rows of the trimmed rules table.
rules.head()

Unnamed: 0_level_0,business_id,segment,volume,category,source,type,direct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,144,1,0,2,twitter,TwitterRule,False
2,144,1,0,2,twitter,TwitterRule,False
3,144,1,0,2,twitter,TwitterRule,False
4,144,1,0,2,twitter,TwitterRule,False
5,144,1,0,2,twitter,TwitterRule,False


In [26]:
# Summary statistics for the rules table.
# NOTE(review): in the recorded output, volume has a count of 4464 against
# 4786 for the other columns, so some rules have no volume value; its max
# (~4.75e8) is also far above the 75th percentile (~3140) -- keep this in
# mind if volume is used as a weight.
rules.describe()

Unnamed: 0,business_id,segment,volume,category,direct
count,4786.0,4786.0,4464.0,4786.0,4786
mean,314.314668,0.376097,232681.2,3.243627,0.0215211
std,125.258811,0.484455,8564815.0,1.819087,0.1451287
min,1.0,0.0,0.0,0.0,False
25%,262.0,0.0,16.0,2.0,0
50%,354.0,0.0,297.0,3.0,0
75%,412.0,1.0,3140.25,5.0,0
max,460.0,1.0,475071400.0,6.0,True


### Actors

In [80]:
# Load the actors table; the column labels are taken from the file itself
# (no `names`/`header` override) and "id" becomes the index.
actors = pd.read_csv(
    '../../s3/actors.csv',
    index_col="id",
)

In [81]:
# Preview the first rows of the actors table.
actors.head()

Unnamed: 0_level_0,lang,favourites_count,statuses_count,friends_count,followers_count,listed_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2419819735,en,,16139,761,1065,71
2405587887,en,,3043,1368,140,5
2425857238,en,,15668,7,545,12
2422152815,en,,171,57,30,0
2406991214,en,,1341,306,190,7


In [82]:
# Summary statistics for the numeric actor columns.
# NOTE(review): in the recorded output, followers_count and listed_count have
# a minimum of 0 (log() of these is undefined), friends_count has a negative
# minimum (-438), and favourites_count is populated for only ~146k of ~1.28M
# rows -- verify how these cases should be handled before scoring.
actors.describe()

Unnamed: 0,favourites_count,statuses_count,friends_count,followers_count,listed_count
count,146204.0,1278952.0,1278953.0,1278953.0,1278650.0
mean,2288.747558,29485.832285,1787.903539,17574.086707,147.851316
std,8553.651691,63277.951238,11217.130245,358911.933053,2558.850142
min,0.0,0.0,-438.0,0.0,0.0
25%,15.0,2208.0,170.0,157.0,1.0
50%,225.0,9225.5,396.0,426.0,3.0
75%,1488.0,29864.0,1012.0,1242.0,14.0
max,492244.0,2051862.0,1593447.0,64199466.0,821163.0
