# Analysis for Predictive Post

## Getting and Preparing Data

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt 
import math

## Loading Data

In [55]:
time_series_cols = map(lambda c: "C"+str(c), range(1,110))
cols_names = ["native_id"] + time_series_cols

dtype = {}
for c in time_series_cols:
    dtype[c] = np.int32

time_series_retweets = pd.read_csv('data/time_series_popular_tweets_all_crop_100.csv', 
                                   dtype=dtype, header=None, names=cols_names)

In [56]:
nrows = len(time_series_retweets)

# Calculate mean values for each activity
mean_col = pd.Series(map(lambda i: (time_series_retweets.T)[i].values[1:].mean(), range(nrows)), index=range(nrows))

# Calculate total retweets for each activity
total_col = pd.Series(map(lambda i: sum((time_series_retweets.T)[i].values[1:]), range(nrows)), index=range(nrows))

time_series_retweets["mean"]  = mean_col
time_series_retweets["total"] = total_col

time_series_retweets.head()

Unnamed: 0,native_id,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C102,C103,C104,C105,C106,C107,C108,C109,mean,total
0,613126094502383616,1,3,1,8,6,13,7,7,5,...,2,3,0,4,2,1,2,1,4.027523,439
1,613078560258027520,1,3,7,9,5,8,4,7,11,...,5,5,1,5,5,4,5,1,4.633028,505
2,613051857972412417,1,3,5,6,6,12,2,12,10,...,5,1,5,1,2,3,3,2,5.082569,554
3,613405785415122944,1,1,0,0,1,0,0,0,2,...,12,11,13,15,19,14,7,8,4.862385,530
4,613713695844270081,2,10,11,12,16,21,15,11,9,...,1,2,3,3,2,1,5,2,4.311927,470


In [57]:
# Normalize values (v/mean)
for i in range(nrows):
    line = time_series_retweets.loc[i].values
    mu = line[-2]
    tt = line[-1]
    time_series_retweets.loc[i] = [line[0]] + map(lambda n: round(float(n)/mu,3), line[1:-2]) + [mu,tt]
    
time_series_retweets.head()

Unnamed: 0,native_id,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C102,C103,C104,C105,C106,C107,C108,C109,mean,total
0,6.131261e+17,0.248,0.745,0.248,1.986,1.49,3.228,1.738,1.738,1.241,...,0.497,0.745,0.0,0.993,0.497,0.248,0.497,0.248,4.027523,439
1,6.130786e+17,0.216,0.648,1.511,1.943,1.079,1.727,0.863,1.511,2.374,...,1.079,1.079,0.216,1.079,1.079,0.863,1.079,0.216,4.633028,505
2,6.130519e+17,0.197,0.59,0.984,1.181,1.181,2.361,0.394,2.361,1.968,...,0.984,0.197,0.984,0.197,0.394,0.59,0.59,0.394,5.082569,554
3,6.134058e+17,0.206,0.206,0.0,0.0,0.206,0.0,0.0,0.0,0.411,...,2.468,2.262,2.674,3.085,3.908,2.879,1.44,1.645,4.862385,530
4,6.137137e+17,0.464,2.319,2.551,2.783,3.711,4.87,3.479,2.551,2.087,...,0.232,0.464,0.696,0.696,0.464,0.232,1.16,0.464,4.311927,470


## Prepare for training

In [83]:
X = np.array(time_series_retweets.drop(["native_id","C52"], axis=1))
y = time_series_retweets.C52.values

# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

## Training the model

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn import metrics

### Train with Linear Regression

In [85]:
print "Training with Linear Regression"

model = LinearRegression()

scores = cross_val_score(model, X, y, scoring="mean_squared_error", cv=5)

print "CV scores: " + str(scores)
print "MSE: " + str(scores.mean())

Training with Linear Regression
CV scores: [-60.76069761 -19.9617242  -26.24870192 -63.17446096 -13.59748703]
MSE: -36.7486143418


### Train with KNeighborsRegressor

In [86]:
print "Training with KNN regressor"

model = KNeighborsRegressor()

k_range = range(1,11)
param_dist = dict(n_neighbors=k_range)

grid = GridSearchCV(model, param_dist, cv=10, scoring="mean_squared_error")
grid.fit(X, y)
grid.grid_scores_

Training with KNN regressor


[mean: -32.04425, std: 21.98382, params: {'n_neighbors': 1},
 mean: -29.17810, std: 16.59492, params: {'n_neighbors': 2},
 mean: -26.52409, std: 14.45605, params: {'n_neighbors': 3},
 mean: -27.59015, std: 15.65082, params: {'n_neighbors': 4},
 mean: -26.87575, std: 16.93989, params: {'n_neighbors': 5},
 mean: -29.06723, std: 20.34946, params: {'n_neighbors': 6},
 mean: -29.90275, std: 22.01556, params: {'n_neighbors': 7},
 mean: -31.16074, std: 22.40635, params: {'n_neighbors': 8},
 mean: -30.73653, std: 20.89489, params: {'n_neighbors': 9},
 mean: -31.23279, std: 21.74715, params: {'n_neighbors': 10}]

### Train with SVR

In [87]:
print "Training with SVR"

model = SVR()

epsilon_range = [0.1, 0.2, 0.3]
degree_range  = [1, 2, 3, 4, 5]
gamma_range   = [0.0, 0.1, 0.2]

param_dist = dict(epsilon=epsilon_range, degree=degree_range, gamma=gamma_range)

grid = GridSearchCV(model, param_dist, cv=10, scoring="mean_squared_error")
grid.fit(X, y)
grid.grid_scores_

Training with SVR


[mean: -71.04571, std: 37.42715, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 1},
 mean: -73.95263, std: 37.09928, params: {'epsilon': 0.1, 'gamma': 0.1, 'degree': 1},
 mean: -74.46928, std: 36.96297, params: {'epsilon': 0.1, 'gamma': 0.2, 'degree': 1},
 mean: -71.01866, std: 37.43880, params: {'epsilon': 0.2, 'gamma': 0.0, 'degree': 1},
 mean: -73.88824, std: 37.12427, params: {'epsilon': 0.2, 'gamma': 0.1, 'degree': 1},
 mean: -74.40376, std: 36.98997, params: {'epsilon': 0.2, 'gamma': 0.2, 'degree': 1},
 mean: -70.96863, std: 37.44361, params: {'epsilon': 0.3, 'gamma': 0.0, 'degree': 1},
 mean: -73.81701, std: 37.12309, params: {'epsilon': 0.3, 'gamma': 0.1, 'degree': 1},
 mean: -74.33231, std: 36.99027, params: {'epsilon': 0.3, 'gamma': 0.2, 'degree': 1},
 mean: -71.04571, std: 37.42715, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 2},
 mean: -73.95263, std: 37.09928, params: {'epsilon': 0.1, 'gamma': 0.1, 'degree': 2},
 mean: -74.46928, std: 36.96297, params: {'epsilon': 0

### Train with AdaBoostRegressor

In [88]:
print "Training with AdaBoostRegressor"

model = AdaBoostRegressor()

n_estimators = range(25,70)

param_dist = dict(n_estimators=n_estimators)

grid = GridSearchCV(model, param_dist, cv=10, scoring="mean_squared_error")
grid.fit(X, y)
grid.grid_scores_

Training with AdaBoostRegressor


[mean: -11.08758, std: 5.35349, params: {'n_estimators': 25},
 mean: -11.06453, std: 4.29922, params: {'n_estimators': 26},
 mean: -11.11246, std: 4.14822, params: {'n_estimators': 27},
 mean: -11.05079, std: 5.16545, params: {'n_estimators': 28},
 mean: -11.65714, std: 4.65338, params: {'n_estimators': 29},
 mean: -11.26937, std: 6.39830, params: {'n_estimators': 30},
 mean: -11.44757, std: 5.28737, params: {'n_estimators': 31},
 mean: -11.16917, std: 5.23394, params: {'n_estimators': 32},
 mean: -10.97287, std: 3.77929, params: {'n_estimators': 33},
 mean: -11.01235, std: 4.82901, params: {'n_estimators': 34},
 mean: -11.37493, std: 4.07290, params: {'n_estimators': 35},
 mean: -11.53272, std: 4.94420, params: {'n_estimators': 36},
 mean: -11.37836, std: 5.00200, params: {'n_estimators': 37},
 mean: -11.16357, std: 3.62453, params: {'n_estimators': 38},
 mean: -10.61961, std: 5.01775, params: {'n_estimators': 39},
 mean: -11.10091, std: 4.04944, params: {'n_estimators': 40},
 mean: -