# Analysis for Predictive Post

## Getting and Preparing Data

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np

## Loading Data

In [109]:
time_series_cols = map(lambda c: "C"+str(c), range(1,110))
cols_names = ["native_id"] + time_series_cols

dtype = {}
for c in time_series_cols:
    dtype[c] = np.int32

time_series_retweets = pd.read_csv('data/time_series_popular_tweets_all_crop_100.csv', 
                                   dtype=dtype, header=None, names=cols_names)

In [110]:
nrows = len(time_series_retweets)

# Calculate mean values for each activity
mean_col = pd.Series(map(lambda i: (time_series_retweets.T)[i].values[1:].mean(), range(nrows)), index=range(nrows))

# Calculate median values for each activity
median_col = pd.Series(map(lambda i: np.median((time_series_retweets.T)[i].values[1:]), range(nrows)), index=range(nrows))

# Calculate min values for each activity
min_col = pd.Series(map(lambda i: (time_series_retweets.T)[i].values[1:].min(), range(nrows)), index=range(nrows))

# Calculate max values for each activity
max_col = pd.Series(map(lambda i: (time_series_retweets.T)[i].values[1:].max(), range(nrows)), index=range(nrows))

# Calculate total retweets for each activity
total_col = pd.Series(map(lambda i: sum((time_series_retweets.T)[i].values[1:]), range(nrows)), index=range(nrows))

time_series_retweets["mean"]   = mean_col
time_series_retweets["median"] = median_col
time_series_retweets["min"]    = min_col
time_series_retweets["max"]    = max_col
time_series_retweets["total"]  = total_col

time_series_retweets.head()

Unnamed: 0,native_id,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C105,C106,C107,C108,C109,mean,median,min,max,total
0,613126094502383616,1,3,1,8,6,13,7,7,5,...,4,2,1,2,1,4.027523,4,0,13,439
1,613078560258027520,1,3,7,9,5,8,4,7,11,...,5,5,4,5,1,4.633028,4,0,15,505
2,613051857972412417,1,3,5,6,6,12,2,12,10,...,1,2,3,3,2,5.082569,5,0,15,554
3,613405785415122944,1,1,0,0,1,0,0,0,2,...,15,19,14,7,8,4.862385,4,0,19,530
4,613713695844270081,2,10,11,12,16,21,15,11,9,...,3,2,1,5,2,4.311927,3,0,21,470


In [117]:
# Normalize values (v/mean)
for i in range(nrows):
    line = time_series_retweets.loc[i].values
    mu = line[-5]
    time_series_retweets.loc[i] = [line[0]] + map(lambda n: round(float(n)/mu,3), line[1:-5]) + list(line[-5:])
    
time_series_retweets.head()

Unnamed: 0,native_id,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C105,C106,C107,C108,C109,mean,median,min,max,total
0,6.131261e+17,0.248,0.745,0.248,1.986,1.49,3.228,1.738,1.738,1.241,...,0.993,0.497,0.248,0.497,0.248,4.027523,4,0,13,439
1,6.130786e+17,0.216,0.648,1.511,1.943,1.079,1.727,0.863,1.511,2.374,...,1.079,1.079,0.863,1.079,0.216,4.633028,4,0,15,505
2,6.130519e+17,0.197,0.59,0.984,1.181,1.181,2.361,0.394,2.361,1.968,...,0.197,0.394,0.59,0.59,0.394,5.082569,5,0,15,554
3,6.134058e+17,0.206,0.206,0.0,0.0,0.206,0.0,0.0,0.0,0.411,...,3.085,3.908,2.879,1.44,1.645,4.862385,4,0,19,530
4,6.137137e+17,0.464,2.319,2.551,2.783,3.711,4.87,3.479,2.551,2.087,...,0.696,0.464,0.232,1.16,0.464,4.311927,3,0,21,470


## Training the model

In [126]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

### Train with Linear Regression

In [130]:
def training_with_linear_regression(X, y):
    print "Training with Linear Regression"

    model = LinearRegression()

    scores = cross_val_score(model, X, y, scoring="mean_squared_error", cv=5)

    print "CV scores: " + str(scores)
    print "MSE: " + str(scores.mean())

### Train with KNeighborsRegressor

In [149]:
def training_with_knn_regressor(X, y):
    print "Training with KNN regressor"

    model = KNeighborsRegressor()

    k_range = range(1,11)
    param_dist = dict(n_neighbors=k_range)

    grid = GridSearchCV(model, param_dist, cv=10, scoring="mean_squared_error")
    grid.fit(X, y)
    print grid.grid_scores_

### Train with SVR

In [139]:
def training_with_svr(X, y):
    print "Training with SVR"

    model = SVR()

    epsilon_range = [0.1, 0.2, 0.3]
    degree_range  = [1, 2, 3, 4, 5]
    gamma_range   = [0.0, 0.1, 0.2]

    param_dist = dict(epsilon=epsilon_range, degree=degree_range, gamma=gamma_range)

    grid = GridSearchCV(model, param_dist, cv=10, scoring="mean_squared_error")
    grid.fit(X, y)
    print grid.grid_scores_

### Train with AdaBoostRegressor

In [156]:
def training_with_ada_boost_regressor(X, y):
    print "Training with AdaBoostRegressor"

    model = AdaBoostRegressor()

    n_estimators = range(25,70)

    param_dist = dict(n_estimators=n_estimators)

    grid = GridSearchCV(model, param_dist, cv=10, scoring="mean_squared_error")
    grid.fit(X, y)
    print grid.grid_scores_

## Training

In [141]:
X = np.array(time_series_retweets.drop(["native_id","total"], axis=1))
y = time_series_retweets.total.values

In [142]:
training_with_linear_regression(X, y)

Training with Linear Regression
CV scores: [ -7.69055574e-23  -1.75987121e-23  -5.94949880e-23  -2.17632057e-23
  -9.49859125e-22]
MSE: -2.25124317556e-22


In [150]:
training_with_knn_regressor(X, y)

Training with KNN regressor
[mean: -198320.23451, std: 449818.29551, params: {'n_neighbors': 1}, mean: -179563.43584, std: 340028.57614, params: {'n_neighbors': 2}, mean: -193960.54376, std: 317400.38264, params: {'n_neighbors': 3}, mean: -174640.32329, std: 207882.40001, params: {'n_neighbors': 4}, mean: -178513.77239, std: 159112.99719, params: {'n_neighbors': 5}, mean: -207251.21976, std: 152961.32548, params: {'n_neighbors': 6}, mean: -228326.62579, std: 170781.40815, params: {'n_neighbors': 7}, mean: -233700.30973, std: 174302.65439, params: {'n_neighbors': 8}, mean: -218900.29346, std: 163882.79975, params: {'n_neighbors': 9}, mean: -211174.45111, std: 158795.44844, params: {'n_neighbors': 10}]


In [144]:
training_with_svr(X, y)

Training with SVR
[mean: -1170329.15656, std: 451103.83419, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 1}, mean: -1180339.57175, std: 449108.45176, params: {'epsilon': 0.1, 'gamma': 0.1, 'degree': 1}, mean: -1180424.18535, std: 449135.01826, params: {'epsilon': 0.1, 'gamma': 0.2, 'degree': 1}, mean: -1170333.83545, std: 451102.88999, params: {'epsilon': 0.2, 'gamma': 0.0, 'degree': 1}, mean: -1180334.97000, std: 449109.33649, params: {'epsilon': 0.2, 'gamma': 0.1, 'degree': 1}, mean: -1180419.58401, std: 449135.90275, params: {'epsilon': 0.2, 'gamma': 0.2, 'degree': 1}, mean: -1170344.56368, std: 451100.11027, params: {'epsilon': 0.3, 'gamma': 0.0, 'degree': 1}, mean: -1180330.37028, std: 449110.22123, params: {'epsilon': 0.3, 'gamma': 0.1, 'degree': 1}, mean: -1180414.98471, std: 449136.78727, params: {'epsilon': 0.3, 'gamma': 0.2, 'degree': 1}, mean: -1170329.15656, std: 451103.83419, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 2}, mean: -1180339.57175, std: 449108.45176, 

In [145]:
training_with_ada_boost_regressor(X, y)

Training with AdaBoostRegressor
[mean: -15890.54470, std: 15262.15439, params: {'n_estimators': 25}, mean: -11432.45110, std: 9587.91179, params: {'n_estimators': 26}, mean: -12473.70644, std: 12613.93825, params: {'n_estimators': 27}, mean: -12447.04501, std: 13229.51147, params: {'n_estimators': 28}, mean: -10783.43291, std: 9851.97170, params: {'n_estimators': 29}, mean: -10505.78441, std: 9318.24537, params: {'n_estimators': 30}, mean: -11333.78772, std: 12281.45446, params: {'n_estimators': 31}, mean: -12646.31969, std: 11409.24819, params: {'n_estimators': 32}, mean: -11989.58817, std: 10505.75707, params: {'n_estimators': 33}, mean: -13546.27399, std: 12329.14532, params: {'n_estimators': 34}, mean: -11668.97840, std: 10875.12573, params: {'n_estimators': 35}, mean: -10773.68125, std: 9084.25871, params: {'n_estimators': 36}, mean: -11979.20184, std: 12313.32915, params: {'n_estimators': 37}, mean: -11362.15249, std: 11006.12938, params: {'n_estimators': 38}, mean: -10458.60426,

In [151]:
# Trying now without "min" and "max"
X = np.array(time_series_retweets.drop(["native_id","min","max","total"], axis=1))

In [152]:
training_with_linear_regression(X, y)

Training with Linear Regression
CV scores: [ -1.26590584e-23  -6.66252090e-24  -2.71851257e-24  -1.08812791e-23
  -1.13210297e-23]
MSE: -8.84848013005e-24


In [153]:
training_with_knn_regressor(X, y)

Training with KNN regressor
[mean: -41768.92920, std: 34008.47504, params: {'n_neighbors': 1}, mean: -41458.89270, std: 34956.70522, params: {'n_neighbors': 2}, mean: -37645.64307, std: 23096.14633, params: {'n_neighbors': 3}, mean: -34407.42423, std: 17201.35248, params: {'n_neighbors': 4}, mean: -34157.00478, std: 19483.42620, params: {'n_neighbors': 5}, mean: -34914.72861, std: 17554.31190, params: {'n_neighbors': 6}, mean: -40054.14791, std: 17954.85667, params: {'n_neighbors': 7}, mean: -40492.99274, std: 20144.30644, params: {'n_neighbors': 8}, mean: -42204.09773, std: 26009.64213, params: {'n_neighbors': 9}, mean: -42869.86221, std: 26647.44285, params: {'n_neighbors': 10}]


In [154]:
training_with_svr(X, y)

Training with SVR
[mean: -1154848.63467, std: 447882.05447, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 1}, mean: -1179873.59959, std: 449217.09352, params: {'epsilon': 0.1, 'gamma': 0.1, 'degree': 1}, mean: -1180365.44368, std: 449148.61866, params: {'epsilon': 0.1, 'gamma': 0.2, 'degree': 1}, mean: -1154840.93200, std: 447884.13235, params: {'epsilon': 0.2, 'gamma': 0.0, 'degree': 1}, mean: -1179869.00389, std: 449217.97659, params: {'epsilon': 0.2, 'gamma': 0.1, 'degree': 1}, mean: -1180360.84288, std: 449149.50275, params: {'epsilon': 0.2, 'gamma': 0.2, 'degree': 1}, mean: -1154830.23844, std: 447887.01895, params: {'epsilon': 0.3, 'gamma': 0.0, 'degree': 1}, mean: -1179864.41023, std: 449218.85967, params: {'epsilon': 0.3, 'gamma': 0.1, 'degree': 1}, mean: -1180356.24411, std: 449150.38685, params: {'epsilon': 0.3, 'gamma': 0.2, 'degree': 1}, mean: -1154848.63467, std: 447882.05447, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 2}, mean: -1179873.59959, std: 449217.09352, 

In [157]:
training_with_ada_boost_regressor(X, y)

Training with AdaBoostRegressor
[mean: -13039.35726, std: 9231.47412, params: {'n_estimators': 25}, mean: -12986.41342, std: 13074.69283, params: {'n_estimators': 26}, mean: -14523.80270, std: 16628.59255, params: {'n_estimators': 27}, mean: -11854.64944, std: 10075.30154, params: {'n_estimators': 28}, mean: -11830.50700, std: 9839.16201, params: {'n_estimators': 29}, mean: -10833.28104, std: 9975.99821, params: {'n_estimators': 30}, mean: -13609.47290, std: 14351.77180, params: {'n_estimators': 31}, mean: -13504.63625, std: 11205.11707, params: {'n_estimators': 32}, mean: -12231.07028, std: 10163.54384, params: {'n_estimators': 33}, mean: -12712.38480, std: 11259.57334, params: {'n_estimators': 34}, mean: -13317.12668, std: 12259.28447, params: {'n_estimators': 35}, mean: -10111.41985, std: 9462.64567, params: {'n_estimators': 36}, mean: -12634.01979, std: 14041.41893, params: {'n_estimators': 37}, mean: -11596.18381, std: 11850.15624, params: {'n_estimators': 38}, mean: -11374.11252,

In [158]:
# Trying now without "min", "max" and "median"
X = np.array(time_series_retweets.drop(["native_id","min","max","median","total"], axis=1))

In [159]:
training_with_linear_regression(X, y)

Training with Linear Regression
CV scores: [ -1.01766756e-23  -4.52576331e-24  -4.55080379e-24  -2.25141401e-23
  -7.80353892e-24]
MSE: -9.91418433043e-24


In [160]:
training_with_knn_regressor(X, y)

Training with KNN regressor
[mean: -38952.84956, std: 19438.69730, params: {'n_neighbors': 1}, mean: -36216.86836, std: 15400.49632, params: {'n_neighbors': 2}, mean: -38352.67945, std: 15415.29173, params: {'n_neighbors': 3}, mean: -37830.94580, std: 14614.43543, params: {'n_neighbors': 4}, mean: -36828.96619, std: 16045.50713, params: {'n_neighbors': 5}, mean: -37271.83800, std: 17749.24075, params: {'n_neighbors': 6}, mean: -38499.50948, std: 18252.41280, params: {'n_neighbors': 7}, mean: -40978.90321, std: 17689.41468, params: {'n_neighbors': 8}, mean: -40977.77685, std: 19438.46690, params: {'n_neighbors': 9}, mean: -42231.81619, std: 23023.85029, params: {'n_neighbors': 10}]


In [161]:
training_with_svr(X, y)

Training with SVR
[mean: -1155095.16256, std: 447952.00079, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 1}, mean: -1179569.88035, std: 449255.03408, params: {'epsilon': 0.1, 'gamma': 0.1, 'degree': 1}, mean: -1180323.19835, std: 449160.05020, params: {'epsilon': 0.1, 'gamma': 0.2, 'degree': 1}, mean: -1155084.52180, std: 447951.56136, params: {'epsilon': 0.2, 'gamma': 0.0, 'degree': 1}, mean: -1179565.28512, std: 449255.91573, params: {'epsilon': 0.2, 'gamma': 0.1, 'degree': 1}, mean: -1180318.59778, std: 449160.93402, params: {'epsilon': 0.2, 'gamma': 0.2, 'degree': 1}, mean: -1155073.88308, std: 447951.12420, params: {'epsilon': 0.3, 'gamma': 0.0, 'degree': 1}, mean: -1179560.69192, std: 449256.79740, params: {'epsilon': 0.3, 'gamma': 0.1, 'degree': 1}, mean: -1180313.99925, std: 449161.81786, params: {'epsilon': 0.3, 'gamma': 0.2, 'degree': 1}, mean: -1155095.16256, std: 447952.00079, params: {'epsilon': 0.1, 'gamma': 0.0, 'degree': 2}, mean: -1179569.88035, std: 449255.03408, 

In [162]:
training_with_ada_boost_regressor(X, y)

Training with AdaBoostRegressor
[mean: -12154.32749, std: 10041.37466, params: {'n_estimators': 25}, mean: -14224.21564, std: 14670.27547, params: {'n_estimators': 26}, mean: -12840.16042, std: 10359.11685, params: {'n_estimators': 27}, mean: -11278.84744, std: 9978.28275, params: {'n_estimators': 28}, mean: -12718.87654, std: 12069.25525, params: {'n_estimators': 29}, mean: -13179.46784, std: 15251.50027, params: {'n_estimators': 30}, mean: -11559.52690, std: 14138.37800, params: {'n_estimators': 31}, mean: -12668.60127, std: 10333.27827, params: {'n_estimators': 32}, mean: -14032.17118, std: 15088.01329, params: {'n_estimators': 33}, mean: -12496.77394, std: 15128.99485, params: {'n_estimators': 34}, mean: -12390.81398, std: 11912.70887, params: {'n_estimators': 35}, mean: -10934.74925, std: 11200.54499, params: {'n_estimators': 36}, mean: -11170.88379, std: 9703.12883, params: {'n_estimators': 37}, mean: -12405.63779, std: 13548.27085, params: {'n_estimators': 38}, mean: -11886.4624