In [1]:
import json

import numpy as np
import math
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as seabornInstance 

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline

# Load DataFrames and lists

In [16]:

def loadDataframe(fileName, numbRows):
    # load .csv data into a dataframe (df) and make sure only important rows stay
    df = pd.read_csv(fileName)
    df = df[:numbRows]
    #remove column white space
    df.columns = df.columns.str.replace(' ', '')
    #drop url column (it doesn't matter for us)
    if 'url' in df.columns:
        df.drop('url', axis=1, inplace=True)
    # set the dataframe index to the id column
    if 'id' in df.columns:
        df = df.set_index('id')
    # Make sure all data is float
    df = df.astype(float)
    
    return df

# Retrieve Dataframes
train_df = loadDataframe('train.csv', 5000)
predictions_df = loadDataframe('predictions_df.csv', 1000)
regressions_df = pd.read_csv('regressions_df.csv', index_col = 'model')
validation_df = loadDataframe('validation.csv', 1000)
validation_df.drop('shares', axis=1, inplace=True)

# Retrieve regression json
with open('regressions.json') as json_file:
    regressions = json.load(json_file)

# set dataframes to display all columns
pd.options.display.max_columns = None

train_df

Unnamed: 0_level_0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1.0,198.0,6.0,47.0,0.914894,1.0,0.964286,1.0,1.0,0.0,2.0,4.744681,4.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,321.0,121.000000,100300.0,843300.0,415450.00000,2988.371336,4488.109700,3714.512814,910.0,910.0,910.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.462771,0.050069,0.050348,0.386811,0.050001,0.350000,0.075000,0.021277,0.021277,0.500000,0.500000,0.350000,0.350000,0.350,-0.200000,-0.200000,-0.200000,0.443939,-0.015152,0.056061,0.015152,1170.0
2.0,660.0,7.0,181.0,0.519337,1.0,0.644231,5.0,2.0,1.0,0.0,4.613260,3.0,0.0,0.0,0.0,1.0,0.0,0.0,217.0,735.0,473.666667,1400.0,617900.0,215533.33330,1044.500000,3231.238439,2498.783924,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069391,0.066905,0.726183,0.066743,0.070778,0.578512,0.244008,0.044199,0.005525,0.888889,0.111111,0.385511,0.136364,1.000,-0.400000,-0.400000,-0.400000,0.000000,0.000000,0.500000,0.000000,6265.0
3.0,552.0,9.0,862.0,0.465089,1.0,0.635478,16.0,0.0,1.0,0.0,4.725058,7.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2000.0,377.333333,0.0,843300.0,146642.85710,0.000000,6417.240000,2543.161561,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.028869,0.028572,0.742863,0.170934,0.028762,0.510478,0.195651,0.059165,0.023202,0.718310,0.281690,0.465324,0.050000,1.000,-0.264444,-0.750000,-0.100000,0.000000,0.000000,0.500000,0.000000,121.0
4.0,559.0,10.0,1015.0,0.447503,1.0,0.636986,12.0,2.0,1.0,0.0,4.434483,10.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,586.0,235.555556,0.0,690400.0,102570.00000,0.000000,3776.977855,2572.800545,1100.0,1100.0,1100.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.020216,0.020014,0.020007,0.020318,0.919446,0.518714,0.214039,0.066995,0.015764,0.809524,0.190476,0.390143,0.100000,1.000,-0.277083,-0.800000,-0.008333,0.000000,0.000000,0.500000,0.000000,841.0
5.0,573.0,8.0,129.0,0.666667,1.0,0.790123,3.0,0.0,1.0,0.0,4.674419,7.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,469.0,259.400000,0.0,690400.0,117200.00000,0.000000,3335.052308,2029.340699,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.028901,0.028580,0.029893,0.030580,0.882047,0.327778,-0.165741,0.015504,0.038760,0.285714,0.714286,0.237500,0.100000,0.375,-0.393333,-0.700000,-0.166667,0.000000,0.000000,0.500000,0.000000,376.0
6.0,149.0,9.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.0,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,952.0,154.428571,21900.0,843300.0,363285.71430,3098.573478,9494.000000,5069.770031,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028572,0.544207,0.028572,0.370077,0.028572,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,1.000000,0.500000,0.500000,0.500000,16034.0
7.0,702.0,8.0,234.0,0.595652,1.0,0.746377,12.0,4.0,1.0,0.0,4.594017,6.0,0.0,0.0,0.0,1.0,0.0,0.0,217.0,11900.0,2337.333333,8000.0,69100.0,30983.33333,2167.275862,11900.000000,4744.522336,2100.0,5700.0,3900.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436433,0.033365,0.033470,0.463240,0.033492,0.650317,0.280357,0.025641,0.008547,0.750000,0.250000,0.500000,0.500000,0.500,-0.135714,-0.200000,-0.071429,0.300000,0.250000,0.200000,0.250000,2116.0
8.0,111.0,10.0,437.0,0.527907,1.0,0.708502,5.0,2.0,2.0,0.0,4.681922,4.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,95.0,23.000000,141400.0,843300.0,449825.00000,2179.890998,4459.275956,3159.607252,1400.0,33100.0,17250.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.295844,0.050022,0.552990,0.050343,0.050801,0.532667,0.088000,0.025172,0.016018,0.611111,0.388889,0.400000,0.100000,0.600,-0.314286,-1.000000,-0.050000,0.687500,-0.062500,0.187500,0.062500,1577.0
9.0,709.0,10.0,289.0,0.614035,1.0,0.769231,6.0,1.0,1.0,0.0,4.775087,5.0,0.0,0.0,1.0,0.0,0.0,0.0,217.0,2100.0,890.333333,0.0,51900.0,14540.00000,0.000000,2800.000000,1440.709455,1200.0,1200.0,1200.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.839994,0.040002,0.040001,0.040001,0.040001,0.505186,-0.012831,0.013841,0.010381,0.571429,0.428571,0.349091,0.136364,0.600,-0.312500,-0.500000,-0.125000,0.727273,0.068182,0.227273,0.068182,961.0
10.0,188.0,11.0,154.0,0.678322,1.0,0.721649,2.0,2.0,10.0,0.0,4.584416,8.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,633.0,221.125000,4100.0,843300.0,380700.00000,1707.769231,15994.000000,6257.756750,4100.0,4100.0,4100.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.025000,0.151537,0.158004,0.640454,0.025004,0.434722,0.188426,0.051948,0.025974,0.666667,0.333333,0.612500,0.100000,1.000,-0.377083,-0.800000,-0.125000,0.750000,-0.150000,0.250000,0.150000,1169.0


In [9]:
#remove outliers
# filter the data frame to remove the values exceeding 3 standard deviations
train_remove_df = train_df[(np.abs(stats.zscore(train_df)) < 3).all(axis=1)]

# what rows were removed
removed_rows = train_df.index.difference(train_remove_df.index)

train_df = train_remove_df
train_df

  return (a - mns) / sstd
  return (a - mns) / sstd
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1


# Defining the combinations to test

In [15]:
#Transforming variables

# Transform variables functions
def logVar(var):
    var += 1
    var = np.log(var)
    return var

def poweredToThree(var):
    var = var**3
    return var
    
def thirdRoot(var):
    var = var**(1.0/3)
    return var
    
def zero(var):
    var = 0
    return var

new_train_df = train_df.apply(lambda x: thirdRoot(x) if x.name == 'shares' else x)
# Call functions
#train_df.apply(lambda x: logVar(x['self_reference_avg_sharess']),axis=1)
#train_df.apply(lambda x: poweredToThree(x['global_subjectivity']),axis=1)
new_train_df

Unnamed: 0_level_0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1.0,198.0,6.0,47.0,0.914894,1.0,0.964286,1.0,1.0,0.0,2.0,4.744681,4.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,321.0,121.000000,100300.0,843300.0,415450.00000,2988.371336,4488.109700,3714.512814,910.0,910.0,910.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.462771,0.050069,0.050348,0.386811,0.050001,0.350000,0.075000,0.021277,0.021277,0.500000,0.500000,0.350000,0.350000,0.350,-0.200000,-0.200000,-0.200000,0.443939,-0.015152,0.056061,0.015152,10.537282
2.0,660.0,7.0,181.0,0.519337,1.0,0.644231,5.0,2.0,1.0,0.0,4.613260,3.0,0.0,0.0,0.0,1.0,0.0,0.0,217.0,735.0,473.666667,1400.0,617900.0,215533.33330,1044.500000,3231.238439,2498.783924,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069391,0.066905,0.726183,0.066743,0.070778,0.578512,0.244008,0.044199,0.005525,0.888889,0.111111,0.385511,0.136364,1.000,-0.400000,-0.400000,-0.400000,0.000000,0.000000,0.500000,0.000000,18.434882
3.0,552.0,9.0,862.0,0.465089,1.0,0.635478,16.0,0.0,1.0,0.0,4.725058,7.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2000.0,377.333333,0.0,843300.0,146642.85710,0.000000,6417.240000,2543.161561,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.028869,0.028572,0.742863,0.170934,0.028762,0.510478,0.195651,0.059165,0.023202,0.718310,0.281690,0.465324,0.050000,1.000,-0.264444,-0.750000,-0.100000,0.000000,0.000000,0.500000,0.000000,4.946087
4.0,559.0,10.0,1015.0,0.447503,1.0,0.636986,12.0,2.0,1.0,0.0,4.434483,10.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,586.0,235.555556,0.0,690400.0,102570.00000,0.000000,3776.977855,2572.800545,1100.0,1100.0,1100.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.020216,0.020014,0.020007,0.020318,0.919446,0.518714,0.214039,0.066995,0.015764,0.809524,0.190476,0.390143,0.100000,1.000,-0.277083,-0.800000,-0.008333,0.000000,0.000000,0.500000,0.000000,9.439131
5.0,573.0,8.0,129.0,0.666667,1.0,0.790123,3.0,0.0,1.0,0.0,4.674419,7.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,469.0,259.400000,0.0,690400.0,117200.00000,0.000000,3335.052308,2029.340699,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.028901,0.028580,0.029893,0.030580,0.882047,0.327778,-0.165741,0.015504,0.038760,0.285714,0.714286,0.237500,0.100000,0.375,-0.393333,-0.700000,-0.166667,0.000000,0.000000,0.500000,0.000000,7.217652
6.0,149.0,9.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.0,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,952.0,154.428571,21900.0,843300.0,363285.71430,3098.573478,9494.000000,5069.770031,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028572,0.544207,0.028572,0.370077,0.028572,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,1.000000,0.500000,0.500000,0.500000,25.216257
7.0,702.0,8.0,234.0,0.595652,1.0,0.746377,12.0,4.0,1.0,0.0,4.594017,6.0,0.0,0.0,0.0,1.0,0.0,0.0,217.0,11900.0,2337.333333,8000.0,69100.0,30983.33333,2167.275862,11900.000000,4744.522336,2100.0,5700.0,3900.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436433,0.033365,0.033470,0.463240,0.033492,0.650317,0.280357,0.025641,0.008547,0.750000,0.250000,0.500000,0.500000,0.500,-0.135714,-0.200000,-0.071429,0.300000,0.250000,0.200000,0.250000,12.838232
8.0,111.0,10.0,437.0,0.527907,1.0,0.708502,5.0,2.0,2.0,0.0,4.681922,4.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,95.0,23.000000,141400.0,843300.0,449825.00000,2179.890998,4459.275956,3159.607252,1400.0,33100.0,17250.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.295844,0.050022,0.552990,0.050343,0.050801,0.532667,0.088000,0.025172,0.016018,0.611111,0.388889,0.400000,0.100000,0.600,-0.314286,-1.000000,-0.050000,0.687500,-0.062500,0.187500,0.062500,11.639757
9.0,709.0,10.0,289.0,0.614035,1.0,0.769231,6.0,1.0,1.0,0.0,4.775087,5.0,0.0,0.0,1.0,0.0,0.0,0.0,217.0,2100.0,890.333333,0.0,51900.0,14540.00000,0.000000,2800.000000,1440.709455,1200.0,1200.0,1200.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.839994,0.040002,0.040001,0.040001,0.040001,0.505186,-0.012831,0.013841,0.010381,0.571429,0.428571,0.349091,0.136364,0.600,-0.312500,-0.500000,-0.125000,0.727273,0.068182,0.227273,0.068182,9.868272
10.0,188.0,11.0,154.0,0.678322,1.0,0.721649,2.0,2.0,10.0,0.0,4.584416,8.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,633.0,221.125000,4100.0,843300.0,380700.00000,1707.769231,15994.000000,6257.756750,4100.0,4100.0,4100.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.025000,0.151537,0.158004,0.640454,0.025004,0.434722,0.188426,0.051948,0.025974,0.666667,0.333333,0.612500,0.100000,1.000,-0.377083,-0.800000,-0.125000,0.750000,-0.150000,0.250000,0.150000,10.534279


In [17]:

# Define the combination group
group0 = ['data_channel_is_entertainment', 'data_channel_is_tech', 'data_channel_is_bus']
group1 = ['weekday_is_saturday', 'is_weekend']
group2 = ['kw_avg_avg']
group3 = ['LDA_03']
group4 = ['data_channel_is_world']
group5 = ['abs_title_sentiment_polarity', 'title_subjectivity']
group6 = ['self_reference_avg_sharess']
group7 = ['global_subjectivity']
group8 = ['num_imgs']
group9 = ['num_keywords']
  
group = [group0, group1, group2, group3, group4, group5, group6, group7, group8, group9]


# Create the different combinations
combinations = []
for i in group0:
    for j in group1:
        for k in group2:
            for l in group3:
                for m in group4:
                    for n in group5:
                        for o in group6:
                            for p in group7:
                                for q in group8:
                                    for r in group9:
                                        combinations.append([i, j, k, l, m, n, o, p, q, r])


# The numb of combinations has to be the multplication of the number of variables of each group
numbCombinations = 1
for i in range(0, len(group)):
    numbCombinations = numbCombinations * len(group[i])
# Check its okay
print('Number of combinations created: ',len(combinations))
print('Condition is met:', len(combinations) == numbCombinations)

Number of combinations created:  12
Condition is met: True


In [19]:
# Save the combination groups
def saveCombinationGroup(varGroups):
    with open('Combination_Study/allCombinationsGroups.json') as json_file:
        combinationGroupsList = json.load(json_file)
    # Set the new combination number
    combinationGroupNumber = len(combinationGroupsList)
    # Create the new combination dictionary
    combinationGroup = {'combination' : combinationGroupNumber}
    for i in range(0, len(varGroups)):
        combinationGroup['group{}'.format(i)] = varGroups[i]
    # Add combination do combinationList
    combinationGroupsList.append(combinationGroup)
    # Save combinationsList and new combination
    with open('Combination_Study/allCombinationsGroups.json'.format(combinationGroupNumber), 'w') as outfile:
            json.dump(combinationGroupsList, outfile)
    with open('Combination_Study/combinationGroup_{}.json'.format(combinationGroupNumber), 'w') as outfile:
            json.dump(combinationGroup, outfile)
    
    return combinationGroup, combinationGroupNumber


# Call the saveCombinationGroup in json dictionary
combinationGroup, combinationGroupNumber = saveCombinationGroup(group)
print(combinationGroupNumber)
group

9


[['data_channel_is_entertainment',
  'data_channel_is_tech',
  'data_channel_is_bus'],
 ['weekday_is_saturday', 'is_weekend'],
 ['kw_avg_avg'],
 ['LDA_03'],
 ['data_channel_is_world'],
 ['abs_title_sentiment_polarity', 'title_subjectivity'],
 ['self_reference_avg_sharess'],
 ['global_subjectivity'],
 ['num_imgs'],
 ['num_keywords']]

In [25]:
# Stats
def retrieveStats(y_real, y_pred, x_used_to_pred):
    real_avg_shares = y_real.mean()
    r2_pred = 0
    r2_real = 0
    for index, value in y_real.iteritems():
        r2_pred += (real_avg_shares - y_pred[index])**2 #y_pred[index] = previsão para o indice atual
        r2_real += (real_avg_shares - y_real[index])**2 #y_real[index] = shares reais para o indice atual
    r2 = r2_pred/ r2_real
    n_observations = len(y_real)
    n_variables_used = len(x_used_to_pred.columns)
    adj_r2 = 1 - (1-r2)*(n_observations - 1)/(n_observations - 1 - n_variables_used)
    
    mae = metrics.mean_absolute_error(y_real, y_pred)
    rmse = math.sqrt(metrics.mean_squared_error(y_real, y_pred))
    
    try:
        rmsle = math.sqrt(metrics.mean_squared_log_error(y_real, y_pred))
    except:
        rmsle = -1
    
    return r2, adj_r2, mae, rmse, rmsle, n_observations, n_variables_used

# Create a prediction (x_predict = dataframe with idependent variable used to predict y_predict values; linear model in dictionary form)
def modelPredict(x_predict, linear_model):
    y = 0
    y_predict = []
    for row in X.index:
        y = model_coeff['const']
        for column in X.columns:
            y += X[column][row]*model_coeff[column]
        y_predict.append(y)
    # Create a series from the list
    y_predict = pd.Series(y_predict, index = X.index)
    y_predict.index.name = 'id'
    return y_predict

# Retrieve model dictionary with coeff
def retrieveModelCoeff(regression_model):
    model_coeff = {}
    for key in regression_model.params.keys(): #loops through all variables
        model_coeff[key] = regression_model.params[key]

    for column in train_df.drop('shares', axis = 1):
        if column not in model_coeff.keys():
            model_coeff[column] = 0
    return model_coeff

# Regression Model Training Function

In [28]:
# Value prediction function (x_predict = idependent variable data used to predict y_predict values; linear model in dictionary form)
def modelPredict(x_predict, linear_model):
    y_predict = []
    for row in x_predict.index:
        y = linear_model['const']
        for column in x_predict.columns:
            y += x_predict[column][row]*linear_model[column]
        y_predict.append(y)
    return y_predict


# Model training function (X = independent variables data, Y = independent variable data)
def createModel(X, Y, train_data, description):
    
    # Create models' dataframe with all variables but empty
    models = train_data.drop('shares', axis = 1).iloc[0:0]

    # Add column for the constant (on first column)
    models.insert(0, 'const', [], True)

    # Iteration for regression creation
    for i in range(0,1000):
        
        # Add constant to X to creat linear model constant
        X = sm.add_constant(X)   
        
        # Split X and Y into train and test parts 
        # Using 70% of the data to train the model and 30% to test it
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)
        
        
        # Train a linear regression using Ordinary Least Squares
        regression = sm.OLS(y_train, x_train)

        # Fit the regression model
        regression_model = regression.fit()

        # Predict Y values for x_test
        y_pred = regression_model.predict(x_test)


         # Retrieve Coefficient Dicitionary
        model_coeff = retrieveModelCoeff(regression_model)

        # Add the new model to models
        models = models.append(model_coeff, ignore_index=True)

    
    # Create the final model dictionary from the averages of the iterations
    final_model = {}
    for column in models.columns:
        final_model[column] = models[column].mean()
        # change null coefficients to 0
        if math.isnan(final_model[column]):
            final_model[column] = 0

    # Predict results with final model
    final_pred = modelPredict(X, final_model)
    
    # Retrive Stats for final model
    r2, adj_r2, mae, rmse, rmsle, n_observations, n_variables_used = retrieveStats(Y, final_pred, X)
    
    # Add Stats to the model
    final_model['r2'] = r2
    final_model['adj_r2'] = adj_r2
    final_model['mae'] = mae
    final_model['rmse'] = rmse
    final_model['rmsle'] = rmsle
    final_model['n_variables_used'] = n_variables_used
    #final_model['f_pvalue'] = f_pvalue
    
    # Add description
    final_model['variables_used'] = ';'.join(X.columns)
    
    
    return final_model, final_pred, models


# Training Regression Models for each Combination

In [29]:
# Create dataframe for the models created with each combination
regressionModels_df = pd.read_csv('Combination_Study/base_df.csv', index_col = 'model')

# Create list for the models created with each combination
regressionModels_list = []

for combination in combinations:
    
    # Defining Dependent Variable (Y) and Independent Variables (x)
    X = train_df.drop('shares', axis = 1)
    Y = train_df['shares']
    
    # Drop independet variables not part of the combination
    for var in X.columns:
        if var not in combination:
            X.drop(var, axis=1, inplace=True)
    
    # Description for model
    description = {}
    description['usedVariables'] = ';'.join(combination)
    description['trasnformedVariables'] = ''
    description['uesdComponents'] = ''
    description['otherDescription'] = 'removed outliers with >3*std_deviations, which gave a dataset of 2600 observations'
    
    
    finalModel, finalPrediction, models = createModel(X, Y, train_df, description)
    
    # Add final model of combination to regressions list
    regressionModels_list.append(finalModel)
    
    # Add final model of combination to regression models dataframe
    regressionModels_df = regressionModels_df.append(finalModel, ignore_index=True)
    regressionModels_df.index.rename('model', inplace = True)

    
# Create .json file with list for the models created with each combination of combination group
with open('Combination_Study/combinationGroup_{}_models.json'.format(combinationNummber), 'w') as outfile:
    json.dump(regressionModels_list, outfile)
# Create .csv file dataframe for the models created with each combination of combination group
regressionModels_df.to_csv('Combination_Study/combinationGroup_{}_models.csv'.format(combinationNummber))

regressionModels_df

TypeError: list indices must be integers or slices, not float

# Find the Best Combination Model

In [234]:
min_rmse = 100000000
bestIndex = 0
for index, row in regressionModels_df.iterrows():
    if row['r2'] > 0.10:
        if row['rmse'] < min_rmse:
            min_rmse = row['rmse']
            bestIndex = index
print(bestIndex )

#delete all rows in regressions except for best
bestModel = regressionModels_df.copy()
for index, row in bestModel.iterrows():
    if index != bestIndex :
        bestModel = bestModel.drop(index)
bestModel = bestModel.reset_index(drop=True)

bestModel

10


Unnamed: 0,const,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,adj_r2,f_pvalue,mae,mse,r2,rmse,otherDescription,trasnformedVariables,uesdComponents,usedVariables
0,-1274.147,0,0,0,0,0,0,23.688415,0,26.65894,0,0,4.231398,0,0.0,0,0,739.578,132.235569,0,0,0.012016,0,0,0,0,0,0.63996,0,0,0.058071,0,0,0,0,0,0,1.195155e-13,0,0,0,0,359.21405,0,1992.052023,0,0,0,0,0,0,0,0,0,0,758.022913,247.374,0,0,0.0,0.102715,1.2369510000000001e-25,1692.881541,8809132.0,0.108618,2962.354989,"removed outliers with >3*std_deviations, which...",,,data_channel_is_tech;weekday_is_sunday;kw_avg_...


In [235]:
# Add model to main regressions
regressionModels_list[bestIndex]
regressions_df = regressions_df.append(regressionModels_list[bestIndex], ignore_index=True)
regressions_df.index.rename('model', inplace = True)
regressions_df.to_csv('regressions_df.csv')
regressions_df

Unnamed: 0_level_0,const,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,adj_r2,f_pvalue,mae,mse,r2,rmse,otherDescription,trasnformedVariables,uesdComponents,usedVariables
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
0,-1012.188225,0.0,46.683661,0.0,1009.842921,0.0,0.0,0.0,0.0,33.655876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-291.684753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.95338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,919.938111,0.0,0.0,0.0,0.0,0.0,1282.21568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040068,0.001204237,3505.473114,1148845000.0,0.046794,28504.784125,,kw_avg_avg = ln(kw_avg_avg); num_imgs = ln(num...,,"kw_avg_avg, LDA_03, weekday_is_saturday, data_..."
1,-1162.855284,0.0,54.809451,0.0,923.750597,0.0,0.0,0.0,0.0,31.892128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-272.007833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.994633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,844.136453,0.0,0.0,0.0,0.0,0.0,1269.662814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041349,0.0007022607,3504.226627,1156378000.0,0.048067,28311.649868,,kw_avg_avg = ln(kw_avg_avg); num_imgs = ln(num...,,kw_avg_avg;LDA_03;weekday_is_saturday;data_cha...
2,-342.809186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.140975,0.0,0.0,0.0,0.0,0.0,-431.606145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.093481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,647.628096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1300.522753,0.038499,0.0009682324,3085.267947,65432410.0,0.043311,8080.459258,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
3,-884.260504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,662.983191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.11192,0.0,0.0,0.018092,0.0,0.0,0.0,0.0,0.0,0.0,128.2921,0.0,0.0,0.0,0.0,1582.070384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1768.593275,0.040066,1.3154880000000002e-29,2920.824805,39175780.0,0.041712,6259.05545,,,,data_channel_is_tech;weekday_is_sunday;kw_avg_...
4,-134.72375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.199458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-424.978274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.001576,0.0,0.0,0.010249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,742.818043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1346.536546,0.03542,2.24163e-16,3031.92136,64181240.0,0.037074,7930.673209,,,,data_channel_is_world;is_weekend;kw_avg_avg;nu...
5,-996.917913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-901.450956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.900326,0.0,0.0,0.010534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730.822535,0.0,0.0,0.0,1756.028695,0.0,2293.032731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1218.312524,0.039301,6.001128e-18,3031.898507,64808280.0,0.041223,7969.082564,,,,data_channel_is_entertainment;is_weekend;kw_av...
6,-923.041002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.870101,0.0,0.0,0.0,0.0,-957.666395,0.0,0.0,0.0,0.0,0.0,0.0,-0.101773,0.0,0.0,0.0,0.0,0.0,0.894212,0.0,0.0,0.00971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,736.317535,0.0,0.0,-183.039114,1557.265985,0.0,1854.423953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1514.118911,586.147401,0.0,0.0,0.0,0.038932,1.69593e-16,3027.101088,64224390.0,0.041953,7937.290262,,ln(self_reference_avg_sharess+1);(global_subje...,,data_channel_is_entertainment;is_weekend;kw_av...
7,-1101.585854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.606413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,344.293437,-99.194909,0.0,0.0,-0.118896,0.0,0.0,0.0,0.0,0.0,0.923732,0.0,0.0,0.009815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,735.163607,0.0,0.0,0.0,1367.907753,0.0,1604.402775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1523.378296,0.0,0.0,0.0,1201.478893,0.037668,8.859988e-16,3030.163444,64322610.0,0.040693,7943.085445,,,,data_channel_is_tech;is_weekend;kw_avg_avg;LDA...
8,-2018.350774,0.0,0.0,0.0,0.0,0.0,0.0,-7.929985,0.0,26.959527,0.0,0.0,104.534878,0.0,0.0,0.0,0.0,314.54367,0.0,0.0,0.0,-0.163697,0.0,0.0,0.0,0.0,0.0,0.95995,0.0,0.0,0.009182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,708.211293,0.0,0.0,189.452877,1450.897608,0.0,1835.117485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1571.61409,520.328097,0.0,0.0,0.0,0.037648,5.463665e-15,3034.708657,64417240.0,0.041223,7945.742399,,,,data_channel_is_tech;is_weekend;kw_avg_avg;LDA...
9,-1274.147,0.0,0.0,0.0,0.0,0.0,0.0,23.688415,0.0,26.65894,0.0,0.0,4.231398,0.0,0.0,0.0,0.0,739.578464,132.235569,0.0,0.0,0.012016,0.0,0.0,0.0,0.0,0.0,0.63996,0.0,0.0,0.058071,0.0,0.0,0.0,0.0,0.0,0.0,1.195155e-13,0.0,0.0,0.0,0.0,359.21405,0.0,1992.052023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,758.022913,247.37404,0.0,0.0,0.0,0.102715,1.2369510000000001e-25,1692.881541,8809132.0,0.108618,2962.354989,"removed outliers with >3*std_deviations, which...",,,data_channel_is_tech;weekday_is_sunday;kw_avg_...


In [236]:
# Creat model prediction and add to main predictions
modelNumber = len(predictions_df.columns)
prediction = modelPredict(validation_df, regressionModels_list[bestIndex])
predictions_df = loadDataframe('predictions_df.csv', 1000)
predictions_df[str(modelNumber)] = prediction
predictions_df.to_csv('predictions_df.csv')
predictions_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5001,1677.773769,1675.347157,2175.195109,1800.441681,1800.441681,2240.982859,2056.533133,2036.042264,1561.385954,2124.934448
5002,3252.921173,3267.982286,3596.937389,3590.802994,3590.802994,3698.198802,3534.786602,3772.432755,3578.807565,3465.959113
5003,1836.524527,1795.896092,1834.678355,1889.499322,1889.499322,1935.996712,1796.889690,1686.075148,1834.678908,1365.960482
5004,3826.708399,3779.499541,2409.133565,2460.743284,2460.743284,4130.180971,4599.696858,4222.204521,4463.022110,1912.437572
5005,2914.191017,2900.339344,2573.689337,2539.407346,2539.407346,3092.372563,3467.977728,2587.603088,3405.270615,2079.455357
5006,2311.801890,2240.607174,2178.616899,2257.170852,2257.170852,2476.605153,2367.990327,2464.896316,2668.011270,2481.627506
5007,3383.043272,3388.549400,3456.421755,3413.495905,3413.495905,3035.188686,3248.636309,3325.102056,3553.168772,3352.486065
5008,3583.709253,3634.423882,3621.229699,3834.287092,3834.287092,3673.499602,3762.834721,3381.795356,3468.079236,4001.268625
5009,2920.817682,2954.280652,3921.942476,3440.172218,3440.172218,3407.674174,3483.192316,3243.593384,3034.332826,2194.755746
5010,2537.676491,2554.940567,2756.593332,2754.862473,2754.862473,1722.705370,1846.173798,2410.484738,2406.095407,1822.551890


In [239]:
# Create prediction file to upload
def createUploadPrediction(modelNumber):
    sample_df = predictions_df.copy()
    for column in sample_df.columns:
        if column != str(modelNumber):
            sample_df = sample_df.drop(column, axis = 1)
    sample_df.rename(columns={str(modelNumber): "Prediction"}, inplace = True)
    sample_df.to_csv('Upload_Predictions/model_{}.csv'.format(modelNumber))
    return sample_df

upload_prediction = createUploadPrediction(9)
upload_prediction

Unnamed: 0_level_0,Prediction
id,Unnamed: 1_level_1
5001,2124.934448
5002,3465.959113
5003,1365.960482
5004,1912.437572
5005,2079.455357
5006,2481.627506
5007,3352.486065
5008,4001.268625
5009,2194.755746
5010,1822.551890
