In [86]:
import json

import numpy as np
import math
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as seabornInstance 

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline

# Load DataFrames and lists

In [87]:

def loadDataframe(fileName, numbRows):
    # load .csv data into a dataframe (df) and make sure only important rows stay
    df = pd.read_csv(fileName)
    df = df[:numbRows]
    #remove column white space
    df.columns = df.columns.str.replace(' ', '')
    #drop url column (it doesn't matter for us)
    if 'url' in df.columns:
        df.drop('url', axis=1, inplace=True)
    # set the dataframe index to the id column
    if 'id' in df.columns:
        df = df.set_index('id')
    # Make sure all data is float
    df = df.astype(float)
    
    return df

# Retrieve Dataframes
train_df = loadDataframe('train.csv', 5000)
predictions_df = loadDataframe('predictions_df.csv', 1000)
regressions_df = pd.read_csv('regressions_df.csv', index_col = 'model')
validation_df = loadDataframe('validation.csv', 1000)
validation_df.drop('shares', axis=1, inplace=True)

# Retrieve regression json
with open('regressions.json') as json_file:
    regressions = json.load(json_file)

train_df

Unnamed: 0_level_0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,198.0,6.0,47.0,0.914894,1.0,0.964286,1.0,1.0,0.0,2.0,...,0.350000,0.350,-0.200000,-0.200000,-0.200000,0.443939,-0.015152,0.056061,0.015152,1170.0
2.0,660.0,7.0,181.0,0.519337,1.0,0.644231,5.0,2.0,1.0,0.0,...,0.136364,1.000,-0.400000,-0.400000,-0.400000,0.000000,0.000000,0.500000,0.000000,6265.0
3.0,552.0,9.0,862.0,0.465089,1.0,0.635478,16.0,0.0,1.0,0.0,...,0.050000,1.000,-0.264444,-0.750000,-0.100000,0.000000,0.000000,0.500000,0.000000,121.0
4.0,559.0,10.0,1015.0,0.447503,1.0,0.636986,12.0,2.0,1.0,0.0,...,0.100000,1.000,-0.277083,-0.800000,-0.008333,0.000000,0.000000,0.500000,0.000000,841.0
5.0,573.0,8.0,129.0,0.666667,1.0,0.790123,3.0,0.0,1.0,0.0,...,0.100000,0.375,-0.393333,-0.700000,-0.166667,0.000000,0.000000,0.500000,0.000000,376.0
6.0,149.0,9.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.0,...,0.000000,0.000,0.000000,0.000000,0.000000,1.000000,0.500000,0.500000,0.500000,16034.0
7.0,702.0,8.0,234.0,0.595652,1.0,0.746377,12.0,4.0,1.0,0.0,...,0.500000,0.500,-0.135714,-0.200000,-0.071429,0.300000,0.250000,0.200000,0.250000,2116.0
8.0,111.0,10.0,437.0,0.527907,1.0,0.708502,5.0,2.0,2.0,0.0,...,0.100000,0.600,-0.314286,-1.000000,-0.050000,0.687500,-0.062500,0.187500,0.062500,1577.0
9.0,709.0,10.0,289.0,0.614035,1.0,0.769231,6.0,1.0,1.0,0.0,...,0.136364,0.600,-0.312500,-0.500000,-0.125000,0.727273,0.068182,0.227273,0.068182,961.0
10.0,188.0,11.0,154.0,0.678322,1.0,0.721649,2.0,2.0,10.0,0.0,...,0.100000,1.000,-0.377083,-0.800000,-0.125000,0.750000,-0.150000,0.250000,0.150000,1169.0


# Defining The Data

In [88]:
# Defining Dependent Variable (Y) and Inependent Variables (x)
#X = train_df.drop('shares', axis = 1)
#Y = train_df['shares']


# Select independet variables to use function
def useVar(X, variables_to_use):
    for column in X.columns:
        if column not in variables_to_use:
            X.drop(column, axis=1, inplace=True)
    # Description for model
    description = {}
    description['usedVariables'] = ';'.join(variables_to_use)
    description['trasnformedVariables'] = ''
    description['uesdComponents'] = ''
    description['otherDescription'] = ''
    return description

# Transform variables functions
def logVar(var):
    if var != 0:
        var = np.log(var)




# Call functions
#useVar(usedVariables)
#X.apply(lambda x: logVar(x['num_imgs']),axis=1)
#X.apply(lambda x: logVar(x['kw_avg_avg']),axis=1)


# Train Rgression Model

In [89]:

def modelPredict(X, model_dict):
    y_predict = []
    for row in X.index:
        y = model_dict['const']
        for column in X.columns:
            y += X[column][row]*model_dict[column]
        y_predict.append(y)
    return y_predict




def createModel(X, regressions_df, description):
    # Giving a number to the model
    numbModel = len(predictions_df.columns)

    # Create models' dataframe with all variables but empty
    models = train_df.drop('shares', axis = 1).iloc[0:0]

    # Add column for the constant (on first column)
    models.insert(0, 'const', [], True)

    # Iteration for regression creation
    for i in range(0,1000):
        X = sm.add_constant(X) # necessary for code to work    
        
        # Split X and Y into train and test parts (uso 80% dos dados para calcular/ treinar a regressão e testo em 80% resto, a seleção é aleatória)
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)


        #Fit a linear model using Ordinary Least Squares
        regression_model = sm.OLS(y_train, x_train)

        #fit the data
        est = regression_model.fit()

        # predict values
        y_pred = est.predict(x_test)

        # Retrieve coef and stats
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
        r2 = est.rsquared
        adj_r2 = est.rsquared_adj
        f_pvalue = est.f_pvalue

        #creating coeff dictionary
        coeff = {}
        for key in est.params.keys(): #loops through all variables
            coeff[key] = est.params[key]

        # Add the metrics to the dictionary (new_model beacuse it is a row to add to models)
        new_model = coeff
        new_model['r2'] = r2
        new_model['adj_r2'] = adj_r2
        new_model['f_pvalue'] = f_pvalue
        new_model['mae'] = mae
        new_model['mse'] = mse
        new_model['rmse'] = rmse

        # Add new_model to models
        models = models.append(new_model, ignore_index=True)

    # Save the iterations in .csv
    models.to_csv('Iterations/model_{}_iterations.csv'.format(numbModel))
    
    # Create the final model from the averages of the iterations
    final_model_dict = {}
    for column in models.columns:
        final_model_dict[column] = models[column].mean()
        # change to 0 null coefficients
        if math.isnan(final_model_dict[column]):
            final_model_dict[column] = 0
    # Add model description
    final_model_dict['usedVariables'] = description['usedVariables']
    final_model_dict['trasnformedVariables'] = description['trasnformedVariables']
    final_model_dict['uesdComponents'] = description['uesdComponents']
    final_model_dict['otherDescription'] = description['otherDescription']

    # Add new model to regressions dataframe
    regressions.append(final_model_dict)
    #update .json file
    with open('regressions.json', 'w') as outfile:
        json.dump(regressions, outfile)
    # Add new model to regressions dataframe
    regressions_df = regressions_df.append(final_model_dict, ignore_index=True)
    regressions_df.index.rename('model', inplace = True)
    # update .csv file
    regressions_df.to_csv('regressions_df.csv')

    # Predict results
    final_pred = modelPredict(validation_df, final_model_dict)
    # Add predicted results to predictions Dataframe
    predictions_df[str(numbModel)] = final_pred
    # update .csv file
    predictions_df.to_csv('predictions_df.csv')
    
    return regressions_df, models

# Calling model function
#regressions_df, models = createModel(X, regressions_df)


# Models Analysis

In [90]:
#regressions[len(regressions)-1]

In [91]:
#regressions_df

In [92]:
#predictions_df

In [93]:
# Create prediction file to upload
def createUploadPRediction(modelNumber):
    sample_df = predictions_df.copy()
    for column in sample_df.columns:
        if column != str(modelNumber):
            sample_df = sample_df.drop(column, axis = 1)
    sample_df.rename(columns={str(modelNumber): "Prediction"}, inplace = True)
    sample_df.to_csv('Upload_Predictions/model_{}.csv'.format(modelNumber))
    return sample_df

#upload_prediction = createUploadPRediction(2)
#upload_prediction

In [94]:
def dropUnwantedData(predictions_df,regressions_df):
    #dropping all columns
    for column in predictions_df.columns:
        if column != '0' and column !='1' :
            predictions_df = predictions_df.drop(column, axis = 1)
    predictions_df.to_csv('predictions_df.csv')

    regressions_df = regressions_df[:2]
    regressions_df.to_csv('regressions_df.csv', index=False)
#dropUnwantedData(predictions_df,regressions_df)


# Getting the best regression model from combinations of correlated variables

In [95]:
group1 = ['data_channel_is_entertainment', 'data_channel_is_world', 'data_channel_is_tech', 'data_channel_is_bus']
group2 = ['weekday_is_sunday', 'is_weekend']
group3 = ['kw_avg_avg', 'kw_max_avg', 'kw_min_avg']
group4 = ['LDA_03', 'num_imgs']
group5 = ['abs_title_sentiment_polarity', 'title_subjectivity']
group6 = ['self_reference_avg_sharess']

var_list = []
for i in group1:
    for j in group2:
        for k in group3:
            for l in group4:
                for m in group5:
                    for n in group6:
                        var_list.append([i, j, k, l, m, n])

for vars_to_use in var_list:
    # Defining Dependent Variable (Y) and Inependent Variables (x)
    X = train_df.drop('shares', axis = 1)
    Y = train_df['shares']
    descritpition = useVar(X, vars_to_use)
    regressions_df, models = createModel(X, regressions_df, description)

regressions_df

Unnamed: 0,const,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,adj_r2,f_pvalue,mae,mse,r2,rmse,otherDescription,trasnformedVariables,uesdComponents,usedVariables
0,-1012.188225,0.0,46.683661,0.0,1009.842921,0.0,0.0,0.0,0.0,33.655876,...,0.040068,0.001204,3505.473114,1.148845e+09,0.046794,28504.784125,,kw_avg_avg = ln(kw_avg_avg); num_imgs = ln(num...,,"kw_avg_avg, LDA_03, weekday_is_saturday, data_..."
1,-1162.855284,0.0,54.809451,0.0,923.750597,0.0,0.0,0.0,0.0,31.892128,...,0.041349,0.000702,3504.226627,1.156378e+09,0.048067,28311.649868,,kw_avg_avg = ln(kw_avg_avg); num_imgs = ln(num...,,kw_avg_avg;LDA_03;weekday_is_saturday;data_cha...
2,-51.516832,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.040374,0.000624,3077.804689,6.574614e+07,0.045177,8100.681501,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
3,-42.873961,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.040408,0.000847,3074.224561,6.610914e+07,0.045211,8123.315186,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
4,-272.266899,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,39.463081,...,0.038440,0.001148,3067.965626,6.615698e+07,0.043253,8126.183419,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
5,-315.922395,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,40.290602,...,0.038017,0.001425,3072.554442,6.592614e+07,0.042832,8112.472412,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
6,1816.599838,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.032462,0.003704,3109.066182,6.685847e+07,0.037304,8169.092800,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
7,1792.885783,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.032177,0.004283,3108.236219,6.685153e+07,0.037021,8168.912055,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
8,1925.769596,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,52.606326,...,0.024627,0.030239,3121.835407,6.718455e+07,0.029509,8188.282343,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
9,1910.717020,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,53.145102,...,0.023980,0.031386,3127.900861,6.683294e+07,0.028865,8167.021108,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...


In [113]:
# Running variable combinations

combinationNummber = len(all_combinations)-1
group1 = ['data_channel_is_entertainment', 'data_channel_is_world', 'data_channel_is_tech', 'data_channel_is_bus']
group2 = ['weekday_is_sunday', 'is_weekend']
group3 = ['kw_avg_avg', 'kw_max_avg', 'kw_min_avg']
group4 = ['LDA_03', 'num_imgs']
group5 = ['abs_title_sentiment_polarity', 'title_subjectivity']
group6 = ['self_reference_avg_sharess']
combination = {'combination' : combinationNummber, 'group 1': group1 ,'group 2': group2,'group 3': group3,'group 4': group4,'group 5' : group5}
with open('Combination_Study/combination_{}.json'.format(combinationNummber), 'w') as outfile:
        json.dump(combination, outfile)

In [137]:
min_rmse = 10000
bestIndex = 0
for index, row in regressions_df.iterrows():
    if row['r2'] > 0.04:
        if row['rmse'] < min_rmse:
            min_rmse = row['rmse']
            bestIndex = index
print(bestIndex )

#delete all rows in regressions except 0, 1 and 88
new = regressions_df.copy()
for index, row in new.iterrows():
    if index != 0 and index != 1 and index != bestIndex :
        new = new.drop(index)
new = new.reset_index(drop=True)
new.to_csv('regressions_df.csv')

#regressions_df.iloc[bestIndex]
#regressions[bestIndex]
#predictions_df[str(bestIndex)]

# delete all columns in predictions except 1, 2 and best index
new_pred = predictions_df.copy()
for column in new_pred.columns:
    if column != '0' and column != '1' and column != str(bestIndex):
        new_pred = new_pred.drop(column, axis = 1)
new_pred  = new_pred.rename(columns={str(bestIndex): str(len(new_pred.columns)-1)})
new_pred.to_csv('predictions_df.csv')


#delete all regressions in json except for 
new_regr = []
for i in range(0,len(regressions)):
    if i == 0 or i == 1 or i == bestIndex:
        new_regr.append(regressions[i])
with open('regressions.json', 'w') as outfile:
        json.dump(new_regr, outfile)

88


In [112]:

combination_df.to_csv('Combination_Study/All_Models/combination_1.csv', index = True)
combination_df = pd.read_csv('Combination_Study/All_Models/combination_1.csv', index_col = 0 )
combination_df

Unnamed: 0,const,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,adj_r2,f_pvalue,mae,mse,r2,rmse,otherDescription,trasnformedVariables,uesdComponents,usedVariables
0,-51.516832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.040374,0.000624,3077.804689,6.574614e+07,0.045177,8100.681501,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
1,-42.873961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.040408,0.000847,3074.224561,6.610914e+07,0.045211,8123.315186,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
2,-272.266899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.463081,...,0.038440,0.001148,3067.965626,6.615698e+07,0.043253,8126.183419,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
3,-315.922395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.290602,...,0.038017,0.001425,3072.554442,6.592614e+07,0.042832,8112.472412,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
4,1816.599838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.032462,0.003704,3109.066182,6.685847e+07,0.037304,8169.092800,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
5,1792.885783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.032177,0.004283,3108.236219,6.685153e+07,0.037021,8168.912055,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
6,1925.769596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.606326,...,0.024627,0.030239,3121.835407,6.718455e+07,0.029509,8188.282343,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
7,1910.717020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.145102,...,0.023980,0.031386,3127.900861,6.683294e+07,0.028865,8167.021108,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
8,2302.156158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.021712,0.008346,3129.461629,6.678625e+07,0.026608,8164.330707,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
9,2287.061567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.021072,0.011869,3133.904368,6.645119e+07,0.025972,8143.526243,,,,kw_avg_avg;LDA_03;is_weekend;data_channel_is_e...
