In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
def convert_temporal_data(dataframe, columns=('p', 'r'), seconds_per_unit=3600):
  '''
  Convert temporal variables from seconds to hours.

  Rescaling keeps the higher-order polynomial terms built from these
  columns (e.g., p^6) at a manageable magnitude; per the original note,
  without it such large predictor values become irrelevant for OLS.

  Parameters:
    dataframe: pandas DataFrame containing every column named in `columns`.
    columns: names of the columns to rescale (default: 'p' and 'r').
    seconds_per_unit: divisor applied to each column (default: 3600,
      i.e. seconds per hour).

  Returns:
    A new DataFrame with the selected columns divided by
    `seconds_per_unit`. The input frame is left untouched, so calling this
    function again on the same input (e.g., when a cell is re-run) does not
    rescale the data twice.
  '''

  converted = dataframe.copy()

  for column in columns:
    converted[column] = converted[column] / seconds_per_unit

  return converted

def read_score_distribution(csv_file):
  '''
  Load a score distribution from a CSV file.

  The file is read with the column names 'p', 'q', 'r' and 'score'
  assigned explicitly, and the result is then passed through
  convert_temporal_data before being returned.
  '''
  raw_scores = pd.read_csv(csv_file, names=['p', 'q', 'r', 'score'])

  return convert_temporal_data(raw_scores)

In [4]:
def add_quadratic_predictors(dataframe):
  '''
  Return a copy of `dataframe` extended with the degree-2 terms.

  Added columns (in this order): p2, q2, r2 (squares of p, q, r) and
  pq (the product p*q). Only the base columns 'p', 'q' and 'r' are read.

  The input frame is not modified. Working on a copy keeps cells
  re-runnable and avoids writing into a frame that may itself be a slice
  of another DataFrame.
  '''
  augmented = dataframe.copy()

  augmented[['p2', 'q2', 'r2']] = augmented[['p', 'q', 'r']]**2
  augmented['pq'] = augmented['p'] * augmented['q']

  return augmented

def add_cubic_predictors(dataframe):
  '''
  Return a copy of `dataframe` extended with the degree-3 terms.

  Added columns (in this order): p3, q3, r3 (cubes of p, q, r), then the
  mixed terms p2q (p^2 * q) and pq2 (p * q^2). Only the base columns
  'p', 'q' and 'r' are read.

  The frame passed in is not modified; the new columns are written to a
  copy, which is returned.
  '''
  augmented = dataframe.copy()

  augmented[['p3', 'q3', 'r3']] = augmented[['p', 'q', 'r']]**3
  augmented['p2q'] = augmented['p']**2 * augmented['q']
  augmented['pq2'] = augmented['p'] * augmented['q']**2

  return augmented

def add_quartic_predictiors(dataframe):
  '''
  Return a copy of `dataframe` extended with the degree-4 terms.

  Added columns (in this order): p4, q4, r4 (fourth powers of p, q, r),
  then the mixed terms p3q (p^3 * q), p2q2 (p^2 * q^2) and pq3 (p * q^3).
  Only the base columns 'p', 'q' and 'r' are read.

  The frame passed in is not modified; the new columns are written to a
  copy, which is returned.

  The function name carries a historical misspelling ("predictiors");
  it is kept so existing callers continue to work.
  '''
  augmented = dataframe.copy()

  augmented[['p4', 'q4', 'r4']] = augmented[['p', 'q', 'r']]**4
  augmented['p3q'] = augmented['p']**3 * augmented['q']
  augmented['p2q2'] = augmented['p']**2 * augmented['q']**2
  augmented['pq3'] = augmented['p'] * augmented['q']**3

  return augmented

# Correctly spelled alias, consistent with the other add_*_predictors names.
add_quartic_predictors = add_quartic_predictiors

def add_quintic_predictors(dataframe):
  '''
  Return a copy of `dataframe` extended with the degree-5 terms.

  Added columns (in this order): p5, q5, r5 (fifth powers of p, q, r),
  then the mixed terms p4q (p^4 * q), p3q2 (p^3 * q^2), p2q3 (p^2 * q^3)
  and pq4 (p * q^4). Only the base columns 'p', 'q' and 'r' are read.

  The frame passed in is not modified; the new columns are written to a
  copy, which is returned.
  '''
  augmented = dataframe.copy()

  augmented[['p5', 'q5', 'r5']] = augmented[['p', 'q', 'r']]**5
  augmented['p4q'] = augmented['p']**4 * augmented['q']
  augmented['p3q2'] = augmented['p']**3 * augmented['q']**2
  augmented['p2q3'] = augmented['p']**2 * augmented['q']**3
  augmented['pq4'] = augmented['p'] * augmented['q']**4

  return augmented

def add_sextic_predictors(dataframe):
  '''
  Return a copy of `dataframe` extended with the degree-6 terms.

  Added columns (in this order): p6, q6, r6 (sixth powers of p, q, r),
  then the mixed terms p5q (p^5 * q), p4q2 (p^4 * q^2), p3q3 (p^3 * q^3),
  p2q4 (p^2 * q^4) and pq5 (p * q^5). Only the base columns 'p', 'q' and
  'r' are read.

  The frame passed in is not modified; the new columns are written to a
  copy, which is returned.
  '''
  augmented = dataframe.copy()

  augmented[['p6', 'q6', 'r6']] = augmented[['p', 'q', 'r']]**6
  augmented['p5q'] = augmented['p']**5 * augmented['q']
  augmented['p4q2'] = augmented['p']**4 * augmented['q']**2
  augmented['p3q3'] = augmented['p']**3 * augmented['q']**3
  augmented['p2q4'] = augmented['p']**2 * augmented['q']**4
  augmented['pq5'] = augmented['p'] * augmented['q']**5

  return augmented

In [5]:
def create_quadratic_polynomial(dataframe):
  '''
  Build the degree-2 design: delegates to add_quadratic_predictors and
  returns its result.
  '''
  return add_quadratic_predictors(dataframe)

def create_cubic_polynomial(dataframe):
  '''
  Build the degree-3 design by applying the quadratic and then the cubic
  predictor steps, each one receiving the previous step's result.
  '''
  for add_terms in (add_quadratic_predictors, add_cubic_predictors):
    dataframe = add_terms(dataframe)
  return dataframe

def create_quartic_polynomial(dataframe):
  '''
  Build the degree-4 design by applying the quadratic, cubic and quartic
  predictor steps in that order, each one receiving the previous step's
  result.
  '''
  steps = (
    add_quadratic_predictors,
    add_cubic_predictors,
    add_quartic_predictiors,
  )
  for add_terms in steps:
    dataframe = add_terms(dataframe)
  return dataframe

def create_quintic_polynomial(dataframe):
  '''
  Build the degree-5 design by applying the quadratic, cubic, quartic and
  quintic predictor steps in that order, each one receiving the previous
  step's result.
  '''
  steps = (
    add_quadratic_predictors,
    add_cubic_predictors,
    add_quartic_predictiors,
    add_quintic_predictors,
  )
  for add_terms in steps:
    dataframe = add_terms(dataframe)
  return dataframe

def create_sextic_polynomial(dataframe):
  '''
  Build the degree-6 design by applying the quadratic, cubic, quartic,
  quintic and sextic predictor steps in that order, each one receiving
  the previous step's result.
  '''
  steps = (
    add_quadratic_predictors,
    add_cubic_predictors,
    add_quartic_predictiors,
    add_quintic_predictors,
    add_sextic_predictors,
  )
  for add_terms in steps:
    dataframe = add_terms(dataframe)
  return dataframe

In [6]:
def compute_vif(features):
    '''
    Compute the variance inflation factor of every column in `features`.

    Returns a DataFrame with one row per column of `features` and two
    columns: 'VIF' (the value returned by variance_inflation_factor for
    that column index) and 'feature' (the column name).

    NOTE(review): the feature matrix is passed to variance_inflation_factor
    as-is; no intercept/constant column is added here -- confirm that this
    is intended for the VIF calculation.
    '''
    design_matrix = features.values
    column_count = features.shape[1]

    scores = []
    for position in range(column_count):
        scores.append(variance_inflation_factor(design_matrix, position))

    report = pd.DataFrame()
    report['VIF'] = scores
    report['feature'] = features.columns
    return report

In [8]:
# Path (relative to the notebook's working directory) of the score CSV.
csv_file = "data/scores/sdsc-blue-final-score.csv"
# Load the CSV into a DataFrame via read_score_distribution.
raw_dist = read_score_distribution(csv_file)

In [12]:
# Base predictor columns and the regression target column.
features_label = ['p', 'q', 'r']
target_label = ["score"]

# Take explicit copies: later cells add polynomial columns to `features`,
# and writing into a bare column slice of `raw_dist` is what triggers
# pandas' SettingWithCopyWarning.
features = raw_dist[features_label].copy()
target = raw_dist[target_label].copy()
features

Unnamed: 0,p,q,r
0,0.701667,16,0.405278
1,0.035000,8,0.405833
2,0.021667,8,0.456667
3,0.045278,32,0.481389
4,0.015833,8,0.493611
...,...,...,...
3195,0.074444,8,6.079444
3196,0.105833,8,6.208611
3197,0.157500,8,6.369167
3198,0.043611,64,6.535833


In [13]:
# VIFs for the predictors expanded with the terms added by
# create_quintic_polynomial.
# Alternative kept for reference (VIFs of `features` as-is): compute_vif(features)
compute_vif(create_quintic_polynomial(features))

Unnamed: 0,VIF,feature
0,192.814003,p
1,87.039589,q
2,49.529964,r
3,9297.229322,p2
4,5533.399216,q2
5,1828.391951,r2
6,1190.362707,pq
7,72389.00299,p3
8,62516.139578,q3
9,10190.92248,r3
