# Data Pre-processing & Feature Selection

In [1]:
###########################################################################
#
#  Copyright 2021 Google Inc.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
# This solution, including any related sample code or data, is made available 
# on an “as is,” “as available,” and “with all faults” basis, solely for 
# illustrative purposes, and without warranty or representation of any kind. 
# This solution is experimental, unsupported and provided solely for your 
# convenience. Your use of it is subject to your agreements with Google, as 
# applicable, and may constitute a beta feature as defined under those 
# agreements.  To the extent that you make any data available to Google in 
# connection with your use of the solution, you represent and warrant that you 
# have all necessary and appropriate rights, consents and permissions to permit 
# Google to use and process that data.  By using any portion of this solution, 
# you acknowledge, assume and accept all risks, known and unknown, associated 
# with its usage, including with respect to your deployment of any portion of 
# this solution in your systems, or usage in connection with your business, 
# if at all.
###########################################################################

## 0) Dependencies

In [None]:
import requests
import io
import os
import datetime
import logging

# Google credentials authentication libraries
! pip install google.colab
from google.colab import auth

!pip install --upgrade -q gspread 
import gspread

from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

# data processing libraries
import numpy as np
from numpy.core.numeric import NaN
import pandas as pd
import pandas_gbq


from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, RFECV
from sklearn.feature_selection import f_classif, f_regression, chi2
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

# modeling and metrics
from scipy.optimize import least_squares
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm


import itertools
from scipy.stats.stats import pearsonr

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# BigQuery Magics
'''
BigQuery magics are used to run BigQuery SQL queries in a python environment.
These queries can also be run in the BigQuery UI
'''

from google.cloud import bigquery
from google.cloud.bigquery import magics
magics.context.project = '' #update your project name 
client = bigquery.Client(project=magics.context.project)
%load_ext google.cloud.bigquery
bigquery.USE_LEGACY_SQL = False


## 1) Import dataset

In [None]:
%%bigquery df
SELECT *
FROM `.RBA_demo.SAMPLE_DATA`; #update with project name

In [None]:
df.head()

In [None]:
'''
Discard columns that will never enter the model before doing anything else:
e.g. geo (identical on every row of this dataset) and aggregated media
measures such as total clicks across DSPs.
'''
unused_columns = ['geo','date','x1','x2','x18','x19','x20','x21','x22','x23','x24','x25']
df.drop(columns = unused_columns, inplace = True)

In [None]:
len(df.columns)

In [None]:
df.describe()

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate way to halt "Run all" at this
# point so the summary can be reviewed before cleaning -- confirm, or remove.
stop

## 2) Data Cleaning

### 2.1) Drop or impute missing data

In [None]:
'''
Share of missing entries per column, expressed as a percentage of the row
count, displayed from the most incomplete column to the least.
'''
null_counts = df.isnull().sum()
missing_values = null_counts.mul(100).div(len(df))
missing_values.sort_values(ascending = False)

In [None]:
'''
Set a threshold for % of missing values. If the % of missing data meets or
exceeds this threshold, the column will be dropped.

Removing variables with a high % of missing data is important so as not to include
uninformative features in the model.
'''

remove_missing_vals = "Yes"
missing_value_threshold = 50

# Names of the columns removed by this cell (empty when nothing is dropped).
missing_value_threshold_cols = []

if remove_missing_vals == "Yes":
  # Collect whole column names. (`list += "x12"` would splice the string into
  # the characters 'x', '1', '2'.) Columns already absent from df are skipped
  # so that re-running the cell does not raise KeyError.
  missing_value_threshold_cols = [
      col
      for col in missing_values[missing_values >= missing_value_threshold].index
      if col in df.columns
  ]
  df.drop(columns = missing_value_threshold_cols, inplace = True)
  print('Dropped columns:', missing_value_threshold_cols)
else:
  print('No columns dropped')

print('Remaining columns:',df.columns)

In [None]:
# Imputation Method: Mean value, Median Value, Zero
imputation_method = 'Zero'

In [None]:
# Compare case-insensitively so that 'Mean value' / 'Median Value' select the
# intended strategy instead of silently falling through to zero-filling.
imputation_choice = imputation_method.strip().lower()

if imputation_choice == 'mean value':
  # numeric_only avoids a TypeError on non-numeric columns in recent pandas.
  df.fillna(df.mean(numeric_only = True), inplace = True)
elif imputation_choice == 'median value':
  df.fillna(df.median(numeric_only = True), inplace = True)
else:
  # 'Zero' and any unrecognised value: fill missing entries with 0.
  df.fillna(0, inplace = True)


## 3) Define Y (KPI column) and create initial feature set

In [None]:
df.head()

In [None]:
#Input column names for date (ex: "day") and Y (ex: "new_accounts" or "sales") 

date_col = "date" #{type: "string"}
kpi_col = "y1" #{type: "string"}


In [None]:
# Candidate features are every column except the KPI and the date column.
non_feature_cols = [kpi_col, date_col]
featureset_df = df.loc[:, ~df.columns.isin(non_feature_cols)]

In [None]:
featureset_df.head()

## 4) Visualize Series

In [None]:
# One kernel-density plot per feature. Iterate over every column: the feature
# set no longer contains date/KPI columns, so starting at index 2 would skip
# the first two features.
for feature_name in featureset_df.columns:
  plt.figure()
  # `fill` replaces the deprecated `shade` argument of seaborn.kdeplot.
  sns.kdeplot(featureset_df[feature_name], label = feature_name, fill = True)

## 5) Feature Transformation

In [None]:
#@title Input transformation type (ex: log, sqrt)
transformation = 'Square Root' #@param ["Square Root", "Mean","Log","Natural Log","None"]

In [None]:
# These transformations stabilize large values in the dataset, and allow the data to be more normal.
# Every branch builds a new frame and leaves featureset_df untouched, so the
# cell gives the same result however many times it is run.
if transformation == "Square Root":
  X_transform = featureset_df.apply(np.sqrt)
elif transformation == "Mean":
  X_transform = featureset_df/featureset_df.mean()
elif transformation == "Log":
  # Base-10 log; 1 is added because the logarithm is not defined at 0.
  X_transform = (featureset_df + 1).apply(np.log10)
elif transformation == "Natural Log":
  # Natural log; 1 is added because the logarithm is not defined at 0.
  X_transform = (featureset_df + 1).apply(np.log)
else:
  # "None" or any unrecognised value: keep the features unchanged.
  X_transform = featureset_df

### 5.1) Feature Scaling

In [None]:
'''
If you'd like to standardize your features, set the following parameter to "yes".
We have options for different scaling options: Min-Max Scaler or Standard Scaler.

If parameter is set to "No" or other value, no standardization will be performed.
'''


standardize_features = "Yes"

In [None]:
if standardize_features == "Yes":
  # Swap in StandardScaler() here for zero-mean / unit-variance scaling.
  scaler = MinMaxScaler()
  # Fit the named scaler (rather than a throwaway instance) so it stays
  # available afterwards, e.g. for scaler.inverse_transform.
  standardized_transform = scaler.fit_transform(X_transform)
  # fit_transform returns a bare array: restore the column names and the
  # original row index so later index-aligned assignments still line up.
  featureset_df = pd.DataFrame(standardized_transform,
                               columns = X_transform.columns,
                               index = X_transform.index)
else:
  # "No" or any other value: no standardization.
  featureset_df = X_transform

In [None]:
# Disabled code kept as a string literal; remove the quotes to run it.
'''
Option to review visuals of the data. After the data is standardized the distributions
may take on a more normal shape.


for i in range(0,len(featureset_df.columns)):
  plt.figure()
  sns.kdeplot(featureset_df[featureset_df.columns[i]], label = featureset_df.columns[i], fill = True)
'''

In [None]:
featureset_df.head()

## 6) Handle Multicollinearity (reduce feature set)

In [None]:
'''
Note: A max of 50 features is recommended for 24 months of data due to risk of overfitting. 

Run correlation analysis and output positive/negative coefficients above specified
threshold. Look at highly correlated variables and drop

Run variance inflation factor analysis and output results to flag multicollinearity
above specified threshold
'''

In [None]:
'''
Print a correlation heatmap to visualize correlation across feature set
'''

correl = featureset_df.corr()

# Boolean mask that is True on the diagonal and the upper triangle. It is built
# from ones rather than from the correlation values themselves, so a
# correlation of exactly 0 in the upper triangle is still hidden.
matrix = np.triu(np.ones_like(correl, dtype = bool))

# Cells where the mask is True are not drawn, leaving only the lower triangle.
sns.heatmap(correl, mask=matrix)

In [None]:
#Input correlation threshold for output (must be between 0 and 1)

THRESHOLD_1 =  .6

In [None]:
# output correlations > threshold_1 to review for data reduction

# Keep only the strict upper triangle (k = 1): each pair appears once and the
# diagonal self-correlations are excluded; everything else becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
corrs_pos = correl.where(np.triu(np.ones(correl.shape), k = 1).astype(bool))
# user defines threshold and can increase to be more conservative; stack() drops the NaN cells
corrs_pos = corrs_pos[corrs_pos > THRESHOLD_1].stack().reset_index()
# No `< 1.0` filter is needed now that the diagonal is masked, so perfectly
# correlated feature pairs stay visible in the output.
corrs_pos

In [None]:
corrs_pos.level_0.value_counts()

In [None]:
corrs_pos.level_1.value_counts()

In [None]:
#Input 2nd correlation threshold (must be between -1 and 0)
THRESHOLD_2 =  -.6 #@param {type: "number"}

In [None]:
# feature reduction 1B: output correlations < threshold_2 to review for data reduction

# Keep only the strict upper triangle (k = 1) so each pair appears once; the
# remaining cells become NaN. The builtin `bool` is used because the `np.bool`
# alias was removed from NumPy.
corrs_neg = correl.where(np.triu(np.ones(correl.shape), k = 1).astype(bool))
# user defines threshold and can decrease to be more conservative; stack() drops the NaN cells
corrs_neg = corrs_neg[corrs_neg < THRESHOLD_2].stack().reset_index()
corrs_neg

In [None]:
corrs_neg.level_0.value_counts()

In [None]:
corrs_neg.level_1.value_counts()

In [None]:
'''
Remove the highly correlated columns picked out by manually reviewing the
correlation output above.
'''
correlated_columns = ['x3','x4','x5','x6','x7','x31']
featureset_df.drop(columns = correlated_columns, inplace = True)

In [None]:
featureset_df.columns

In [None]:
# Second batch of manually chosen columns removed from the feature set
# (presumably after a further review of the remaining columns -- TODO confirm).
# In-place drop: re-running this cell raises KeyError once the columns are gone.
featureset_df.drop(columns = ['x13','x15','x17','x36'], inplace = True)

In [None]:
'''
Variance inflation factor (VIF) analysis; review values greater than 7.
Industry best practice treats values above 10 as a violation of regression
model assumptions.
'''

# Feature matrix with the constant column added by add_constant.
design_matrix = add_constant(featureset_df)

# One VIF per column of the design matrix (the added constant included).
vif_scores = [
    variance_inflation_factor(design_matrix.values, col_idx)
    for col_idx in range(design_matrix.shape[1])
]
vif = pd.Series(vif_scores, index = design_matrix.columns)
print(vif.sort_values(ascending=False))

### 6.1) Additional data reduction

In [None]:
'''
If there are still too many features, one can run a regression, analyze
the significant predictors using p-value analysis and remove any
insignificant features.
'''

In [None]:
'''

# add column of 1's to estimate intercept parameter
featureset_df = sm.add_constant(featureset_df) 

# define Y
Y = df[[kpi_col]]
X = featureset_df
# run OLS model using statsmodels package for p-values
model = sm.OLS(Y, X)
results = model.fit()
predictions = model.predict(results.params) 

print_model = results.summary()
print_model
'''

In [None]:
# Disabled code kept as a string literal; remove the quotes to run it.
'''
Set a p-value threshold. This removes features with p-values greater than the threshold.

# Input p-Value threshold
P_THRESHOLD =  .1 #@param {type: "number"}

# FINAL feature selection/data reduction
results_df = (results.summary2().tables[1])
# Exclude the intercept by name: slicing with [1:] would drop a real feature
# whenever 'const' itself is not among the significant rows.
sig_input = [name for name in results_df[results_df['P>|t|']<=P_THRESHOLD].index if name != 'const']
# NOTE(review): this selects from the raw df, not the transformed/scaled
# featureset_df the regression was fitted on -- confirm that is intended.
X_inputs = df[sig_input] 
X_inputs.columns

'''

In [None]:
'''
If there are still too many features, one can run K-Best analysis.
This selects top features based on univariate testing

k_best = SelectKBest(f_regression, k=len(X_inputs.columns))
k_best.fit_transform(X_inputs, Y)
'''

In [None]:
# Disabled code kept as a string literal; remove the quotes to run it.
'''
If there are still too many features, one can run recursive feature elimination (RFE),
which ranks features based on recursive model testing

lm = LinearRegression()
# n_features_to_select must be passed by keyword in current scikit-learn.
rfe = RFE(lm, n_features_to_select = 1)
X_rfe = rfe.fit_transform(X_inputs,Y)
lm.fit(X_rfe,Y)
'''

In [None]:
'''
Output results of optional feature selection process

# create dataframe with KBest and RFE results
opt_datared_df = pd.DataFrame({'feature': X_inputs.columns, 'p_value': k_best.pvalues_, 'rfe_rank': rfe.ranking_}).sort_values('p_value')
opt_datared_df
'''

In [None]:
'''
# Input p-Value and rank thresholds for optional data reduction
#P_THRESHOLD_OPT =  .1 #@param {type: "number"}
#RANK_THRESHOLD =  1 #@param {type: "number"}

# OPTIONAL feature selection/data reduction

#opt_reduction = opt_datared_df[((opt_datared_df['p_value']<=P_THRESHOLD_OPT) | 
                             #(opt_datared_df['rfe_rank'] <= RANK_THRESHOLD))] 

# create list of FINAL model inputs, overwriting previous input feature list.
#opt_inputs =[]   

#for index, rows in opt_reduction.iterrows(): # iterate over each row 
#    opt_inputs.append(rows.feature) 

#X_inputs = X_inputs[opt_inputs]                           
'''


## 7) Export Final Dataset

In [None]:
# Work on a copy: adding the KPI to an alias of featureset_df would leak the
# target into the feature set if any earlier cell (VIF, correlations) is re-run.
final_df = featureset_df.copy()
# Use the configured KPI column instead of a hard-coded name (index-aligned).
final_df[kpi_col] = df[kpi_col]
pandas_gbq.to_gbq(final_df, destination_table = 'RBA_demo.cleaned_data_', project_id='', if_exists = 'replace') #update project name

## 8) Additional Considerations

In [None]:
'''
Time lags: does media on day 1 impact conversions on day 2 (and so on)

Diminishing returns: large increases in spend do not necessarily result in 1:1
increases in conversions

External factors: are there peak periods that should be accounted for? (i.e.
flags for promotions, creatives, price changes in market, etc.)

L1 regularization: a systematic way to remove insignificant inputs and prevents 
overfitting


The following code is in progress and still requires testing, but can be used
to add lag and decay features.
'''

In [None]:
'''
After adding in lag, decay, etc. features will need to be re-examined for 
multicollinearity and potential feature reduction prior to modeling.
'''

'''
#This function creates the different combinations of Lag, Decay, and Curve
def Transformation(dataframe,x):
    lag = []
    for i in range(0, 5, 1):
        data = dataframe[x].shift(i).to_frame()
        data.columns = [col_name+'Lag'+str(i)for col_name in data.columns]
        # store DataFrame in list
        lag.append(data)
    # see pd.concat documentation for more info
    lag = pd.concat(lag,axis=1)
    lag=lag.fillna(0)
    Alpha = []
    for i in np.linspace(0.6,1.0,num=5):
        data = pow(lag,i)
        data.columns = [col_name+'Alpha'+str(i)for col_name in data.columns]
    # store DataFrame in list
        Alpha.append(data)
    # see pd.concat documentation for more info
    Alpha = pd.concat(Alpha,axis=1)  
    Decay=[]
    #j = 0
    for percent in np.linspace(0.6,1.0,5):
        data = Alpha.copy()
        data.columns = [col_name+'Decay'+str(percent)for col_name in data.columns]
        for i in range(0,len(Alpha)):
            for j in range(0,len(Alpha.columns)):
            #data = data + data.shift(1)*(1-i)
                if(i == 0):
                    data.iloc[i, j] = data.iloc[i, j]*percent
                else:
                    data.iloc[i, j] = data.iloc[i - 1, j] *(1-percent) + data.iloc[i,j] * percent
        Decay.append(data)
        j = j + 1
    Decay = pd.concat(Decay,axis=1)
    
    return Decay
'''