In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
from zero_churn_model.imports import * # basic imports
from zero_churn_model import helpers as hp

from scipy.stats import pearsonr 
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import t 
from copy import deepcopy as copy
from tqdm import tqdm
from scipy.stats import chi2_contingency

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load Data

In [50]:
# Resolve the processed dataset's location through the project path helper and
# load it into `df`, the frame every later cell works on.
fn = 'model-data-pca-chi2-obsolete-fold-f28f32e6.parquet'
# NOTE(review): validate=False presumably skips a check inside hp.data_path
# (e.g. that the file exists) -- confirm against the helper's implementation.
fp = hp.data_path('processed', fn, validate=False)
df = pd.read_parquet(fp)
# Last expression of the cell: preview the first five rows.
df.head(5)

Unnamed: 0,y,0,1,2,3,4,5,6,7,8,...,2445,2446,2447,2448,2449,2450,2451,2452,2453,2454
0,0,0.949638,7.839033,-3.325831,1.166039,0.898431,-3.722978,-3.724379,-2.138678,1.649482,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0,-5.759383,15.966196,-3.491803,-1.913483,2.193621,-2.567054,-8.970534,5.853391,-0.845518,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0,-2.338906,-6.589302,-0.671392,9.972804,-2.382445,5.158881,-0.942877,-0.157386,0.065975,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0,5.883239,-10.595399,-7.679696,1.201918,-0.242519,1.32782,0.764475,-2.752347,0.683654,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1,-3.259075,-4.480679,5.01675,-5.977697,0.735466,-6.407818,3.70209,-1.342238,0.404511,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


# Create Dummy Features for Categorical Columns

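Note: the dummy features themselves were already created in a previously submitted assignment, so the cell below only identifies which existing columns are binary (0/1) dummy columns.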

In [51]:
def _is_binary_indicator(values):
    """Return True when the column's distinct values, once sorted, are exactly [0, 1]."""
    return np.sort(values.unique()).tolist() == [0, 1]


# Columns whose only values are 0 and 1 are treated as categorical dummy
# features; the target column 'y' is excluded from the list.
cat_cols = [
    name for name in df.columns
    if _is_binary_indicator(df[name]) and name != 'y'
]

In [52]:
# Preview the first ten rows of the columns detected as 0/1 indicators.
df[cat_cols].head(10)

Unnamed: 0,694,695,696,697,698,699,700,701,702,703,...,2445,2446,2447,2448,2449,2450,2451,2452,2453,2454
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


# Standardize Numeric Features

In [53]:
# Numeric features are every column that is neither a 0/1 dummy column nor the
# target 'y'. A set makes the membership test O(1) per column instead of
# scanning the whole cat_cols list each time.
_cat_col_set = set(cat_cols)
numeric_cols = [x for x in df.columns if x not in _cat_col_set and x != 'y']

# NOTE(review): the mean and standard deviation are computed over every row of
# df, i.e. before the train/test split further down, so test rows influence the
# scaling. Consider fitting the scaling statistics on the training rows only.
if numeric_cols:
    numeric_block = df[numeric_cols]
    col_means = numeric_block.mean()
    col_stds = numeric_block.std()
    # A constant column has std == 0 (and a single-row frame has std == NaN);
    # dividing by that would fill the column with NaN. Dividing by 1 instead
    # leaves such a column centred at exactly 0.
    safe_stds = col_stds.where(col_stds > 0, 1.0)
    # Z-score all numeric columns in one vectorized operation.
    df[numeric_cols] = (numeric_block - col_means) / safe_stds

In [54]:
# Preview the first five rows of the numeric columns after standardization.
df[numeric_cols].head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,684,685,686,687,688,689,690,691,692,693
0,0.070869,0.621334,-0.424505,0.181605,0.152176,-0.640144,-0.686168,-0.407014,0.344215,-0.175584,...,0.665383,-1.009962,0.705341,-1.191091,0.592972,-0.315849,-1.437366,-0.222335,0.102875,-0.171008
1,-0.429807,1.265507,-0.44569,-0.298016,0.371555,-0.44139,-1.652703,1.113964,-0.176443,-1.296109,...,-1.273571,0.947474,-1.798641,-1.001721,0.574378,0.429238,0.089671,0.044631,-0.883547,1.659238
2,-0.174546,-0.522279,-0.085696,1.553218,-0.403538,0.88704,-0.173713,-0.029952,0.013768,0.193253,...,0.933507,-0.103305,-0.117983,-0.734125,0.933039,-0.956443,-0.935864,-2.244506,0.752829,-0.185223
3,0.43905,-0.839809,-0.980228,0.187193,-0.041078,0.228311,0.140844,-0.523802,0.142665,-0.041964,...,-0.236249,-0.764604,-0.651457,-0.106265,0.125235,-0.265088,0.038972,0.510399,0.072987,-0.042516
4,-0.243216,-0.355146,0.640332,-0.930998,0.124573,-1.101787,0.682062,-0.255442,0.084414,0.307976,...,-0.26404,0.15812,1.022788,-0.054315,0.536816,-0.152642,0.097002,-0.174061,-0.489203,0.007813


# Split Data into Training and Test Datasets

In [57]:
# Every column except the target 'y' is used as a model input.
feature_cols = [name for name in df.columns if name != 'y']

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df['y'], **split_options
)

In [None]:
# (object to persist, output file name) pairs, written in this order.
# The target splits are wrapped in a DataFrame before being written.
outputs = [
    (X_train, 'X_train-lethal-whereas.parquet'),
    (X_test, 'X_test-lethal-whereas.parquet'),
    (pd.DataFrame(y_train), 'y_train-lethal-whereas.parquet'),
    (pd.DataFrame(y_test), 'y_test-lethal-whereas.parquet'),
]

# Resolve each destination through the project path helper and write the
# corresponding split as a parquet file.
for frame, file_name in outputs:
    fp = hp.data_path('processed', file_name, validate=False)
    frame.to_parquet(fp)