# Feature Engineering and Selection

1. Prep data / Feature Creation
1. Recode the Categorical Variable - weighted (Danielle's method) (Numeric Variables + Danielle's Features)
    1. Load `train_1000_toy.parquet` for further use 
1. Recode the Categorical Variable - weighted (YoungKoung's method) (Numeric Variables + YoungKoung's Features)

* The purpose of this notebook is to create a toy data set with exploratory features that can then be saved as a parquet file and read back in for exploratory model building. 

* Run once start to finish to create the toy dataset parquet file. This code is not optimized and is a bit brittle. 

* Had to create new parquet files. Overwriting had some problems



## Setup

In [1]:
# imports
import re
import ast
import time
import shutil
import os
import copy
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import display

from pyspark.sql import Window, Row
from pyspark.sql.functions import col, desc, mean, isnan, when, count, isnull, rank, sum, countDistinct, avg, stddev, round, lit, rand, monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql.types import LongType

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler, Imputer

from pyspark.mllib.stat import Statistics
from pyspark.ml.stat import Correlation

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
# `!pwd` is an IPython shell capture: it returns the command's output as a
# list of lines, so element 0 is the working-directory string.
PWD = !pwd
PWD = PWD[0]

In [4]:
# Create (or reuse) the Spark session used by every cell in this notebook.
from pyspark.sql import SparkSession

app_name = "final_project"
master = "local[*]"

# The GCS connector jar is registered on the session; presumably this is what
# lets the gs:// paths read below resolve (verify the jar path per machine).
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .config("spark.jars", "/path/to/gcs-connector-hadoop2-latest.jar")
    .getOrCreate()
)
sc = spark.sparkContext
spark

In [5]:
# # start Spark Session
# from pyspark.sql import SparkSession
# app_name = "final_project"
# master = "local[*]"
# spark = SparkSession\
#         .builder\
#         .appName(app_name)\
#         .master(master)\
#         .getOrCreate()
# sc = spark.sparkContext
# spark

REMINDER: If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

In [6]:
# Re-bind the SparkContext handle; this repeats the assignment made in the
# session-creation cell.
sc = spark.sparkContext

In [91]:
# Load the training subsets of each size from the GCS bucket.
# Most of the EDA focuses on the 10,000-row subset.
start = time.time()
parquet_reader = spark.read
train_10_million_df = parquet_reader.parquet("gs://gsod_23456/data/train_10_million.parquet")
train_5_million_df = parquet_reader.parquet("gs://gsod_23456/data/train_5_million.parquet")
train_1_million_df = parquet_reader.parquet("gs://gsod_23456/data/train_1_million.parquet")
train_100000_df = parquet_reader.parquet("gs://gsod_23456/data/train_100000.parquet")
train_10000_df = parquet_reader.parquet("gs://gsod_23456/data/train_10000.parquet")
train_1000_df = parquet_reader.parquet("gs://gsod_23456/data/train_1000.parquet")
elapsed = time.time() - start
print(f'... completed job in {elapsed} seconds.')
# ... completed job in 8.75810194015503 seconds.

... completed job in 3.145395517349243 seconds.


In [12]:
# # read parquet file
# # will focus on 10,000 for most of our EDA
# start = time.time()
# train_1_million_df = spark.read.parquet("data/train_1_million.parquet")
# train_100000_df    = spark.read.parquet("data/train_100000.parquet")
# train_10000_df     = spark.read.parquet("data/train_10000.parquet")
# train_1000_df      = spark.read.parquet("data/train_1000.parquet")
# print(f'... completed job in {time.time() - start} seconds.')

In [164]:
# Load train.parquet; the numeric and categorical transformations below all
# start from this DataFrame. Spark DataFrames are immutable, so later cells
# can alias `df` (df_cat, df_cat_2) without copying it.
df = spark.read.parquet("gs://gsod_23456/data/train.parquet")

In [165]:
# Inspect the first row to check column names, order and value types.
df.take(1)

[Row(_c1=None, _c2=139, _c3=None, _c4=2, _c5=4828, _c6=28, _c7=11, _c8=2, _c9=28, _c10=None, _c11=1, _c12=0, _c13=2, _c14='05db9164', _c15='d833535f', _c16='77f2f2e5', _c17='d16679b9', _c18='4cf72387', _c19='fe6b92e5', _c20='9ea2e0f0', _c21='0b153874', _c23='43b7a3fa', _c24='2dad6ba2', _c25='9f32b866', _c26='47cb697a', _c27='07d13a8f', _c28='943169c2', _c29='31ca40b6', _c30='e5ba7672', _c31='281769c2', _c32=None, _c33=None, _c34='dfcfc3fa', _c35='c9d4222a', _c36='32c7478e', _c37='aee52b6f', _c38=None, _c39=None, _c0=0, _c22='a73ee510')]

<br>

## Numeric Transformations

In [166]:
# Cast the 13 numeric features (_c1 .. _c13) to float.
# Columns are selected by name rather than by position: the df.take(1) output
# lists _c0 and _c22 last in the schema, so the positional slice
# df.columns[1:14] would skip _c1 and pick up the string column _c14 instead.
numeric_cols = [f"_c{k}" for k in range(1, 14)]
df_num = df.select(*(col(name).cast("float").alias(name) for name in numeric_cols))

In [167]:
# Replace nulls in every numeric column with that column's mean; the imputed
# values are written back under the same column names.
num_cols = list(df_num.columns)
imputer = Imputer(strategy="mean", inputCols=num_cols, outputCols=list(num_cols))
num_imputer_model = imputer.fit(df_num)
df_num_mean = num_imputer_model.transform(df_num)

In [168]:
# Alternative imputation: replace nulls in the numeric columns with 0.
df_num_zero = df_num.fillna(0)

<br>

## Categorical Transformations

### Recode Categories as Average of Dependent Variable

In [169]:
# Separate name for the categorical pipeline: the join loop further down
# rebinds df_cat, while df itself stays as loaded.
df_cat = df

In [170]:
start = time.time()

# Build one lookup table per categorical column, mapping each category value
# to the count-weighted mean of _c0 for that value (column "<name>_wv").
# NOTE(review): columns[14:] is positional. The df.take(1) output lists _c0 and
# _c22 last, so this slice likely includes them and omits _c14 - confirm
# against df_cat.columns.
frames = {}
for cat_col in df_cat.columns[14:]:
    # Row count for every (category value, label) pair.
    label_counts = df_cat.groupby(cat_col, '_c0').count().sort(desc('count'))
    weighted_total = sum(label_counts['_c0'] * label_counts['count'])
    row_total = sum(label_counts['count'])
    frames[f'frame{cat_col}'] = (
        label_counts.groupBy(cat_col)
        .agg((weighted_total / row_total).alias(cat_col + "_wv"))
        .sort(cat_col)
    )

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 1.5797109603881836 seconds. 10000
# ... completed job in 1.22529935836792 seconds.  100000
# ... completed job in 1.1574714183807373 seconds. 1mil
# ... completed job in 1.1070666313171387 seconds. 5 mil
# ... completed job in 1.5035927295684814 seconds. 10 mil

... completed job in 1.1267173290252686 seconds.


In [171]:
# Display the lookup table built for one column (_c15) as a sanity check.
start = time.time()
example_frame = frames['frame_c15']
example_frame.show()
print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 4.743180513381958 seconds. 10000
# ... completed job in 1.9786813259124756 seconds.100000
# ... completed job in 3.5676050186157227 seconds. 1 mil
# ... completed job in 3.299978494644165 seconds. 5 mil
# ... completed job in 3.268929958343506 seconds. 10 mil
# ... completed job in 6.468371868133545 seconds. full

+--------+-------------------+
|    _c15|            _c15_wv|
+--------+-------------------+
|00ac063c|0.16605466628739882|
|014e4174| 0.3184774753071549|
|016cbb4f|0.15769017212659633|
|01d108a8|0.38713910761154857|
|023a27f8|0.22647420834732013|
|028bd518|0.12851043643263757|
|02de4366|  0.391008174386921|
|0363d860|0.24634070383058237|
|038ac0e2| 0.3299492385786802|
|04236da6| 0.2510460251046025|
|04440d29|0.17782948313112848|
|0468d672|0.15128299330862008|
|04e09220|0.41324485098325126|
|051a26e5|0.32195599758890897|
|053c35c0| 0.2504708097928437|
|06174070| 0.2680912712785223|
|064c8f31|0.31504843079205247|
|069b6d24|0.16120644825793032|
|076c38e2|0.17625899280575538|
|083aa75b|0.10343163862251975|
+--------+-------------------+
only showing top 20 rows

... completed job in 6.468371868133545 seconds.


In [172]:
start = time.time()

# Snapshot the categorical column names before the joins start appending
# "_wv" columns to df_cat.
recode_cols = df_cat.columns[14:]

# Left-join each lookup table onto df_cat by its category column.
for cat_col in recode_cols:
    lookup = frames['frame' + cat_col]
    df_cat = df_cat.join(lookup, on=cat_col, how="left")
print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 4.176668167114258 seconds. 10000
# ... completed job in 3.8941476345062256 seconds.100000
# ... completed job in 3.8023264408111572 seconds. 1 mi
# ... completed job in 3.7413086891174316 seconds. 5 mil
# ... completed job in 3.768989086151123 seconds. 10 mil

... completed job in 3.7436633110046387 seconds.


In [173]:
# only keeping the weighted value categories
# The source row shows 40 columns (_c0 .. _c39), so the joined "_wv" columns
# are expected to start at position 40.
# NOTE(review): confirm with df_cat.columns before relying on this offset.
df_cat_pre = df_cat.select(df_cat.columns[40:])

In [174]:
start = time.time()

# Fill nulls in each weighted-value column with that column's mean, writing
# the result back under the same column names.
wv_cols = list(df_cat_pre.columns)
imputer = Imputer(strategy="mean", inputCols=wv_cols, outputCols=list(wv_cols))
wv_imputer_model = imputer.fit(df_cat_pre)
df_cat_wv = wv_imputer_model.transform(df_cat_pre)

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 37.89948034286499 seconds. 10000
# ... completed job in 32.890904664993286 seconds.100000
# ... completed job in 38.56995725631714 seconds. 1 mil
# ... completed job in 317.81101274490356 seconds. 5 mil
# ... completed job in 422.4889533519745 seconds. 10 mil 
# ... completed job in 1770.5356452465057 seconds. 29.5 minutes for full

... completed job in 1770.5356452465057 seconds.


<br>

### Recode Categories as High/Low Buckets of the Dependent Variable Mean

In [175]:
# Separate name for the H/L bucket pipeline: the join loop further down
# rebinds df_cat_2, while df itself stays as loaded.
df_cat_2 = df

In [176]:
start = time.time()

# For each categorical column, label every category value 'H' when the mean
# of _c0 within that value exceeds 0.2, and 'L' otherwise ("<name>_wgt").
# NOTE(review): columns[14:] is positional. The df.take(1) output lists _c0 and
# _c22 last, so this slice likely includes them and omits _c14 - confirm
# against df_cat_2.columns.
frames = {}
for cat_col in df_cat_2.columns[14:]:
    rate_name = cat_col + "_p"
    bucket_name = cat_col + '_wgt'
    rates = (
        df_cat_2.groupBy(cat_col)
        .agg(mean(df_cat_2['_c0']).alias(rate_name))
        .sort(desc(rate_name))
    )
    bucketed = rates.withColumn(
        bucket_name, when(col(rate_name) > 0.2, 'H').otherwise('L')
    )
    frames[f'frame{cat_col}'] = bucketed.select([cat_col, bucket_name])

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 1.0335583686828613 seconds. 10000
# ... completed job in 0.9747071266174316 seconds. 100000
# ... completed job in 0.9798829555511475 seconds. 1 mil
# ... completed job in 1.001121997833252 seconds.  5 mil
# ... completed job in 1.0045771598815918 seconds. 10 mil
# ... completed job in 1.378826379776001 seconds. full

... completed job in 1.378826379776001 seconds.


In [177]:
# Display the H/L lookup table built for one column (_c30) as a sanity check.
start = time.time()
example_frame = frames['frame_c30']
example_frame.show()
print(f'... completed job in {time.time() - start} seconds.')

+--------+--------+
|    _c30|_c30_wgt|
+--------+--------+
|8efede7f|       H|
|e5ba7672|       H|
|27c07bd6|       H|
|3486227d|       H|
|07c540c4|       H|
|d4bb7bd8|       L|
|1e88c74f|       L|
|776ce399|       L|
|2005abd1|       L|
|af5d780c|       L|
+--------+--------+

... completed job in 12.459106683731079 seconds.


In [178]:
start = time.time()

# Snapshot the categorical column names before the joins start appending
# "_wgt" columns to df_cat_2.
recode_cols = df_cat_2.columns[14:]

# Left-join each H/L lookup table onto df_cat_2 by its category column.
for cat_col in recode_cols:
    lookup = frames['frame' + cat_col]
    df_cat_2 = df_cat_2.join(lookup, on=cat_col, how="left")
print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 2.4024016857147217 seconds. 10000
# ... completed job in 2.331226348876953 seconds.  100000
# ... completed job in 2.632216215133667 seconds. 5 mil
# ... completed job in 2.607677459716797 seconds. 10 mil
# ... completed job in 3.0513346195220947 seconds. full

... completed job in 3.0513346195220947 seconds.


In [179]:
# only keeping the transformed categories
# The source row shows 40 columns (_c0 .. _c39), so the joined "_wgt" columns
# are expected to start at position 40.
# NOTE(review): confirm with df_cat_2.columns before relying on this offset.
df_cat_wgt = df_cat_2.select(df_cat_2.columns[40:])

In [180]:
# change null to 'M' category
def null_as_Missing(x):
    """Return a Column equal to column `x` with nulls replaced by 'M'.

    Args:
        x: name of the column to clean.

    Returns:
        A pyspark Column holding the original value where it is not null
        and the literal 'M' otherwise.
    """
    return when(col(x).isNotNull(), col(x)).otherwise('M')


# df_cat_wgt was built by selecting only the appended "_wgt" columns, so every
# one of its columns needs the null -> 'M' treatment. Slicing with [14:] here
# would skip the first 14 of them and leave their nulls in place.
for i in df_cat_wgt.columns:
    df_cat_wgt = df_cat_wgt.withColumn(i, null_as_Missing(i))

<br>

# Combining Files

## Add indices to each dataframe for sorting (come back)

In [181]:
# Tag every frame with a generated row id so the frames can be joined on it.
# NOTE(review): monotonically_increasing_id() derives its values from the
# partition layout, so the same id is not guaranteed to denote the same source
# row in all four frames (the categorical frames went through many joins).
# Confirm row alignment before trusting joins on "index".
row_id = monotonically_increasing_id()
df_num_zero = df_num_zero.withColumn("index", row_id)
df_num_mean = df_num_mean.withColumn("index", row_id)
df_cat_wv = df_cat_wv.withColumn("index", row_id)
df_cat_wgt = df_cat_wgt.withColumn("index", row_id)

In [182]:
start = time.time()

# Pair each numeric variant (zero- / mean-imputed) with each categorical
# encoding (weighted value / H-L-M bucket) through the "index" column.
index_join = {"on": "index", "how": "left"}
df_zero_wv = df_num_zero.join(df_cat_wv, **index_join)
df_mean_wv = df_num_mean.join(df_cat_wv, **index_join)
df_zero_wgt = df_num_zero.join(df_cat_wgt, **index_join)
df_mean_wgt = df_num_mean.join(df_cat_wgt, **index_join)

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 1.1673033237457275 seconds. 10000
# ... completed job in 1.0087032318115234 seconds. 100000
# .. completed job in 0.986668586730957 seconds. 1 mil
# ... completed job in 1.0337064266204834 seconds. 5 mil
#  ... completed job in 1.0097198486328125 seconds. 10 mil
# ... completed job in 1.0341036319732666 seconds.

... completed job in 1.0341036319732666 seconds.


<br>

## Parquet all of the feature files

In [183]:
start = time.time()
directory_name="gs://gsod_23456/data/df_zero_wv_full.parquet"

# Persist the zero-imputed numeric + weighted-value categorical feature set.
# - The frame written is df_zero_wv; this cell previously wrote the raw `df`
#   under this feature-set name.
# - mode("overwrite") replaces any earlier output. os.path.exists and
#   shutil.rmtree only see the local filesystem, so they never removed the
#   gs:// directory.
# - No partitionBy("_c0", "_c22"): df_zero_wv is built from the selected
#   numeric columns and the "_wv" columns, so it does not carry those two.
# NOTE(review): the label column _c0 appears not to be part of this frame -
# confirm before using the file for modelling.
df_zero_wv.write.mode("overwrite").parquet(directory_name)

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 3.9202239513397217 seconds. 10000
# ... completed job in 3.2623648643493652 seconds. 100000
# ... completed job in 15.71683955192566 seconds. 1 mil
# ... completed job in 34.01933479309082 seconds. 5 mil
# ... completed job in 44.703779220581055 seconds. 10 mil
# ... completed job in 67.96506094932556 seconds full

... completed job in 67.96506094932556 seconds.


In [184]:
start = time.time()
directory_name="gs://gsod_23456/data/df_mean_wv_full.parquet"

# Persist the mean-imputed numeric + weighted-value categorical feature set.
# - The frame written is df_mean_wv; this cell previously wrote the raw `df`
#   under this feature-set name.
# - mode("overwrite") replaces any earlier output. os.path.exists and
#   shutil.rmtree only see the local filesystem, so they never removed the
#   gs:// directory.
# - No partitionBy("_c0", "_c22"): df_mean_wv is built from the selected
#   numeric columns and the "_wv" columns, so it does not carry those two.
# NOTE(review): the label column _c0 appears not to be part of this frame -
# confirm before using the file for modelling.
df_mean_wv.write.mode("overwrite").parquet(directory_name)

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 2.379305601119995 seconds. 10000
# ... completed job in 9.892587184906006 seconds. 100000
# ... completed job in 13.134225845336914 seconds.1 mil
# ... completed job in 29.67958402633667 seconds. 5 mil
# ... completed job in 29.74158763885498 seconds. 10 mil
# ... completed job in 42.87833905220032 seconds. fulls

... completed job in 42.87833905220032 seconds.


In [185]:
start = time.time()
directory_name="gs://gsod_23456/data/df_zero_wgt_full.parquet"

# Persist the zero-imputed numeric + H/L/M bucket categorical feature set.
# - The frame written is df_zero_wgt; this cell previously wrote the raw `df`
#   under this feature-set name.
# - mode("overwrite") replaces any earlier output. os.path.exists and
#   shutil.rmtree only see the local filesystem, so they never removed the
#   gs:// directory.
# - No partitionBy("_c0", "_c22"): df_zero_wgt is built from the selected
#   numeric columns and the "_wgt" columns, so it does not carry those two.
# NOTE(review): the label column _c0 appears not to be part of this frame -
# confirm before using the file for modelling.
df_zero_wgt.write.mode("overwrite").parquet(directory_name)

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 1.602597713470459 seconds 10000.
# ... completed job in 15.576507091522217 seconds100000
# ... completed job in 9.965850353240967 seconds. 1 mil
# ... completed job in 25.147286891937256 seconds. 5 mil
# ... completed job in 24.53834342956543 seconds.10 mil
# ... completed job in 44.6361300945282 seconds. full

... completed job in 44.6361300945282 seconds.


In [186]:
start = time.time()
directory_name="gs://gsod_23456/data/df_mean_wgt_full.parquet"

# Persist the mean-imputed numeric + H/L/M bucket categorical feature set.
# - The frame written is df_mean_wgt; this cell previously wrote the raw `df`
#   under this feature-set name.
# - mode("overwrite") replaces any earlier output. os.path.exists and
#   shutil.rmtree only see the local filesystem, so they never removed the
#   gs:// directory.
# - No partitionBy("_c0", "_c22"): df_mean_wgt is built from the selected
#   numeric columns and the "_wgt" columns, so it does not carry those two.
# NOTE(review): the label column _c0 appears not to be part of this frame -
# confirm before using the file for modelling.
df_mean_wgt.write.mode("overwrite").parquet(directory_name)

print(f'... completed job in {time.time() - start} seconds.')
# ... completed job in 0.9980945587158203 seconds. 10000
# ... completed job in 17.89224362373352 seconds.  101000
# ... completed job in 11.039973020553589 seconds. 1mil
# ... completed job in 24.255266427993774 seconds. 5 mil
# ... completed job in 24.572181463241577 seconds. 10 mil
# ... completed job in 41.63871169090271 seconds. full

... completed job in 41.63871169090271 seconds.
