# w261 Final Project - Feature Engineering 

### Goal of analysis

Go step-by-step from EDA to feature engineering process. Use techniques of one-hot encoding, assemblers and vectorizers to create feature engineered dataset to be used in algorithm implementation.

In [3]:
! pip install pyarrow



In [4]:
# General tools & operations libraries
import re
import ast
import time
import csv
import itertools

# Mathematical operations and dataframes libraries
import numpy as np
import pandas as pd

# Plotting and visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Parquet libraries
import pyarrow as pa
import pyarrow.parquet as pq

# PySpark libraries
from pyspark.sql import SQLContext
#from pyspark.sql import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.conf import SparkConf

from pyspark.sql.functions import lit, when, col, approx_count_distinct, udf, log, exp, abs, mean
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, OneHotEncoder, OneHotEncoderEstimator, VectorAssembler, FeatureHasher

#### Set parameters and Spark configurations

In [5]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [6]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]

In [7]:
# assign parameters
!BUCKET=danielalvarez_w261projects

In [8]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "finalproject_notebook"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [9]:
# Spark configuration Information
for object in sc.getConf().getAll():
    print(object)

('spark.app.id', 'local-1575999037643')
('spark.rdd.compress', 'True')
('spark.serializer.objectStreamReset', '100')
('spark.master', 'local[*]')
('spark.executor.id', 'driver')
('spark.submit.deployMode', 'client')
('spark.app.name', 'finalproject_notebook')
('spark.ui.showConsoleProgress', 'true')
('spark.driver.host', 'docker.w261')
('spark.driver.port', '37599')


In [10]:
spark

# Exploratory Data Analysis

Determine 2-3 relevant EDA tasks that will help you make decisions about how you implement the algorithm to be scalable. Discuss any challenges that you anticipate based on the EDA you perform.

### Load dataset

Dataset represents 0.1% of the raw `train.txt` dataset.

In [11]:
#!cat train.txt | awk 'BEGIN {srand()} !/^$/ { if (rand() <= .001) print $0}' > data/sample.txt
#!gzip -cd data/dac.tar.gz | awk 'BEGIN {srand()} !/^$/ { if (rand() <= .001) print $0}' > data/sample.txt

Impose schema structure. 

The 13th variable is a numeric (`n13`), 14th variable is categorical (`cat14`)

In [12]:
# the 13th variable is a numeric (`n13`), 14th variable is categorical (`cat14`)
schema = StructType([
    StructField('y', IntegerType()), StructField('n1', IntegerType()),
    StructField('n2', IntegerType()), StructField('n3', IntegerType()),
    StructField('n4', IntegerType()), StructField('n5', LongType()),
    StructField('n6', IntegerType()), StructField('n7', IntegerType()),
    StructField('n8', IntegerType()), StructField('n9', IntegerType()),
    StructField('n10', IntegerType()), StructField('n11', IntegerType()),
    StructField('n12', IntegerType()), StructField('n13', IntegerType()), 
    StructField('cat14', StringType()), StructField('cat15', StringType()),
    StructField('cat16', StringType()), StructField('cat17', StringType()),
    StructField('cat18', StringType()), StructField('cat19', StringType()),
    StructField('cat20', StringType()), StructField('cat21', StringType()),
    StructField('cat22', StringType()), StructField('cat23', StringType()),
    StructField('cat24', StringType()), StructField('cat25', StringType()),
    StructField('cat26', StringType()), StructField('cat27', StringType()),
    StructField('cat28', StringType()), StructField('cat29', StringType()),
    StructField('cat30', StringType()), StructField('cat31', StringType()),
    StructField('cat32', StringType()), StructField('cat33', StringType()),
    StructField('cat34', StringType()), StructField('cat35', StringType()),
    StructField('cat36', StringType()), StructField('cat37', StringType()),
    StructField('cat38', StringType()), StructField('cat39', StringType()) ])

Create Spark Dataframe

In [13]:
start = time.time()
print('Creating dataframe..')
df = spark.read.load("data/sample.txt", format='csv', sep='\t', header='false', schema=schema)
print(f"... completed job in {time.time() - start} seconds")

Creating dataframe..
... completed job in 1.8539607524871826 seconds


Show the first 5 rows of selected columns

In [14]:
print(df.select('y','n1','n12','n13','cat14','cat39').show(n=5))

+---+----+----+---+--------+--------+
|  y|  n1| n12|n13|   cat14|   cat39|
+---+----+----+---+--------+--------+
|  0|   1|null|  2|05db9164|    null|
|  0|null|null|  1|05db9164|553d46e8|
|  0|   1|null|  2|be589b51|64f08cc6|
|  0|   0|null|  9|05db9164|    null|
|  0|   0|   0| 24|05db9164|    null|
+---+----+----+---+--------+--------+
only showing top 5 rows

None


In [15]:
# Count the number of rows
df.count()

46048

In [16]:
df.head(5)

[Row(y=0, n1=1, n2=2, n3=2, n4=2, n5=292, n6=2, n7=1, n8=2, n9=2, n10=1, n11=1, n12=None, n13=2, cat14='05db9164', cat15='0a519c5c', cat16='b00d1501', cat17='d16679b9', cat18='25c83c98', cat19='7e0ccccf', cat20='1683df22', cat21='0b153874', cat22='a73ee510', cat23='3b08e48b', cat24='89073265', cat25='e0d76380', cat26='8b266858', cat27='b28479f6', cat28='b760dcb7', cat29='1203a270', cat30='d4bb7bd8', cat31='2efa89c6', cat32=None, cat33=None, cat34='73d06dde', cat35=None, cat36='3a171ecb', cat37='aee52b6f', cat38=None, cat39=None),
 Row(y=0, n1=None, n2=-1, n3=86, n4=1, n5=6147, n6=875, n7=3, n8=1, n9=96, n10=None, n11=1, n12=None, n13=1, cat14='05db9164', cat15='4e8d18ed', cat16='a415cb70', cat17='554f4454', cat18='25c83c98', cat19='13718bbd', cat20='2045039f', cat21='0b153874', cat22='a73ee510', cat23='3b08e48b', cat24='5f8383cb', cat25='ef9491f3', cat26='86684160', cat27='1adce6ef', cat28='bbee52f9', cat29='90e3c135', cat30='e5ba7672', cat31='47e4d79e', cat32='9437f62f', cat33='a458ea

### Write dataframe to parquet file format

In [17]:
start = time.time()
print('Writing dataframe to parquet format..')

df.write.parquet('data/df.parquet', compression='snappy', mode='overwrite')
#df.write.parquet(OUT_FILES, compression='snappy', mode='overwrite')

print(f"... completed job in {time.time() - start} seconds")

Writing dataframe to parquet format..
... completed job in 2.378223180770874 seconds


### Read in parquet files

In [18]:
df_pq = spark.read.load('data/df.parquet')

In [19]:
# count the number of rows
print(df_pq.count())

# perform an assert to check number of rows matches before and after parquet conversion
print(df_pq.count() == df.count())

46048
True


#### Read a parquet file as a pandas dataframe

For subsequent analysis on the algorithm explanation (Section 2) and EDA (Section 3) we will use the dataframe created from a parquet partition.

In [20]:
#df_sample = pd.read_parquet('data/df.parquet/part-00000-a472c6d5-727b-4d11-b306-3bbb786d2b5b-c000.snappy.parquet', engine='pyarrow')

### Exploratory analysis

Count distinct values for each of the features

In [21]:
vars = ['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13',
        'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 
        'cat23', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30',
        'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39']

In [22]:
for v in vars:
    distinct_var = df_pq.select(v).distinct().count()
    print('Distinct values in {} column: {}'.format(str(v),distinct_var))

Distinct values in n1 column: 112
Distinct values in n2 column: 1920
Distinct values in n3 column: 442
Distinct values in n4 column: 85
Distinct values in n5 column: 15280
Distinct values in n6 column: 1401
Distinct values in n7 column: 492
Distinct values in n8 column: 67
Distinct values in n9 column: 1288
Distinct values in n10 column: 10
Distinct values in n11 column: 87
Distinct values in n12 column: 61
Distinct values in n13 column: 134
Distinct values in cat14 column: 361
Distinct values in cat15 column: 493
Distinct values in cat16 column: 23415
Distinct values in cat17 column: 14183
Distinct values in cat18 column: 102
Distinct values in cat19 column: 11
Distinct values in cat20 column: 6375
Distinct values in cat21 column: 180
Distinct values in cat22 column: 3
Distinct values in cat23 column: 8075
Distinct values in cat23 column: 8075
Distinct values in cat24 column: 3407
Distinct values in cat25 column: 21722
Distinct values in cat26 column: 2612
Distinct values in cat27 col

The distinct count analysis shows that the features `n5` , `cat16`, `cat17`, `cat25`, `cat29` and `cat34`, each have a large number of unique values (over 10K unique values). These values will be under-represented occurrences in the dataset requiring feature engineering.

Determine the number of distinct values in the dataset

In [23]:
dist_vals = df_pq.agg(*[approx_count_distinct(c).alias(c) for c in df_pq.columns])\
                .rdd.flatMap(lambda x: x)\
                .sum()

print(f'Approximate distinct values in the dataset - {dist_vals}')

Approximate distinct values in the dataset - 164840


In [24]:
# show distinct values for cat17 
print('distinct categories in cat17 column')
dist_cat17 = df_pq.select("cat17").distinct().count()
print(dist_cat17)

distinct categories in cat17 column
14183


In [25]:
# show distinct values for n1
print('distinct categories in n1 column')
dist_n1 = df_pq.select("n1").distinct().count()
print(dist_n1)

distinct categories in n1 column
112


In [26]:
# Examine first 5 rows of parquet file
df_pq.head(5)

[Row(y=0, n1=1, n2=1, n3=5, n4=3, n5=375, n6=39, n7=11, n8=37, n9=106, n10=1, n11=3, n12=None, n13=3, cat14='5a9ed9b0', cat15='89ddfee8', cat16='2655adf1', cat17='c6b0e462', cat18='25c83c98', cat19='3bf701e7', cat20='82ffe275', cat21='0b153874', cat22='a73ee510', cat23='39da7128', cat24='e4c6cf60', cat25='266ce9dd', cat26='c4c46dce', cat27='1adce6ef', cat28='c77661b0', cat29='1ac05730', cat30='e5ba7672', cat31='bf27d32b', cat32='3014a4b1', cat33='5840adea', cat34='3b57a504', cat35='ad3062eb', cat36='bcdee96c', cat37='d2e445d0', cat38='f0f449dd', cat39='96d4bf8f'),
 Row(y=0, n1=None, n2=5, n3=32, n4=2, n5=63925, n6=None, n7=0, n8=4, n9=2, n10=None, n11=0, n12=None, n13=2, cat14='05db9164', cat15='a07503cc', cat16='c8b9f273', cat17='13508380', cat18='4cf72387', cat19='fbad5c96', cat20='08a6c211', cat21='5b392875', cat22='7cc72ec2', cat23='00f2b452', cat24='41b3f655', cat25='5938b690', cat26='ce5114a2', cat27='07d13a8f', cat28='77660bba', cat29='5ba2964d', cat30='3486227d', cat31='912c7e2

## Feature engineering

#### Apply transformations to the variables

Set a criteria to filter our values of features with a count less than a determined threshold as discussed in the de Wit (2014) paper

In [27]:
#Category count threshold 
# Set threshold equal to 10,000
THRESHOLD = 500

In [28]:
target_col = 'y'
#numeric_columns = [f'_c{i}' for i in range(1, 13)]
#category_columns = [f'_c{i}' for i in range(13, 30)]

# Column names which will be transformed
category_columns = ['cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22',
           'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 
           'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39']

numeric_columns = ['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
           'n8', 'n9', 'n10', 'n11', 'n12', 'n13']

#### String Indexer

In [29]:
# StringIndexer requires string not boolean type
df_pq = df_pq.withColumn(target_col, col(target_col).cast('string'))

print(f' Total number of Rows = {df_pq.count()}')

 Total number of Rows = 46048


#### Define transformation function

Apply a function to filter each feature to contain unique values each with a threshold number of counts (filter out low occurrence values). For features with over a threshold number of unique values, we decide to create a stash variable that comprises the least represented values for each original feature.

In [30]:
def transform_str_col(df, cat_name):

    df_uniq_counts = df.groupBy(cat_name).count()

    # get values that occur above the threshold and broadcast it
    keep_vars = sc.broadcast(df_uniq_counts.filter(df_uniq_counts['count'] > THRESHOLD)
                             .select(df_uniq_counts[cat_name])
                             .rdd.flatMap(lambda x: x).collect())

    # broadcast the value to replace the low occurance values
    replace_val = sc.broadcast('stash_' + str(cat_name))

    # name the new column
    cat_t = str(cat_name) + '_t'

    df = df.withColumn(cat_t, when(col(cat_name).isin(
        keep_vars.value), col(cat_name)).otherwise(lit(replace_val.value)))
    df = df.drop(cat_name)
    return df


tot_time = 0
for c in category_columns:

    start = time.time()
    print(f'Transforming Categorical column.. {c}')
    df_pq = transform_str_col(df_pq, c)
    time_taken = time.time() - start
    print(f"... completed job in {time_taken} seconds")
    tot_time += time_taken
print(f'total time taken = {tot_time}')

df_pq.cache()

# transformed category column names
cat_cols = [f'{col}_t' for col in category_columns]

# indexed category column names
cat_str_indx = [f'{col}_Indx' for col in cat_cols]

# vectorized category column names
cat_vecs = [f'{col}v' for col in category_columns]

indexers = [StringIndexer(inputCol=c,
                          outputCol="{0}_Indx".format(c),
                          handleInvalid="keep")
            for c in cat_cols]

encoder = OneHotEncoderEstimator(inputCols=[indexer.getOutputCol() for indexer in indexers],
                                 outputCols=cat_vecs,
                                 dropLast=True)

assembler = VectorAssembler(inputCols=numeric_columns + encoder.getOutputCols(),
                            outputCol='features',
                            handleInvalid='keep')

label_indexer = StringIndexer(inputCol=target_col, outputCol='label')

start = time.time()
print(f'Running pipeline to create sparse vectors.. ')
pipeline = Pipeline(
    stages=indexers + [encoder] + [assembler] + [label_indexer])

model = pipeline.fit(df_pq)

transformed = model.transform(df_pq)

drop_cols = cat_str_indx + cat_vecs
final_df = transformed.drop(*drop_cols).cache()

time_taken = time.time() - start
print(f"... completed job in {time_taken} seconds")

Transforming Categorical column.. cat14
... completed job in 3.81636643409729 seconds
Transforming Categorical column.. cat15
... completed job in 3.001173734664917 seconds
Transforming Categorical column.. cat16
... completed job in 2.8838021755218506 seconds
Transforming Categorical column.. cat17
... completed job in 2.8992531299591064 seconds
Transforming Categorical column.. cat18
... completed job in 2.7315545082092285 seconds
Transforming Categorical column.. cat19
... completed job in 2.7182397842407227 seconds
Transforming Categorical column.. cat20
... completed job in 2.8201398849487305 seconds
Transforming Categorical column.. cat21
... completed job in 2.7374377250671387 seconds
Transforming Categorical column.. cat22
... completed job in 2.7233762741088867 seconds
Transforming Categorical column.. cat23
... completed job in 3.1336135864257812 seconds
Transforming Categorical column.. cat24
... completed job in 2.792780637741089 seconds
Transforming Categorical column.. ca

Write `final_df` data frame to parquet. This dataframe will be used in the algorithm analysis.

In [31]:
print('writing file to parquet')
final_df.write.parquet('data/ohe_data.parquet', compression='snappy', mode='overwrite')
print('Done.')

writing file to parquet
Done.


In [32]:
# Show first row of final_df
final_df.show(1)

+---+---+---+---+---+---+---+---+---+---+---+---+----+---+--------+--------+-----------+-----------+--------+--------+-----------+--------+--------+-----------+-----------+-----------+-----------+--------+-----------+-----------+--------+-----------+-----------+--------+-----------+--------+--------+-----------+--------+-----------+--------------------+-----+
|  y| n1| n2| n3| n4| n5| n6| n7| n8| n9|n10|n11| n12|n13| cat14_t| cat15_t|    cat16_t|    cat17_t| cat18_t| cat19_t|    cat20_t| cat21_t| cat22_t|    cat23_t|    cat24_t|    cat25_t|    cat26_t| cat27_t|    cat28_t|    cat29_t| cat30_t|    cat31_t|    cat32_t| cat33_t|    cat34_t| cat35_t| cat36_t|    cat37_t| cat38_t|    cat39_t|            features|label|
+---+---+---+---+---+---+---+---+---+---+---+---+----+---+--------+--------+-----------+-----------+--------+--------+-----------+--------+--------+-----------+-----------+-----------+-----------+--------+-----------+-----------+--------+-----------+-----------+--------+-----

Show the number of distinct categories amongst the transformed features

In [33]:
vars_transform = ['cat14_t', 'cat15_t', 'cat16_t', 'cat17_t', 
                  'cat18_t', 'cat19_t', 'cat20_t', 'cat21_t', 'cat22_t', 'cat23_t', 'cat23_t',
                  'cat24_t', 'cat25_t', 'cat26_t', 'cat27_t', 'cat28_t', 'cat29_t', 'cat30_t',
                  'cat31_t', 'cat32_t', 'cat33_t', 'cat34_t', 'cat35_t', 'cat36_t', 'cat37_t', 
                  'cat38_t', 'cat39_t']

In [34]:
for v in vars_transform:
    distinct_var_t = final_df.select(v).distinct().count()
    print('Distinct values in {} column: {}'.format(str(v),distinct_var_t))

Distinct values in cat14_t column: 10
Distinct values in cat15_t column: 20
Distinct values in cat16_t column: 2
Distinct values in cat17_t column: 7
Distinct values in cat18_t column: 7
Distinct values in cat19_t column: 7
Distinct values in cat20_t column: 3
Distinct values in cat21_t column: 8
Distinct values in cat22_t column: 3
Distinct values in cat23_t column: 3
Distinct values in cat23_t column: 3
Distinct values in cat24_t column: 6
Distinct values in cat25_t column: 3
Distinct values in cat26_t column: 8
Distinct values in cat27_t column: 9
Distinct values in cat28_t column: 2
Distinct values in cat29_t column: 3
Distinct values in cat30_t column: 10
Distinct values in cat31_t column: 10
Distinct values in cat32_t column: 3
Distinct values in cat33_t column: 4
Distinct values in cat34_t column: 3
Distinct values in cat35_t column: 3
Distinct values in cat36_t column: 9
Distinct values in cat37_t column: 13
Distinct values in cat38_t column: 10
Distinct values in cat39_t colum

Perform a series of checks on the transformations

In [37]:
# confirm if all the columns have a _t transformation
df_pq.cache()
df_pq.show(n=1)

+---+---+---+---+---+---+---+---+---+---+---+---+----+---+--------+--------+-----------+-----------+--------+--------+-----------+--------+--------+-----------+-----------+-----------+-----------+--------+-----------+-----------+--------+-----------+-----------+--------+-----------+--------+--------+-----------+--------+-----------+
|  y| n1| n2| n3| n4| n5| n6| n7| n8| n9|n10|n11| n12|n13| cat14_t| cat15_t|    cat16_t|    cat17_t| cat18_t| cat19_t|    cat20_t| cat21_t| cat22_t|    cat23_t|    cat24_t|    cat25_t|    cat26_t| cat27_t|    cat28_t|    cat29_t| cat30_t|    cat31_t|    cat32_t| cat33_t|    cat34_t| cat35_t| cat36_t|    cat37_t| cat38_t|    cat39_t|
+---+---+---+---+---+---+---+---+---+---+---+---+----+---+--------+--------+-----------+-----------+--------+--------+-----------+--------+--------+-----------+-----------+-----------+-----------+--------+-----------+-----------+--------+-----------+-----------+--------+-----------+--------+--------+-----------+--------+--------

Following the filtering of the features for a threshold number of counts, we observe that the number of distinct values has been reduced to just a few for each transformed feature.  

In [38]:
print('distinct categories in the TRANSFORMED cat17 column')
dist_cat17_t = df_pq.select("cat17_t").distinct().count()
print(dist_cat17_t)

distinct categories in the TRANSFORMED cat17 column
7


In [39]:
print(f'Only {dist_cat17_t/dist_cat17 * 100} % of the original distinct categories in the transformed cat17_t column')

Only 0.049354861453853205 % of the original distinct categories in the transformed cat17_t column


We can see that the transformed features contain just a small percent of the total number of distinct values of the original features below.

In [None]:
# for vt in vars_transform:
#     distinct_var_t = df_pq.select(vt).distinct().count()
#     unique_pct_original = (distinct_var_t/distinct_var) * 100
#     print('Only {} % of the original distinct categories in the transformed {} column'.format(unique_pct_original,str(vt)))

#### Create a new dataframe for algorithm by generating sparse vectors from the transformed features

In [41]:
cat_cols = ['n1_t', 'n2_t', 'n3_t', 'n4_t', 'n5_t', 'n6_t', 'n7_t',
           'n8_t', 'n9_t', 'n10_t', 'n11_t', 'n12_t',
           'n13_t','cat14_t', 'cat15_t', 'cat16_t', 'cat17_t',
           'cat18_t', 'cat19_t', 'cat20_t', 'cat21_t', 'cat22_t',
           'cat23_t', 'cat24_t', 'cat25_t', 'cat26_t', 'cat27_t',
           'cat28_t', 'cat29_t', 'cat30_t', 'cat31_t', 'cat32_t',
           'cat33_t', 'cat34_t', 'cat35_t', 'cat36_t', 'cat37_t',
           'cat38_t', 'cat39_t']

cat_str_indx = ['n1_t_Indx', 'n2_t_Indx', 'n3_t_Indx', 'n4_t_Indx', 'n5_t_Indx', 'n6_t_Indx', 'n7_t_Indx',
               'n8_t_Indx', 'n9_t_Indx', 'n10_t_Indx', 'n11_t_Indx', 'n12_t_Indx',
               'n13_t_Indx','cat14_t_Indx', 'cat15_t_Indx', 'cat16_t_Indx', 'cat17_t_Indx',
               'cat18_t_Indx', 'cat19_t_Indx', 'cat20_t_Indx', 'cat21_t_Indx', 'cat22_t_Indx',
               'cat23_t_Indx', 'cat24_t_Indx', 'cat25_t_Indx', 'cat26_t_Indx', 'cat27_t_Indx',
               'cat28_t_Indx', 'cat29_t_Indx', 'cat30_t_Indx', 'cat31_t_Indx', 'cat32_t_Indx',
               'cat33_t_Indx', 'cat34_t_Indx', 'cat35_t_Indx', 'cat36_t_Indx', 'cat37_t_Indx',
               'cat38_t_Indx', 'cat39_t_Indx']

cat_vecs = ['n1v', 'n2v', 'n3v', 'n4v', 'n5v', 'n6v', 'n7v',
           'n8v', 'n9v', 'n10v', 'n11v', 'n12v',
           'n13v','cat14v', 'cat15v', 'cat16v', 'cat17v',
           'cat18v', 'cat19v', 'cat20v', 'cat21v', 'cat22v',
           'cat23v', 'cat24v', 'cat25v', 'cat26v', 'cat27v',
           'cat28v', 'cat29v', 'cat30v', 'cat31v', 'cat32v',
           'cat33v', 'cat34v', 'cat35v', 'cat36v', 'cat37v',
           'cat38v', 'cat39v']

Write `final_df` data frame to parquet. This dataframe will be used in the algorithm analysis.

### Load files to GCP bucket and convert to RDDs for Spark analysis

The `train.txt` and `test.txt` files were downloaded to an external hard drive and subsequently loaded into a GCP bucket. 

In [None]:
# This command streams the main data set from dropbox directly to your GCP bucket - this may take a little time (RUN THIS CELL AS IS)
#!curl -L "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz" | gsutil cp - gs://{BUCKET}/finalproject/train.txt

In [None]:
# Do not run in the Docker container. This command puts a local file on GCP
#!gsutil cp 'train.txt' gs://{BUCKET}/finalproject/train.txt
#!gsutil cp 'train.txt' gs://danielalvarez_w261projects/finalproject/train.txt

In [None]:
# Do not run in the Docker container. This command puts a local file on GCP
#!gsutil cp 'test.txt' gs://{BUCKET}/finalproject/test.txt
#!gsutil cp 'test.txt' gs://danielalvarez_w261projects/finalproject/test.txt

In [None]:
# load the data into Spark RDDs for convenience of use later (RUN THIS CELL AS IS)
# trainRDD = sc.textFile(f'gs://danielalvarez_w261projects/finalproject/train.txt')
# testRDD = sc.textFile(f'gs://danielalvarez_w261projects/finalproject/test.txt')

In [None]:
# print the class
# print(type(trainRDD))
# print(type(testRDD))

In [None]:
# number of rows and shape of the files
# !cat trainRDD | wc -l

In [None]:
# convert to RDDs to DataFrames
#DF = trainRDD.map(lambda x: (x.split('\t')[0], ast.literal_eval(x.split('\t')[1]))).toDF()
# from pyspark.sql.types import Row

# #here you are going to create a function
# def f(x):
#     d = {}
#     for i in range(len(x)):
#         d[str(i)] = x[i]
#     return d

# #Now populate that
# df = trainRDD.map(lambda x: Row(**f(x))).toDF()