# W261 Final Project

#### *Anusha Munjuluri, Arvindh Ganesan, Kim Vignola, Christina Papadimitriou*

### Notebook Set-up

In [1]:
# imports
import re
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pyspark.sql import types
import pyspark.sql
import pyspark.sql.functions

In [2]:
# store path to notebook
# `!pwd` is an IPython shell escape; the captured output is list-like, one entry per output line
PWD = !pwd
# keep only the first output line so PWD ends up holding the directory path itself
PWD = PWD[0]

In [3]:
# start Spark Session
from pyspark.sql import SparkSession

app_name = "final_project"
master = "local[*]"

# getOrCreate() reuses an already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

## 1. Question Formulation

## 2. EDA & Discussion of Challenges

### 2.1 Data Loading

In [4]:
# take a look at the data
!head -n 1 data/train.txt

0	1	1	5	0	1382	4	15	2	181	1	2		2	68fd1e64	80e26c9b	fb936136	7b4723c4	25c83c98	7e0ccccf	de7995b8	1f89b562	a73ee510	a8cd5504	b2cb9c98	37c9c164	2824a5f6	1adce6ef	8ba8b39a	891b62e7	e5ba7672	f54016b9	21ddcdc9	b1252a9d	07b5194c		3a171ecb	c5c50484	e8b83407	9727dd16


In [5]:
# load the data
# Each RDD element is one raw, still unparsed, tab-separated line of the text file.
fullTrainRDD = sc.textFile('data/train.txt')
testRDD = sc.textFile('data/test.txt')

# Column names: 13 numerical features (I1-I13), 26 categorical features (C1-C26), then the label.
# Positions 0-38 match the feature list returned by `parse`; note that in the raw
# file the label is the FIRST column, whereas here 'Label' is listed last (position 39).
FIELDS = ['I1','I2','I3','I4','I5','I6','I7','I8','I9','I10','I11','I12','I13',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'C15','C16','C17','C18','C19','C20','C21','C22','C23','C24','C25','C26','Label']

In [7]:
# number of rows in train/test data
# NOTE: each count() is a separate Spark action that scans the whole (uncached) text file
print(f"Number of records in train data: {fullTrainRDD.count()} ...")
print(f"Number of records in test data: {testRDD.count()} ...")

Number of records in train data: 45840617 ...
Number of records in test data: 6042135 ...


### 2.2 Creating Train and Test Split

In [6]:
# Generate 80/20 (pseudo)random train/test split; the fixed seed keeps the split repeatable
trainRDD, heldOutRDD = fullTrainRDD.randomSplit([0.8,0.2], seed = 1)
# NOTE: neither split is cached here, so each count() below is its own pass over the source data
print(f"... held out {heldOutRDD.count()} records for evaluation and assigned {trainRDD.count()} for training.")

... held out 9167871 records for evaluation and assigned 36672746 for training.


### 2.3 Pre-Processing

In [7]:
# helper functions
def parse(line):
    """
    Split one raw tab-separated record into a (features, label) tuple.

    The first column is the label and every remaining column is a feature.
    Nothing is converted here: both parts are the raw strings from the line.
    """
    label, *features = line.split('\t')
    return (features, label)

def edit_data_types(line):
    """
    Map tuple of (features, label) --> tuple of (formatted features, label)

    * empty strings ('') are replaced with the sentinel string 'null'
    * the first 13 (numerical) fields are converted to floats
    * all later (categorical) fields are kept as strings
    * the label is converted to an int
    """
    raw_features, raw_label = line[0], line[1]

    def convert(position, raw_value):
        # the string 'null' is used instead of np.nan so that missing values
        # can be matched with a plain == / != inside Spark filters
        if raw_value == '':
            return 'null'
        return float(raw_value) if position < 13 else raw_value

    converted = [convert(position, raw_value)
                 for position, raw_value in enumerate(raw_features)]
    return (converted, int(raw_label))

In [8]:
# Parse each line, replace '' with the string 'null', convert the 13 numerical features to float
# and the label to int; cache() is requested because the EDA cells below reuse this RDD repeatedly
trainRDDCached = trainRDD.map(parse).map(edit_data_types).cache()

In [9]:
print(trainRDDCached.take(1))

[([1.0, 1.0, 5.0, 0.0, 1382.0, 4.0, 15.0, 2.0, 181.0, 1.0, 2.0, 'null', 2.0, '68fd1e64', '80e26c9b', 'fb936136', '7b4723c4', '25c83c98', '7e0ccccf', 'de7995b8', '1f89b562', 'a73ee510', 'a8cd5504', 'b2cb9c98', '37c9c164', '2824a5f6', '1adce6ef', '8ba8b39a', '891b62e7', 'e5ba7672', 'f54016b9', '21ddcdc9', 'b1252a9d', '07b5194c', 'null', '3a171ecb', 'c5c50484', 'e8b83407', '9727dd16'], 0)]


### 2.4 Creating a toy RDD

In [65]:
# creating toy sample (run this once from cmd line and reuse the same sample)
# !gshuf -n 1000 data/train.txt >> data/toy1000.txt

In [10]:
toyRDD = sc.textFile('data/toy1000.txt')
toyRDDCached = toyRDD.map(parse).map(edit_data_types).cache()

print(f"Number of records in toy data: {toyRDDCached.count()} ...")

Number of records in toy data: 1000 ...


In [11]:
print(toyRDDCached.take(1))

[([0.0, 478.0, 13.0, 'null', 3396.0, 194.0, 11.0, 13.0, 312.0, 0.0, 7.0, 'null', 'null', '05db9164', '207b2d81', '1757640a', '06148e59', '25c83c98', 'fbad5c96', 'f36791d8', '0b153874', 'a73ee510', 'c7009b63', '2714650d', '1a69f1c0', '9a88e2e2', '07d13a8f', '0c67c4ca', '8075af0c', 'e5ba7672', '395856b0', '21ddcdc9', 'b1252a9d', '8e4884c0', 'null', '423fab69', 'b936bfbe', '001f3601', 'f2fc1d6e'], 1)]


#### -------------------------------------------------------------------------------------------------------------------
#### !!!! For the below analysis `toyRDD` can be replaced by `trainRDDCached` to run the processing on the entire dataset
#### -------------------------------------------------------------------------------------------------------------------

### 2.5 Labels

In [12]:
# TOY DATA
# counting records for each class in a single pass (countByValue) instead of one filter+count job per label
label_counts = toyRDDCached.map(lambda x: x[1]).countByValue()
count_label_0 = label_counts.get(0, 0)
count_label_1 = label_counts.get(1, 0)
total = count_label_0 + count_label_1

print(f"{np.round(count_label_0/total*100, 2)} % of the records have label=0 and {np.round(count_label_1/total*100, 2)} % have label=1...")

72.3 % of the records have label=0 and 27.7 % have label=1...


In [13]:
# FULL DATA
# counting records for each class in a single pass (countByValue) instead of one filter+count job per label
label_counts = trainRDDCached.map(lambda x: x[1]).countByValue()
count_label_0 = label_counts.get(0, 0)
count_label_1 = label_counts.get(1, 0)
total = count_label_0 + count_label_1

print(f"{np.round(count_label_0/total*100, 2)} % of the records have label=0 and {np.round(count_label_1/total*100, 2)} % have label=1...")

74.38 % of the records have label=0 and 25.62 % have label=1...


_**Takeaway: labels are imbalanced, with 75% of records having label=0 (i.e. unclicked ads). However, we will not attempt to balance the labels at this stage. Being aware of this imbalance, we will carefully examine the prediction results to detect any bias (i.e. always predicting label=0).**_

### 2.6 Counting nulls in each column

In [13]:
def get_pct_nulls_in_column(dataRDD, var_position):
    """
    Compute the percentage of 'null' entries in one feature column.

    Args:
        dataRDD: RDD of (features, label) records
        var_position: index of the column inside the features list

    Returns:
        Percentage (0-100) of records whose value at var_position is the
        string 'null'; 0.0 when dataRDD contains no records.

    Null and row tallies are accumulated together in one `aggregate` pass
    rather than in two separate count() jobs.
    """
    def add_record(tally, record):
        nulls, rows = tally
        # the comparison yields a bool, which adds as 0 or 1
        return (nulls + (record[0][var_position] == 'null'), rows + 1)

    def merge_tallies(left, right):
        return (left[0] + right[0], left[1] + right[1])

    null_count, total_count = dataRDD.aggregate((0, 0), add_record, merge_tallies)

    if total_count == 0:
        # nothing to measure; avoids a ZeroDivisionError on an empty RDD
        return 0.0
    return null_count/total_count*100

In [14]:
# TOY DATA
for var_position, var in enumerate(FIELDS):
    if var_position >= 39:
        # position 39 is 'Label', which is not part of the features list
        continue
    null_pct = get_pct_nulls_in_column(toyRDDCached, var_position)
    print("FEATURE {}: {}% is null".format(var, np.round(null_pct,2)))

FEATURE I1: 43.2% is null
FEATURE I2: 0.0% is null
FEATURE I3: 21.3% is null
FEATURE I4: 22.1% is null
FEATURE I5: 2.4% is null
FEATURE I6: 23.0% is null
FEATURE I7: 3.7% is null
FEATURE I8: 0.0% is null
FEATURE I9: 3.7% is null
FEATURE I10: 43.2% is null
FEATURE I11: 3.7% is null
FEATURE I12: 77.4% is null
FEATURE I13: 22.1% is null
FEATURE C1: 0.0% is null
FEATURE C2: 0.0% is null
FEATURE C3: 3.4% is null
FEATURE C4: 3.4% is null
FEATURE C5: 0.0% is null
FEATURE C6: 13.4% is null
FEATURE C7: 0.0% is null
FEATURE C8: 0.0% is null
FEATURE C9: 0.0% is null
FEATURE C10: 0.0% is null
FEATURE C11: 0.0% is null
FEATURE C12: 3.4% is null
FEATURE C13: 0.0% is null
FEATURE C14: 0.0% is null
FEATURE C15: 0.0% is null
FEATURE C16: 3.4% is null
FEATURE C17: 0.0% is null
FEATURE C18: 0.0% is null
FEATURE C19: 42.2% is null
FEATURE C20: 42.2% is null
FEATURE C21: 3.4% is null
FEATURE C22: 74.5% is null
FEATURE C23: 0.0% is null
FEATURE C24: 3.4% is null
FEATURE C25: 42.2% is null
FEATURE C26: 42.2%

In [None]:
# FULL DATA
for var_position, var in enumerate(FIELDS):
    if var_position >= 39:
        # position 39 is 'Label', which is not part of the features list
        continue
    null_pct = get_pct_nulls_in_column(trainRDDCached, var_position)
    print("FEATURE {}: {}% is null".format(var, np.round(null_pct,2)))


_**Takeaway: some columns have a high % of null values. We could exclude columns that have more than 50% nulls because those columns will likely not contribute to the prediction results. In the toy sample, two columns exceed 50% nulls: `C22` (categorical, 74.5%) and `I12` (numerical, 77.4%). For categorical variables, the one-hot encoding approach that we will take later on takes care of missing values through a dedicated `Missing` bucket; for numerical variables such as `I12`, nulls are imputed with the column mean in Section 2.7.1.**_

### 2.7 Numeric Features

#### 2.7.1 Get statistics and impute nulls with means/medians

In [15]:
def get_stats(dataRDD, var_position):
    """
    Get summary statistics for one numeric variable, ignoring 'null' entries.

    Args:
        dataRDD: RDD of (features, label) records
        var_position: index of the variable inside the features list

    Returns:
        (mean, variance, minimum, maximum) of the non-null values.
        The variance is the population variance, as with RDD.variance().
        No median is computed.

    Raises:
        ValueError: if the column has no non-null values (min/max are undefined).

    All four statistics come from a single `stats()` pass instead of four
    separate Spark jobs.
    """
    summary = dataRDD.map(lambda x: x[0][var_position]) \
                     .filter(lambda x: x != 'null') \
                     .stats()

    if summary.count() == 0:
        raise ValueError("no non-null values at position {}".format(var_position))

    # float() keeps min/max as plain Python floats, whatever numeric type the summary tracks internally
    return summary.mean(), summary.variance(), float(summary.min()), float(summary.max())


def impute_nulls(line, mean_dict):
    """
    Replace 'null' entries of the 13 numerical columns with that column's mean.

    mean_dict maps a column position (0-12) to the value to substitute.
    Categorical columns (position 13 and up) pass through untouched, including
    their 'null' entries. The label is returned as an int.
    """
    features, label = line[0], line[1]
    imputed_features = [
        mean_dict[position] if (position < 13 and value == 'null') else value
        for position, value in enumerate(features)
    ]
    return (imputed_features, int(label))

In [16]:
# TOY DATA

# save the means in a dictionary
mean_dict_toy = {}
st_dev_dict_toy = {}

for var_position, var in enumerate(FIELDS):
    if var_position < 13:
        mean, variance, minimum, maximum = get_stats(toyRDDCached, var_position)
        print("FEATURE {}: \t mean={}, \t variance={}, \t min={}, \t max={}".format(var, np.round(mean, 2), np.round(variance, 2), minimum, maximum))
        mean_dict_toy[var_position] = mean
        st_dev_dict_toy[var_position] = np.sqrt(variance)

FEATURE I1: 	 mean=3.17, 	 variance=33.98, 	 min=0.0, 	 max=55.0
FEATURE I2: 	 mean=114.72, 	 variance=179230.84, 	 min=-2.0, 	 max=5123.0
FEATURE I3: 	 mean=18.78, 	 variance=2026.69, 	 min=0.0, 	 max=648.0
FEATURE I4: 	 mean=7.43, 	 variance=86.02, 	 min=0.0, 	 max=77.0
FEATURE I5: 	 mean=18392.77, 	 variance=4908735552.87, 	 min=0.0, 	 max=1002457.0
FEATURE I6: 	 mean=95.23, 	 variance=65007.31, 	 min=0.0, 	 max=4304.0
FEATURE I7: 	 mean=17.94, 	 variance=8794.35, 	 min=0.0, 	 max=2614.0
FEATURE I8: 	 mean=12.96, 	 variance=183.6, 	 min=0.0, 	 max=49.0
FEATURE I9: 	 mean=102.42, 	 variance=38152.34, 	 min=0.0, 	 max=2711.0
FEATURE I10: 	 mean=0.64, 	 variance=0.5, 	 min=0.0, 	 max=4.0
FEATURE I11: 	 mean=2.78, 	 variance=23.83, 	 min=0.0, 	 max=60.0
FEATURE I12: 	 mean=1.19, 	 variance=40.19, 	 min=0.0, 	 max=84.0
FEATURE I13: 	 mean=7.99, 	 variance=116.35, 	 min=0.0, 	 max=97.0


In [None]:
# FULL DATA

# save the means and standard deviations in dictionaries keyed by feature position
mean_dict = {}
st_dev_dict = {}

for var_position, var in enumerate(FIELDS):
    if var_position < 13:
        mean, variance, minimum, maximum = get_stats(trainRDDCached, var_position)
        print("FEATURE {}: mean={}, variance={}, min={}, max={}".format(var, np.round(mean, 2), np.round(variance, 2), minimum, maximum))
        mean_dict[var_position] = mean
        # full-data values belong in st_dev_dict (not st_dev_dict_toy): the full-data
        # standardize step reads st_dev_dict, and the toy dictionary must keep the toy values
        st_dev_dict[var_position] = np.sqrt(variance)

In [17]:
# imputing nulls with mean 

# TOY DATA
imputedToyRDDCached = toyRDDCached.map(lambda x: impute_nulls(x, mean_dict_toy)).cache()

In [None]:
# FULL DATA 
imputedTrainRDDCached = trainRDDCached.map(lambda x: impute_nulls(x, mean_dict)).cache()

***Takeaway: we impute the null values of the numerical columns with the mean of that column (the median is a possible alternative that is not implemented here). There is no need to do that for the categorical features since one-hot encoding will take care of the nulls.***

#### 2.7.2 Standardize features 

In [18]:
def standardize(line, mean_dict, st_dev_dict):
    """
    Scale and center the numerical features around their mean (mean=0, sd=1).

    Args:
        line: (features, label) record whose first 13 features are numeric
        mean_dict: column position (0-12) -> mean of that column
        st_dev_dict: column position (0-12) -> standard deviation of that column

    Returns:
        (features, label) where each of the first 13 features is replaced by
        (value - mean) / sd and every other feature is left unchanged.
        A column whose standard deviation is 0 is mapped to 0.0.
    """
    features, label = line[0], line[1]
    formated_features = []
    for i, value in enumerate(features):
        if i < 13:
            st_dev = st_dev_dict[i]
            if st_dev:
                formated_features.append((value-mean_dict[i])/st_dev)
            else:
                # sd == 0 means the column is constant, so every value sits at the mean;
                # emit 0.0 instead of dividing by zero
                formated_features.append(0.0)
        else:
            formated_features.append(value)

    return (formated_features, label)

In [19]:
# TOY DATA 
normedToyRDDCached = imputedToyRDDCached.map(lambda x: standardize(x, mean_dict_toy, st_dev_dict_toy)).cache()

In [None]:
# FULL DATA 
normedRDDCached = imputedTrainRDDCached.map(lambda x: standardize(x, mean_dict, st_dev_dict)).cache()

***Takeaway: In the summary statistics, we notice that the numerical features have different ranges and thus we standardize our data (i.e. subtracting the mean and dividing by the standard deviation of each column). Standardization would also help a Logistic Regression algorithm converge faster.***

### 2.8 Categorical Features

In this section, we will perform data processing on the 26 categorical features of the dataset. We will start by performing some EDA to compute the **number of unique categories** within each categorical feature and the total counts for each category. 

In [77]:
def count_categories(dataRDD, var, var_position, top):
    """
    Print the number of distinct categories of a categorical variable and
    its most frequent categories.

    Args:
        dataRDD: RDD of (features, label) records
        var: name of the variable (kept for interface compatibility; not used in the body)
        var_position: index of the variable inside the features list
        top: number of most frequent categories to print

    Returns:
        None. The results are printed.
    """

    # counting category occurrences within the categorical feature
    count_per_category = dataRDD.map(lambda x: ( x[0][var_position], 1)) \
                                .reduceByKey(lambda x,y: x+y)

    # reduceByKey leaves exactly one record per category, so counting those records
    # gives the number of unique values; no distinct() and no sort is needed for this
    num_unique_values = count_per_category.count()

    print('Unique values within the category:', num_unique_values)
    print(' ')
    # only the ranking needs the descending sort
    top_x = count_per_category.sortBy(lambda x: -x[1]).take(top)
    print('Top {} categories by count:'.format(top))
    for category, count in top_x:
        print('Category: {}; Count: {}'.format(category, count))
    print(' ')

In [197]:
# TOY DATA
for var_position, var in enumerate(FIELDS):
    # categorical features C1-C26 sit at positions 13-38; skip everything else
    if not (12 < var_position < 39):
        continue
    print(" ")
    print("VARIABLE {}".format(var))
    print(" ")
    count_categories(normedToyRDDCached, var, var_position=var_position, top=10)

 
VARIABLE C1
 
Unique values within the category: 57
 
Top 10 categories by count:
Category: 05db9164; Count: 485
Category: 68fd1e64; Count: 146
Category: 5a9ed9b0; Count: 103
Category: 8cf07265; Count: 51
Category: be589b51; Count: 41
Category: 5bfa8ab5; Count: 27
Category: f473b8dc; Count: 20
Category: 87552397; Count: 15
Category: 39af2607; Count: 13
Category: 9a89b36c; Count: 9
 
 
VARIABLE C2
 
Unique values within the category: 193
 
Top 10 categories by count:
Category: 38a947a1; Count: 114
Category: 1cfdf714; Count: 50
Category: 287130e0; Count: 46
Category: 38d50e09; Count: 46
Category: 207b2d81; Count: 37
Category: 09e68b86; Count: 33
Category: 421b43cd; Count: 33
Category: 4f25e98b; Count: 29
Category: 89ddfee8; Count: 27
Category: 58e67aaf; Count: 27
 
 
VARIABLE C3
 
Unique values within the category: 771
 
Top 10 categories by count:
Category: null; Count: 34
Category: d032c263; Count: 15
Category: 02cf9876; Count: 13
Category: b00d1501; Count: 12
Category: 77f2f2e5; Cou

The above analysis shows that some categorical variables have a high number of unique categories. Additionally, the distribution of counts for most of the categorical variables is very skewed (i.e. some categories appear much more often than others). Considering this information, we decided to take the following approach to deal with categorical variables:

* Bucket the categories within each categorical feature into 4 groups based on their occurrence counts
    * **High frequency**: categories that occur in at least 10% of the total row count (Example: if the total row count is 1000 -> categories that occur *100 times or more*)
    * **Medium frequency**: categories that occur in at least 5% and less than 10% of the total row count (Example: if the total row count is 1000 -> categories that occur *50-99 times*)
    * **Low frequency**: categories that occur in less than 5% of the total row count (Example: if the total row count is 1000 -> categories that occur *fewer than 50 times*)
    * **Missing**: null occurrences (note: since there are a couple of categorical variables with significant percentages of null occurrences, we wanted to retain this information to see if it potentially creates some signal for our models)

* Convert the categorical features to numerical using *One-hot Encoding* and the buckets obtained above. Specifically, we decided to keep all the one-hot encoded categories from the High Frequency bucket as separate columns, in order to obtain all signals from the categories that appear to be the most important for our classification problem. However, we decided to not discard the remaining categories/features, but instead add three additional columns one for each of the: Medium frequency, Low frequency and Missing buckets. If, for example, record X has any category that belongs to the Medium frequency bucket (based on the counts explained above), then the `Medium Frequency` column for that record will be `1`, otherwise it will be `0`. 

Below, we will demonstrate how the bucketing and one-hot encoding were applied in a scalable manner to our Criteo dataset, using our toy dataset with 1000 rows.

**Step 1**. Obtain the occurrence counts for each category:

> *part a:* Using variable `C1` as an example to demonstrate the implementation. The analysis above showed that variable `C1` has 57 unique categories, hence we will obtain the occurrence counts for each of the 57 categories in `C1`. Each of these categories will then be placed in one of the 4 buckets mentioned above based on its occurrence count.

In [171]:
# C1 has var_position = 13
category_counts_C1 = normedToyRDDCached.map(lambda x: ( x[0][13], 1)) \
                                       .reduceByKey(lambda x,y: x+y) \
                                       .sortBy(lambda x: -x[1])

category_counts_C1.take(10)

[('05db9164', 485),
 ('68fd1e64', 146),
 ('5a9ed9b0', 103),
 ('8cf07265', 51),
 ('be589b51', 41),
 ('5bfa8ab5', 27),
 ('f473b8dc', 20),
 ('87552397', 15),
 ('39af2607', 13),
 ('9a89b36c', 9)]

> *part b:* Apply the same logic on all variables using Spark's `flatMap`

In [179]:
# helper function 
def count_cat_count(line):
    """
    Emit one (category, 1) pair for each of the 26 categorical values
    (feature positions 13-38) of a (features, label) record.

    The pairs are keyed by the category value only, not by the column it came from.
    """
    features, _label = line[0], line[1]
    return [(features[position], 1) for position in range(13, 39)]

In [182]:
# 'null' is dropped before the sort, so the missing-value key is not carried through the sort
category_counts_all = normedToyRDDCached.flatMap(count_cat_count) \
                                        .reduceByKey(lambda x,y: x+y) \
                                        .filter(lambda x: x[0] != 'null') \
                                        .sortBy(lambda x: -x[1])
category_counts_all.take(10)

[('a73ee510', 900),
 ('25c83c98', 653),
 ('0b153874', 597),
 ('e5ba7672', 497),
 ('05db9164', 485),
 ('32c7478e', 447),
 ('7e0ccccf', 388),
 ('b28479f6', 367),
 ('21ddcdc9', 358),
 ('07d13a8f', 320)]

**Step 2**. Classify each category into one of the 4 buckets mentioned above based on its occurrence count and broadcast this information. As we learned throughout the semester, broadcast variables are very useful in cases where the programmer wants to pass a copy of some useful information to every node in an efficient manner.

* `>=` 100 times (i.e. 10% of 1000 rows) -> *High frequency* 
* 50-99 times (i.e. at least 5% and less than 10% of 1000 rows) -> *Medium frequency*
* `<`50 times (i.e. 5% of 1000 rows) -> *Low frequency* 
* `==` 'null' -> *Missing*


In [185]:
# Bucket cut-offs as a percentage of the row count, so the same cell works for the
# 1000-row toy sample (cut-offs 100 and 50) and for any other dataset size
HIGH_FREQ_PCT = 10     # at least 10% of the rows -> High frequency
MEDIUM_FREQ_PCT = 5    # at least 5% and under 10% of the rows -> Medium frequency
n_rows = normedToyRDDCached.count()

def split_by_frequency(category_counts):
    """
    Split an RDD of (category, count) pairs into (high, medium, low) lists of categories.

    The pairs are collected once and bucketed on the driver; each list keeps the
    order of the input RDD. Comparisons use integer arithmetic (count * 100 vs
    pct * n_rows) so the cut-offs are exact.
    """
    pairs = category_counts.collect()
    high_cut = HIGH_FREQ_PCT * n_rows
    medium_cut = MEDIUM_FREQ_PCT * n_rows
    high = [category for category, count in pairs if count * 100 >= high_cut]
    medium = [category for category, count in pairs if medium_cut <= count * 100 < high_cut]
    low = [category for category, count in pairs if count * 100 < medium_cut]
    return high, medium, low

# applying on C1 only
high_C1, medium_C1, low_C1 = split_by_frequency(category_counts_C1)
high_frequency_categorices_C1 = sc.broadcast(high_C1)
medium_frequency_categorices_C1 = sc.broadcast(medium_C1)
low_frequency_categorices_C1 = sc.broadcast(low_C1)

# applying to all variables
high_all, medium_all, low_all = split_by_frequency(category_counts_all)
high_frequency_categorices = sc.broadcast(high_all)
medium_frequency_categorices = sc.broadcast(medium_all)
low_frequency_categorices = sc.broadcast(low_all)

In [186]:
# C1 
print('High frequency categories: {} \n'.format(high_frequency_categorices_C1.value))
print('Medium frequency categories: {} \n'.format(medium_frequency_categorices_C1.value))
print('Low frequency categories: {} \n'.format(low_frequency_categorices_C1.value))

High frequency categories: ['05db9164', '68fd1e64', '5a9ed9b0'] 

Medium frequency categories: ['8cf07265'] 

Low frequency categories: ['be589b51', '5bfa8ab5', 'f473b8dc', '87552397', '39af2607', '9a89b36c', 'ae82ea21', '241546e0', '09ca0b81', '17f69355', '439a44a4', '1464facd', 'fb174e6b', 'b455c6d7', '75ac2fe6', '45cb84c9', '28e55712', '7e5c2ff4', 'd4b08d58', 'da4eff0f', 'c974c00b', '42a16b9a', '3b65d647', 'fbc55dae', 'b19f768d', '2ebc17d3', '37d3940e', '5ebc3192', '3c9d8785', '9684fd4d', 'a14cf13a', '49807078', 'dac91c28', '439f942d', '41edac3d', '813d7135', '06584483', '291b7ba2', '40e1377d', 'e8ef605b', '394fc830', 'eb6dcae0', 'bfb430af', '5d7d2fe8', 'c79f9af8', '88abfaf6', '426610d2', '18988050', '0a16e1d4', '92fb1d87', 'c71ae391', 'abca0bad', '46300ee3'] 



In [188]:
# All variables 
print('High frequency categories: {} \n'.format(high_frequency_categorices.value))
print('Medium frequency categories: {} \n'.format(medium_frequency_categorices.value))
# print('Low frequency categories: {} \n'.format(low_frequency_categorices.value)) # this category is too large to print 

High frequency categories: ['a73ee510', '25c83c98', '0b153874', 'e5ba7672', '05db9164', '32c7478e', '7e0ccccf', 'b28479f6', '21ddcdc9', '07d13a8f', '3b08e48b', 'a458ea53', 'fbad5c96', '3a171ecb', 'b1252a9d', 'fe6b92e5', '5840adea', '4cf72387', '5b392875', '1adce6ef', '68fd1e64', 'ad3062eb', '001f3601', 'd4bb7bd8', '38a947a1', 'e8b83407', '423fab69', '07c540c4', '5a9ed9b0', '7cc72ec2'] 

Medium frequency categories: ['c9d4222a', 'ea9a246c', 'bcdee96c', '1f89b562', '3486227d', 'be7c41b4', '43b19349', '3fdb382b', '1e88c74f', '776ce399', '8cf07265', '1cfdf714', 'b34f3128'] 



**Step 3.** Applying a homegrown one-hot encoding implementation as explained above. 

In [195]:
# Columns that surround the one-hot encoded high-frequency categories
NUMERIC_FIELDS = FIELDS[:13]                           # numerical
BUCKET_FIELDS = ['Medium_Freq', 'Low_Freq', 'Missing']  # one flag per remaining bucket

# C1 example: the new columns of our dataset now are
# I1..I13, one column per high-frequency C1 category, the three bucket flags, Label
FIELDS_NEW_C1 = NUMERIC_FIELDS + list(high_frequency_categorices_C1.value) + BUCKET_FIELDS + ['Label']

# all variables: the new columns of our dataset now are
# I1..I13, one column per high-frequency category (any variable), the three bucket flags, Label
FIELDS_NEW = NUMERIC_FIELDS + list(high_frequency_categorices.value) + BUCKET_FIELDS + ['Label']

In [194]:
def OHE_transform(line):
    """
    One hot encoding transformation of a (features, label) record
    using the high/medium/low/missing logic.

    Reads the broadcast variables high_frequency_categorices,
    medium_frequency_categorices and low_frequency_categorices.

    The encoded part consists of, in order:
      * one 0/1 flag per high-frequency category (in broadcast order),
      * one flag set when any categorical value is a medium-frequency category,
      * one flag set when any categorical value is a low-frequency category,
      * one flag set when any categorical value is 'null'.

    returns: (ohe_transformed_features, label), i.e. the features outside
    positions 13-38 followed by the flags above
    """
    features, label = line[0], line[1]

    # positions 13-38 hold the categorical values; everything else is kept as-is
    num_features = list(features[:13]) + list(features[39:])
    # a set gives constant-time membership tests; the original scanned the
    # list of categorical values once per broadcast category
    present = set(features[13:39])

    high_freq_list = [1 if category in present else 0
                      for category in high_frequency_categorices.value]
    medium_freq = 1 if any(category in present for category in medium_frequency_categorices.value) else 0
    low_freq = 1 if any(category in present for category in low_frequency_categorices.value) else 0
    missing = 1 if 'null' in present else 0
    ohe_features = high_freq_list + [medium_freq] + [low_freq] + [missing]

    return (num_features + ohe_features, label)

In [198]:
oheTrasformedToyRDDCached = normedToyRDDCached.map(OHE_transform).cache()

In [199]:
print(oheTrasformedToyRDDCached.take(1))

[([-0.5433608510385987, 0.8580968676474047, -0.12839483402657934, 0.0, -0.21404880541365456, 0.3873814615646774, -0.07403520683960009, 0.002878252461803471, 1.0729765767787602, -0.9112734540131763, 0.8646834019878579, 0.0, 0.0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1], 1)]


## 3. Algorithm Explanation

## 4. Algorithm Implementation

## 5. Application of Course Concepts