# w261 Final Project - Clickthrough Rate Prediction


Team 2  
Danielle Adler, Craig Fujii, Conor Healy, YoungKoung Kim

Summer 2019

# Load txt File & Convert to Parquet

Uncomment code if on the cloud or on a VM.

1. Setup
1. Load `train.txt` into Spark
1. Convert and save data as `train.parquet`
1. Split `train.parquet` into `training_set.parquet` and `test_set.parquet`
1. Split `training_set.parquet` into `train` splits of 1k, 10k, 100k, 1MM & save as parquet
1. Loading in Data - both from `gsod bucket` and local `data` folder


In [7]:
import os

list_parquets_dirs = [ 'train_1_million', 'train_100000', 'train_10000', 'train_1000']
file_dirs = ['data/', 'gs://gsod_23456/'] # this doesn't work on Pyspark kernel; Currently deleting with the GCP commnand line

for i in file_dirs:    
    for j in list_parquets_dirs:
        directory_name = i + j + '.parquet'
        if os.path.exists(directory_name):
            print(directory_name, ': good')
        else:
            print(directory_name, ': MISSING!!!')

data/train_1_million.parquet : good
data/train_100000.parquet : good
data/train_10000.parquet : good
data/train_1000.parquet : good
gs://gsod_23456/train_1_million.parquet : MISSING!!!
gs://gsod_23456/train_100000.parquet : MISSING!!!
gs://gsod_23456/train_10000.parquet : MISSING!!!
gs://gsod_23456/train_1000.parquet : MISSING!!!


## Setup

In [1]:
# imports
import re
import ast
import time
import shutil
import os
import copy
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import display

from pyspark.sql import Window
from pyspark.sql.functions import col, desc, mean, isnan, when, count, isnull, rank, sum, countDistinct, avg, stddev, round

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import GeneralizedLinearRegression

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]

In [4]:
# # start Spark Session - only run when running local
# from pyspark.sql import SparkSession
# app_name = "final_project"
# master = "local[*]"
# spark = SparkSession\
#         .builder\
#         .appName(app_name)\
#         .master(master)\
#         .getOrCreate()
# sc = spark.sparkContext

In [4]:
spark

In [5]:
sc = spark.sparkContext

## Load `train.txt` into Spark

In [None]:
train = spark.read.load("gs://gsod_23456/train.txt",format="csv", sep="\t", inferSchema="true", header="false")

## Convert and save data as `train.parquet`

In [None]:
# convert main dataset to parquet for efficienty
# train.write.parquet("data/train.parquet")

In [None]:
train.write.parquet("gs://gsod_23456/data/train.parquet")

## Split `train.parquet` into `training_set.parquet` and `test_set.parquet`

In [8]:
parquetFile = spark.read.parquet("gs://gsod_23456/data/train.parquet")

In [None]:
# split data into train and test set 0.8 train, 0.2 test
splits = parquetFile.randomSplit([0.8,0.2], seed=2019)
training_set =  splits[0]
test_set = splits[1]

# all dataframes
type(parquetFile), type(training_set), type(test_set)

In [None]:
# convert main dataset to parquet for efficienty
# training_set.write.parquet("data/training_set.parquet")
# test_set.write.parquet("data/test_set.parquet")

In [None]:

training_set.write.parquet("gs://gsod_23456/data/training_set.parquet")
test_set.write.parquet("gs://gsod_23456/data/test_set.parquet")

## Split `training_set.parquet` into `train` splits of 1k, 10k, 100k, 1MM & save as parquet

In [None]:
# create toy sets to practice; Seed = 2019 to make the same dataset for us
# only have to run this command once
fraction = 1000000/48000000 
train_1_million = training_set.sample( False, fraction, 2019) # 1 million
train_100000 = training_set.sample( False, fraction/10, 2019) # 100,000
train_10000 = training_set.sample( False, fraction/100, 2019) # 10,000
train_1000 = training_set.sample( False, fraction/1000, 2019) # 1,000

In [None]:
#delete parquet directory if exists
list_parquets_dirs = [ 'train_1_million', 'train_100000', 'train_10000', 'train_1000']
for i in list_parquets_dirs:
    directory_name = 'data/' + i + '.parquet'
    if os.path.exists(directory_name):
        print('deleting', directory_name)
        shutil.rmtree(directory_name)

# use spark dataframe to write parquet files; this command creates a directory and the files within it
start = time.time()
train_1_million.write.parquet("gs://gsod_23456/data/train_1_million.parquet")
train_100000.write.parquet("gs://gsod_23456/data/train_100000.parquet")
train_10000.write.parquet("gs://gsod_23456/data/train_10000.parquet")
train_1000.write.parquet("gs://gsod_23456/data/train_1000.parquet")
print(f'... completed job in {time.time() - start} seconds.')

# start = time.time()
# train_1_million.write.parquet("data/train_1_million.parquet")
# train_100000.write.parquet("data/train_100000.parquet")
# train_10000.write.parquet("data/train_10000.parquet")
# train_1000.write.parquet("data/train_1000.parquet")
# print(f'... completed job in {time.time() - start} seconds.')

### Loading in Data - both from `gsod bucket` and local `data` folder

In [None]:
# reading the parquet files back into the Jupyter notebook
start = time.time()
train_1_million = spark.read.parquet("gs://gsod_23456/data/train_1_million.parquet")
train_100000 = spark.read.parquet("gs://gsod_23456/data/train_100000.parquet")
train_10000 = spark.read.parquet("gs://gsod_23456/data/train_10000.parquet")
train_1000 = spark.read.parquet("gs://gsod_23456/data/train_1000.parquet")
print(f'... completed job in {time.time() - start} seconds.')

In [6]:
# read parquet file
# will focus on 10,000 for most of our EDA
# start = time.time()
# train_1_million_df = spark.read.parquet("data/train_1_million.parquet")
# train_100000_df    = spark.read.parquet("data/train_100000.parquet")
# train_10000_df     = spark.read.parquet("data/train_10000.parquet")
# train_1000_df      = spark.read.parquet("data/train_1000.parquet")
# print(f'... completed job in {time.time() - start} seconds.')

... completed job in 3.667551040649414 seconds.
