## Learning PySpark

### Importing requirements

In [294]:
import os
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark import SparkFiles

import pyspark.sql.functions as F  ## similar to numpy for pandas
from pyspark.sql import types as T

%load_ext nb_black

<IPython.core.display.Javascript object>

### Starting a Spark Context 

In [4]:
sc = SparkContext("local","MyApplication")

### Stopping a Spark Context

In [3]:
sc.stop()

In [5]:
sc

In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Spark Basic Operations").getOrCreate()

<IPython.core.display.Javascript object>

In [288]:
spark

<IPython.core.display.Javascript object>

# PySpark Dataframe

### Creating a Dataframe from Scratch

In [295]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

<IPython.core.display.Javascript object>

In [296]:
schema = StructType(
    [
        StructField(name="city", dataType=StringType(), nullable=True),
        StructField(name="country", dataType=StringType(), nullable=True),
        StructField(name="counts", dataType=LongType(), nullable=False),
    ]
)

<IPython.core.display.Javascript object>

In [297]:
rows = [
    Row("LA", "US", 3),
    Row("New York", "US", 1),
    Row("London", "UK", 3),
]

<IPython.core.display.Javascript object>

In [298]:
parallelizeRows = spark.sparkContext.parallelize(rows)

<IPython.core.display.Javascript object>

In [299]:
df = spark.createDataFrame(parallelizeRows, schema)
df.show()

+--------+-------+------+
|    city|country|counts|
+--------+-------+------+
|      LA|     US|     3|
|New York|     US|     1|
|  London|     UK|     3|
+--------+-------+------+



<IPython.core.display.Javascript object>

### Reading from CSV

In [300]:
df2 = spark.read.csv("finaloutput.csv", inferSchema=True, header=True)

<IPython.core.display.Javascript object>

In [301]:
df2.createOrReplaceTempView("df2_table")

<IPython.core.display.Javascript object>

In [302]:
df2.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- T1A1_1: integer (nullable = true)
 |-- T1A1_2: integer (nullable = true)
 |-- T1B1_1: integer (nullable = true)
 |-- T1B1_2: integer (nullable = true)
 |-- T1B1_3: integer (nullable = true)
 |-- T1C1_1: double (nullable = true)
 |-- T1C1_2: integer (nullable = true)
 |-- T1D1_1: integer (nullable = true)
 |-- T1D1_2: integer (nullable = true)
 |-- T1D1_3: integer (nullable = true)
 |-- T1D1_4: double (nullable = true)
 |-- T1D1_5: integer (nullable = true)
 |-- T1E1_1: integer (nullable = true)
 |-- T1E1_2: integer (nullable = true)
 |-- T1E1_3: integer (nullable = true)
 |-- T1E1_4: double (nullable = true)
 |-- T1F1: integer (nullable = true)
 |-- T2A1_1: integer (nullable = true)
 |-- T2A1_2: integer (nullable = true)
 |-- T2A1_3: integer (nullable = true)
 |-- T2A1_4: integer (nullable = true)
 |-- T2A1_5: integer (nullable = true)
 |-- T2A1_6: double (nullable = true)
 |-- T2B1_1: integer (nullable = true)
 |-- T2B1_2: integer (nullabl

<IPython.core.display.Javascript object>

### Manipulating Columns

In [308]:
df.select("country", "city").show(2)

+-------+--------+
|country|    city|
+-------+--------+
|     US|      LA|
|     US|New York|
+-------+--------+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [309]:
df.select(F.col("country")).show(2)

+-------+
|country|
+-------+
|     US|
|     US|
+-------+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [305]:
df.select("country", "city").show(1)

+-------+----+
|country|city|
+-------+----+
|     US|  LA|
+-------+----+
only showing top 1 row



<IPython.core.display.Javascript object>

In [306]:
df.select(F.expr("Country as destination")).show(2)

+-----------+
|destination|
+-----------+
|         US|
|         US|
+-----------+
only showing top 2 rows



<IPython.core.display.Javascript object>

Changing the column name in an expression. Country -> Destination

In [311]:
df.select(F.expr("Country").alias("destination")).show(2)

+-----------+
|destination|
+-----------+
|         US|
|         US|
+-----------+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [40]:
(df.selectExpr("Country as destination", "Country")).show()

+-----------+-------+
|destination|Country|
+-----------+-------+
|         US|     US|
|         US|     US|
|         UK|     UK|
+-----------+-------+



<IPython.core.display.Javascript object>

In [48]:
new_df = df.selectExpr("avg(counts)")

new_df.show()

+------------------+
|       avg(counts)|
+------------------+
|2.3333333333333335|
+------------------+



<IPython.core.display.Javascript object>

### Adding a new column to a dataframe


In [314]:
df = df.withColumn("New Column Name", F.lit(1))
df.show()

+--------+-------+------+---------------+
|    city|country|counts|New Column Name|
+--------+-------+------+---------------+
|      LA|     US|     3|              1|
|New York|     US|     1|              1|
|  London|     UK|     3|              1|
+--------+-------+------+---------------+



<IPython.core.display.Javascript object>

### Renaming a column

In [58]:
df = df.withColumnRenamed("New Column Name", "Changed Name")
df.show()

+--------+-------+------+------------+
|    city|country|counts|Changed Name|
+--------+-------+------+------------+
|      LA|     US|     3|           1|
|New York|     US|     1|           1|
|  London|     UK|     3|           1|
+--------+-------+------+------------+



<IPython.core.display.Javascript object>

### Removing Columns

In [59]:
df = df.drop("Changed Name")
df.show()

+--------+-------+------+
|    city|country|counts|
+--------+-------+------+
|      LA|     US|     3|
|New York|     US|     1|
|  London|     UK|     3|
+--------+-------+------+



<IPython.core.display.Javascript object>

### Dataframe filtering

In [60]:
df.filter(F.col("counts") < 2).show()

+--------+-------+------+
|    city|country|counts|
+--------+-------+------+
|New York|     US|     1|
+--------+-------+------+



<IPython.core.display.Javascript object>

In [62]:
df.where("counts>2").show(2)

+------+-------+------+
|  city|country|counts|
+------+-------+------+
|    LA|     US|     3|
|London|     UK|     3|
+------+-------+------+



<IPython.core.display.Javascript object>

In [64]:
df.where(F.col("counts") <= 1).where(F.col("country") == "US").show()

+--------+-------+------+
|    city|country|counts|
+--------+-------+------+
|New York|     US|     1|
+--------+-------+------+



<IPython.core.display.Javascript object>

### Get Distinct Rows

In [66]:
df.select("city").distinct().count()

3

<IPython.core.display.Javascript object>

### Get RandomSamples

In [67]:
df.sample(withReplacement=False, fraction=1.0, seed=5).count()

3

<IPython.core.display.Javascript object>

### Random Splits for train test split

In [74]:
df2 = df.randomSplit([0.67, 0.33], seed=5)
df2

[DataFrame[city: string, country: string, counts: bigint],
 DataFrame[city: string, country: string, counts: bigint]]

<IPython.core.display.Javascript object>

### Concatenating and Appending Rows

In [75]:
rows = [
    Row("Berlin", "Germany", 2),
    Row("Singapore", "Singapore", 1),
]
paralplelizeRows = spark.sparkContext.parallelize(rows)
df2 = spark.createDataFrame(rows, schema)

<IPython.core.display.Javascript object>

In [76]:
df3 = df.union(df2) ## Union of df with df2 
df3.show()

+---------+---------+------+
|     city|  country|counts|
+---------+---------+------+
|       LA|       US|     3|
| New York|       US|     1|
|   London|       UK|     3|
|   Berlin|  Germany|     2|
|Singapore|Singapore|     1|
+---------+---------+------+



<IPython.core.display.Javascript object>

### Sorting
Using either "sort" or "orderBy"

In [77]:
df3.sort("counts").show()

+---------+---------+------+
|     city|  country|counts|
+---------+---------+------+
| New York|       US|     1|
|Singapore|Singapore|     1|
|   Berlin|  Germany|     2|
|       LA|       US|     3|
|   London|       UK|     3|
+---------+---------+------+



<IPython.core.display.Javascript object>

In [79]:
df3.sort(F.desc("counts")).show()

+---------+---------+------+
|     city|  country|counts|
+---------+---------+------+
|       LA|       US|     3|
|   London|       UK|     3|
|   Berlin|  Germany|     2|
| New York|       US|     1|
|Singapore|Singapore|     1|
+---------+---------+------+



<IPython.core.display.Javascript object>

In [80]:
df3.orderBy(F.desc("counts")).show()

+---------+---------+------+
|     city|  country|counts|
+---------+---------+------+
|       LA|       US|     3|
|   London|       UK|     3|
|   Berlin|  Germany|     2|
| New York|       US|     1|
|Singapore|Singapore|     1|
+---------+---------+------+



<IPython.core.display.Javascript object>

# UFC Data

In [123]:
import os
from os.path import isfile, join

loc = os.path.abspath("")
data_loc = f"{loc}/data/ufcdata/"
data_files = [f for f in os.listdir(data_loc) if isfile(join(data_loc, f))]
print(data_files)

['raw_total_fight_data.csv', 'raw_fighter_details.csv', 'data.csv', 'preprocessed_data.csv']


<IPython.core.display.Javascript object>

In [124]:
data = {}
for file in data_files:
    data[file[:-4]] = spark.read.csv(f"{data_loc}{file}", inferSchema=True, header=True)
    data[file[:-4]].createOrReplaceTempView(f"data_{file[:-4]}")
print(data.keys())

dict_keys(['raw_total_fight_data', 'raw_fighter_details', 'data', 'preprocessed_data'])


<IPython.core.display.Javascript object>

In [125]:
data["data"] = (
    data["data"]
    .where(F.col("date") > "2017-01-01")
    .withColumn(
        "Winner_name",
        F.when(F.col("Winner") == "Red", F.col("R_fighter")).otherwise(
            F.col("B_fighter")
        ),
    )
    .withColumn(
        "Loser_name",
        F.when(F.col("Winner") == "Red", F.col("B_fighter")).otherwise(
            F.col("R_fighter")
        ),
    )
)

<IPython.core.display.Javascript object>

In [126]:
losses = (
    data["data"]
    .withColumnRenamed("Loser_name", "name")
    .groupBy("name")
    .count()
    .select("name", F.col("count").alias("losses"))
)

wins = (
    data["data"]
    .withColumnRenamed("Winner_name", "name")
    .groupBy("name")
    .count()
    .select("name", F.col("count").alias("wins"))
)

<IPython.core.display.Javascript object>

In [127]:
red = data["data"].select("R_fighter").distinct()
blue = data["data"].select("B_fighter").distinct()
check = red.union(blue).distinct().count()
print(check)

763


<IPython.core.display.Javascript object>

In [128]:
print(wins.distinct().count())
print(losses.distinct().count())

535
645


<IPython.core.display.Javascript object>

### Inner Join

In [129]:
win_loss_data = wins.join(losses, "name")
print(win_loss_data.distinct().count())

417


<IPython.core.display.Javascript object>

In [130]:
win_loss_data = wins.join(losses, "name", "left")
print(win_loss_data.distinct().count())

535


<IPython.core.display.Javascript object>

In [131]:
win_loss_data = wins.join(losses, "name", "right")
print(win_loss_data.distinct().count())

645


<IPython.core.display.Javascript object>

In [132]:
win_loss_data = wins.join(losses, "name", "outer")
print(win_loss_data.distinct().count())

763


<IPython.core.display.Javascript object>

In [118]:
win_loss_data.orderBy(F.desc("losses")).show(5)

+---------------+----+------+
|           name|wins|losses|
+---------------+----+------+
|Andrei Arlovski|   2|     6|
| Donald Cerrone|   4|     5|
|    Angela Hill|   3|     5|
|     Jim Miller|   2|     5|
|   Ben Saunders|   2|     5|
+---------------+----+------+
only showing top 5 rows



<IPython.core.display.Javascript object>

In [119]:
win_loss_data.where(F.col("name") == "Conor McGregor").show()

+--------------+----+------+
|          name|wins|losses|
+--------------+----+------+
|Conor McGregor|null|     1|
+--------------+----+------+



<IPython.core.display.Javascript object>

In [133]:
### Figuring out Age of Fighters
win_loss_data = win_loss_data.join(
    data["raw_fighter_details"].withColumnRenamed("fighter_name", "name"),
    "name",
    "left",
)

<IPython.core.display.Javascript object>

In [134]:
win_loss_data = (
    win_loss_data.withColumn("birthday", F.to_date(F.col("DOB"), "MMM dd, yyyy"))
    .withColumn(
        "age", F.round(F.months_between(F.current_date(), F.col("birthday")) / 12, 1)
    )
    .select("name", "wins", "losses", "age")
)

<IPython.core.display.Javascript object>

In [137]:
win_loss_data.show(5)

+--------------------+----+------+----+
|                name|wins|losses| age|
+--------------------+----+------+----+
|Abdul Razak Alhassan|   3|     1|34.7|
|  Abdul-Kerim Edilov|   1|  null|28.4|
|       Abel Trujillo|null|     2|36.6|
|         Abu Azaitar|   1|  null|34.1|
|       Adam Milstead|null|     3|32.8|
+--------------------+----+------+----+
only showing top 5 rows



<IPython.core.display.Javascript object>

In [139]:
win_loss_data = win_loss_data.join(
    data["raw_fighter_details"].withColumnRenamed("fighter_name", "name"),
    "name",
    "left",
)

+--------------------+----+------+----+
|                name|wins|losses| age|
+--------------------+----+------+----+
|Abdul Razak Alhassan|   3|     1|34.7|
|  Abdul-Kerim Edilov|   1|  null|28.4|
+--------------------+----+------+----+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [124]:
win_loss_data.where(F.col("losses").isNull()).orderBy(
    F.asc("age"), F.desc("wins")
).where("wins>5").show()

+--------------------+----+------+----+
|                name|wins|losses| age|
+--------------------+----+------+----+
|     Israel Adesanya|   6|  null|30.7|
|      Dustin Poirier|   6|  null|31.2|
|Alexander Volkano...|   6|  null|31.6|
|        Kamaru Usman|   6|  null|32.9|
+--------------------+----+------+----+



<IPython.core.display.Javascript object>

## Date and Timestamps

In [126]:
rows = [
    Row("2020-01-03"),
    Row("2020 01 10"),
    Row("2020 Jan 10"),
    Row("Sat, 11 Jan 2020"),
]

myrdd = spark.sparkContext.parallelize(rows)

schema = T.StructType(
    [T.StructField(name="date_str", dataType=T.StringType(), nullable=True)]
)

df = spark.createDataFrame(myrdd, schema)

<IPython.core.display.Javascript object>

In [127]:
df.show()

+----------------+
|        date_str|
+----------------+
|      2020-01-03|
|      2020 01 10|
|     2020 Jan 10|
|Sat, 11 Jan 2020|
+----------------+



<IPython.core.display.Javascript object>

In [36]:
rdd = sc.parallelize(range(100000000))

In [37]:
rdd.count()

100000000

In [3]:
conf = SparkConf().setAppName("PySpark Demo App").setMaster("local[2]")
conf.get("spark.master")

'local[2]'

In [4]:
conf.get("spark.app.name")

'PySpark Demo App'

In [6]:
df = spark.read.options(header=True,inferSchema=True).csv("finaloutput.csv")


In [15]:
import re, string

In [16]:
txt = sc.textFile("text.txt")

In [17]:
txt.take(5)

['This is a text file',
 'This is a text file',
 'This is a text file',
 'This is a text file',
 'This is a text fileThis is a text file']

In [18]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## PySpark ML Packages

### Binarizer 
Allows you to create an extra column that gives a 1.0 if it exceeds a predetermined threshold and a 0.0 if it falls below a threshold. 

In [249]:
from pyspark.ml.feature import Binarizer

<IPython.core.display.Javascript object>

In [250]:
df = win_loss_data
df.show(5)

+--------------------+----+------+----+
|                name|wins|losses| age|
+--------------------+----+------+----+
|Abdul Razak Alhassan|   3|     1|34.7|
|  Abdul-Kerim Edilov|   1|  null|28.4|
|       Abel Trujillo|null|     2|36.6|
|         Abu Azaitar|   1|  null|34.1|
|       Adam Milstead|null|     3|32.8|
+--------------------+----+------+----+
only showing top 5 rows



<IPython.core.display.Javascript object>

In [251]:
binarizer = Binarizer(threshold=30, inputCol="age", outputCol="binarized_age")
binarizer.transform(df).show(20)

+--------------------+----+------+----+-------------+
|                name|wins|losses| age|binarized_age|
+--------------------+----+------+----+-------------+
|Abdul Razak Alhassan|   3|     1|34.7|          1.0|
|  Abdul-Kerim Edilov|   1|  null|28.4|          0.0|
|       Abel Trujillo|null|     2|36.6|          1.0|
|         Abu Azaitar|   1|  null|34.1|          1.0|
|       Adam Milstead|null|     3|32.8|          1.0|
|      Adam Wieczorek|   2|     1|28.2|          0.0|
|        Adam Yandiev|null|     1|31.7|          1.0|
|     Adriano Martins|null|     1|37.3|          1.0|
|      Aiemann Zahabi|   1|     2|32.4|          1.0|
|         Al Iaquinta|   2|     2|33.0|          1.0|
|         Alan Jouban|   1|     3|38.4|          1.0|
|        Alan Patrick|   1|     1|36.8|          1.0|
|      Albert Morales|   1|     3|28.9|          0.0|
|        Alberto Mina|null|     1|38.0|          1.0|
|     Alejandro Perez|   4|     1|30.6|          1.0|
|    Aleksandar Rakic|   4| 

<IPython.core.display.Javascript object>

### StringIndexer
Function to provide a unique index for a particular column in question 

In [252]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="name", outputCol="name_index")
indexerModel = indexer.fit(df)
index = indexerModel.transform(df)
index.show(5)

+--------------------+----+------+----+----------+
|                name|wins|losses| age|name_index|
+--------------------+----+------+----+----------+
|Abdul Razak Alhassan|   3|     1|34.7|       0.0|
|  Abdul-Kerim Edilov|   1|  null|28.4|       1.0|
|       Abel Trujillo|null|     2|36.6|       2.0|
|         Abu Azaitar|   1|  null|34.1|       3.0|
|       Adam Milstead|null|     3|32.8|       4.0|
+--------------------+----+------+----+----------+
only showing top 5 rows



<IPython.core.display.Javascript object>

### One-Hot-Encoder

In [253]:
from pyspark.ml.feature import OneHotEncoder

<IPython.core.display.Javascript object>

In [254]:
encoder = OneHotEncoder(inputCols=["name_index"], outputCols=["encoded_name"])

encoderModel = encoder.fit(index)

encodedDF = encoderModel.transform(index)

encodedDF.show()

+--------------------+----+------+----+----------+----------------+
|                name|wins|losses| age|name_index|    encoded_name|
+--------------------+----+------+----+----------+----------------+
|Abdul Razak Alhassan|   3|     1|34.7|       0.0| (762,[0],[1.0])|
|  Abdul-Kerim Edilov|   1|  null|28.4|       1.0| (762,[1],[1.0])|
|       Abel Trujillo|null|     2|36.6|       2.0| (762,[2],[1.0])|
|         Abu Azaitar|   1|  null|34.1|       3.0| (762,[3],[1.0])|
|       Adam Milstead|null|     3|32.8|       4.0| (762,[4],[1.0])|
|      Adam Wieczorek|   2|     1|28.2|       5.0| (762,[5],[1.0])|
|        Adam Yandiev|null|     1|31.7|       6.0| (762,[6],[1.0])|
|     Adriano Martins|null|     1|37.3|       7.0| (762,[7],[1.0])|
|      Aiemann Zahabi|   1|     2|32.4|       8.0| (762,[8],[1.0])|
|         Al Iaquinta|   2|     2|33.0|       9.0| (762,[9],[1.0])|
|         Alan Jouban|   1|     3|38.4|      10.0|(762,[10],[1.0])|
|        Alan Patrick|   1|     1|36.8|      11.

<IPython.core.display.Javascript object>

# Pipeline
Fitting different estimators and transformers into one


In [156]:
from pyspark.ml import Pipeline

<IPython.core.display.Javascript object>

In [157]:
pipeline = Pipeline(stages=[binarizer, indexer])

<IPython.core.display.Javascript object>

In [158]:
pipelineModel = pipeline.fit(df)
transformeddf = pipelineModel.transform(df)
transformeddf.show()

+--------------------+----+------+----+-------------+----------+
|                name|wins|losses| age|binarized_age|name_index|
+--------------------+----+------+----+-------------+----------+
|Abdul Razak Alhassan|   3|     1|34.7|          1.0|       0.0|
|  Abdul-Kerim Edilov|   1|  null|28.4|          0.0|       1.0|
|       Abel Trujillo|null|     2|36.6|          1.0|       2.0|
|         Abu Azaitar|   1|  null|34.1|          1.0|       3.0|
|       Adam Milstead|null|     3|32.8|          1.0|       4.0|
|      Adam Wieczorek|   2|     1|28.2|          0.0|       5.0|
|        Adam Yandiev|null|     1|31.7|          1.0|       6.0|
|     Adriano Martins|null|     1|37.3|          1.0|       7.0|
|      Aiemann Zahabi|   1|     2|32.4|          1.0|       8.0|
|         Al Iaquinta|   2|     2|33.0|          1.0|       9.0|
|         Alan Jouban|   1|     3|38.4|          1.0|      10.0|
|        Alan Patrick|   1|     1|36.8|          1.0|      11.0|
|      Albert Morales|   

<IPython.core.display.Javascript object>

### Train/Test Split

In [265]:
seed = 42
(testDF, trainDF) = df.randomSplit((0.2, 0.8), seed=seed)

print(testDF.count(), trainDF.count())

154 609


<IPython.core.display.Javascript object>

In [266]:
df.show()

+--------------------+----+------+----+
|                name|wins|losses| age|
+--------------------+----+------+----+
|Abdul Razak Alhassan|   3|     1|34.7|
|  Abdul-Kerim Edilov|   1|  null|28.4|
|       Abel Trujillo|null|     2|36.6|
|         Abu Azaitar|   1|  null|34.1|
|       Adam Milstead|null|     3|32.8|
|      Adam Wieczorek|   2|     1|28.2|
|        Adam Yandiev|null|     1|31.7|
|     Adriano Martins|null|     1|37.3|
|      Aiemann Zahabi|   1|     2|32.4|
|         Al Iaquinta|   2|     2|33.0|
|         Alan Jouban|   1|     3|38.4|
|        Alan Patrick|   1|     1|36.8|
|      Albert Morales|   1|     3|28.9|
|        Alberto Mina|null|     1|38.0|
|     Alejandro Perez|   4|     1|30.6|
|    Aleksandar Rakic|   4|  null|28.2|
|   Aleksei Kunchenko|   2|  null|36.0|
|     Aleksei Oleinik|   4|     2|42.8|
|      Alen Amedovski|null|     1|32.0|
|    Alessandro Ricci|null|     1|37.7|
+--------------------+----+------+----+
only showing top 20 rows



<IPython.core.display.Javascript object>

In [276]:
df = df.na.drop()

<IPython.core.display.Javascript object>

In [277]:
df.show()

+--------------------+----+------+----+
|                name|wins|losses| age|
+--------------------+----+------+----+
|Abdul Razak Alhassan|   3|     1|34.7|
|      Adam Wieczorek|   2|     1|28.2|
|      Aiemann Zahabi|   1|     2|32.4|
|         Al Iaquinta|   2|     2|33.0|
|         Alan Jouban|   1|     3|38.4|
|        Alan Patrick|   1|     1|36.8|
|      Albert Morales|   1|     3|28.9|
|     Alejandro Perez|   4|     1|30.6|
|     Aleksei Oleinik|   4|     2|42.8|
|  Alessio Di Chirico|   2|     1|30.4|
|        Alex Caceres|   2|     3|31.8|
|         Alex Garcia|   1|     3|32.8|
|         Alex Morono|   3|     3|29.7|
|       Alex Oliveira|   4|     3|32.2|
|          Alex Perez|   4|     1|28.1|
|          Alex White|   2|     3|31.5|
|        Alexa Grasso|   2|     2|26.7|
|Alexander Gustafsson|   1|     2|33.3|
| Alexander Hernandez|   2|     1|27.5|
|    Alexander Volkov|   3|     1|31.5|
+--------------------+----+------+----+
only showing top 20 rows



<IPython.core.display.Javascript object>

In [278]:
featureassembler = VectorAssembler(inputCols=["losses", "age",], outputCol="features",)

output = featureassembler.transform(df)

<IPython.core.display.Javascript object>

In [279]:
output.show()

+--------------------+----+------+----+----------+
|                name|wins|losses| age|  features|
+--------------------+----+------+----+----------+
|Abdul Razak Alhassan|   3|     1|34.7|[1.0,34.7]|
|      Adam Wieczorek|   2|     1|28.2|[1.0,28.2]|
|      Aiemann Zahabi|   1|     2|32.4|[2.0,32.4]|
|         Al Iaquinta|   2|     2|33.0|[2.0,33.0]|
|         Alan Jouban|   1|     3|38.4|[3.0,38.4]|
|        Alan Patrick|   1|     1|36.8|[1.0,36.8]|
|      Albert Morales|   1|     3|28.9|[3.0,28.9]|
|     Alejandro Perez|   4|     1|30.6|[1.0,30.6]|
|     Aleksei Oleinik|   4|     2|42.8|[2.0,42.8]|
|  Alessio Di Chirico|   2|     1|30.4|[1.0,30.4]|
|        Alex Caceres|   2|     3|31.8|[3.0,31.8]|
|         Alex Garcia|   1|     3|32.8|[3.0,32.8]|
|         Alex Morono|   3|     3|29.7|[3.0,29.7]|
|       Alex Oliveira|   4|     3|32.2|[3.0,32.2]|
|          Alex Perez|   4|     1|28.1|[1.0,28.1]|
|          Alex White|   2|     3|31.5|[3.0,31.5]|
|        Alexa Grasso|   2|    

<IPython.core.display.Javascript object>

In [280]:
final_data = output.select("features", "wins")
final_data.show()

+----------+----+
|  features|wins|
+----------+----+
|[1.0,34.7]|   3|
|[1.0,28.2]|   2|
|[2.0,32.4]|   1|
|[2.0,33.0]|   2|
|[3.0,38.4]|   1|
|[1.0,36.8]|   1|
|[3.0,28.9]|   1|
|[1.0,30.6]|   4|
|[2.0,42.8]|   4|
|[1.0,30.4]|   2|
|[3.0,31.8]|   2|
|[3.0,32.8]|   1|
|[3.0,29.7]|   3|
|[3.0,32.2]|   4|
|[1.0,28.1]|   4|
|[3.0,31.5]|   2|
|[2.0,26.7]|   2|
|[2.0,33.3]|   1|
|[1.0,27.5]|   2|
|[1.0,31.5]|   3|
+----------+----+
only showing top 20 rows



<IPython.core.display.Javascript object>

In [283]:
(testDF, trainDF) = final_data.randomSplit((0.2, 0.8), seed=seed)

<IPython.core.display.Javascript object>

In [284]:
regressor = LinearRegression(featuresCol="features", labelCol="wins")
regressor = regressor.fit(trainDF)

<IPython.core.display.Javascript object>

In [285]:
pred_results = regressor.evaluate(testDF)
pred_results.predictions.show()

+----------+----+------------------+
|  features|wins|        prediction|
+----------+----+------------------+
|[1.0,25.1]|   1|2.1494544779285687|
|[1.0,26.8]|   2|2.1716017712195512|
|[1.0,27.3]|   1|2.1781156810110165|
|[1.0,27.8]|   1| 2.184629590802482|
|[1.0,28.3]|   1|2.1911435005939475|
|[1.0,28.3]|   2|2.1911435005939475|
|[1.0,29.2]|   2|2.2028685382185853|
|[1.0,29.2]|   6|2.2028685382185853|
|[1.0,30.0]|   1|  2.21329079388493|
|[1.0,30.2]|   2|2.2158963578015163|
|[1.0,30.3]|   3|2.2171991397598094|
|[1.0,30.4]|   1|2.2185019217181026|
|[1.0,30.4]|   2|2.2185019217181026|
|[1.0,30.5]|   3|2.2198047036763957|
|[1.0,31.3]|   3|2.2302269593427404|
|[1.0,31.7]|   1| 2.235438087175913|
|[1.0,31.8]|   2|2.2367408691342057|
|[1.0,32.6]|   2| 2.247163124800551|
|[1.0,32.8]|   4|2.2497686887171366|
|[1.0,32.8]|   5|2.2497686887171366|
+----------+----+------------------+
only showing top 20 rows



<IPython.core.display.Javascript object>

### Linear Regression with PySpark
Machine Learning with pySpark follows a slightly different train/test configuration than pandas/sklearn. Features used to train the module cannot be fed individually, but instead has to be merged into a single 1D array/vector.

In [244]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

<IPython.core.display.Javascript object>

In [237]:
df = spark.read.csv("Ecommerce_Customers.csv", header=True, inferSchema=True)

<IPython.core.display.Javascript object>

In [238]:
df.show(5)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

<IPython.core.display.Javascript object>

### Merging all of the features column into a single 1D Array

In [241]:
featureassembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="Independent Features",
)

output = featureassembler.transform(df)

<IPython.core.display.Javascript object>

In [245]:
output.show(5)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

<IPython.core.display.Javascript object>

In [246]:
output.select("Independent Features").show(5)

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
+--------------------+
only showing top 5 rows



<IPython.core.display.Javascript object>

As we can see, the individual feature columns are merged into a 1D array / list

In [1]:
finalized_data = output.select("Independent Features", "Yearly Amount Spent")
finalized_data.show()

NameError: name 'output' is not defined

Features vs the label that it is trained to predict

### Train / Test Split

In [230]:
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

<IPython.core.display.Javascript object>

### Creating a Linear Regression Model

In [231]:
regressor = LinearRegression(
    featuresCol="Independent Features", labelCol="Yearly Amount Spent"
)
regressor = regressor.fit(train_data)

<IPython.core.display.Javascript object>

### Retrieving the regression model coefficients and y-intercept

In [232]:
regressor.coefficients

DenseVector([25.521, 38.6568, 0.3783, 61.8635])

<IPython.core.display.Javascript object>

In [233]:
regressor.intercept

-1042.9805113206075

<IPython.core.display.Javascript object>

### View predicted results

In [236]:
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[31.04722214,11.1...|        392.4973992|388.03602576598996|
|[31.06621816,11.7...|        448.9332932| 462.2577453524136|
|[31.26064687,13.2...|        421.3266313| 421.9188460362777|
|[31.52575242,11.3...|        443.9656268| 449.7465603788896|
|[31.5261979,12.04...|        409.0945262|417.96921802610655|
|[31.76561882,12.4...|        496.5540816| 501.3181849586658|
|[31.80930032,11.6...|        536.7718994| 548.5990038657892|
|[31.8530748,12.14...|        459.2851235|461.69120557200813|
|[31.86483255,13.4...|        439.8912805|449.93670175405623|
|[31.97648006,10.7...|         330.594446| 325.0759942820059|
|[32.00475302,11.3...|        463.7459811|463.72724278272995|
|[32.01230077,12.1...|        492.9450531| 489.3389718937801|
|[32.04781463,12.4...|        497.3895578|480.85345254870754|
|[32.088

<IPython.core.display.Javascript object>