In [1]:
import requests
import json

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_timestamp

In [3]:
sc = SparkContext()
spark = SparkSession(sc).builder.appName("Test Session").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/22 19:01:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/22 19:01:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Stop the Spark session if no longer needed
spark.stop()

# Spark with latest jobs

In [6]:
sql_command = """
    SELECT
        scraped_at,
        company_name,
        job_title,
        location,
        job_page_url
    FROM jobs
    ORDER BY job_id DESC
    ;"""

api_url = "http://192.168.0.29:5000/execute_sql"
response = requests.post(api_url, json={"sql_command": sql_command})

if response.status_code == 200:
    resp = response.json()
else:
    print(f"Failed to fetch data: {response.status_code}, {response.text}")

In [7]:
resp.keys()

dict_keys(['columns', 'data'])

In [8]:
schema = resp['columns']
data = resp['data']
df = spark.createDataFrame(data, schema)
df.show(5)

                                                                                

+--------------------+------------+--------------------+--------------------+--------------------+
|          scraped_at|company_name|           job_title|            location|        job_page_url|
+--------------------+------------+--------------------+--------------------+--------------------+
|2024-06-22 12:44:...|    Morguard|Golf Controller (...|551 Powerline Rd,...|https://jobs.smar...|
|2024-06-22 12:42:...|       MCPC |     Network Analyst|Sandusky, OH 4487...|https://jobs.smar...|
|2024-06-22 12:42:...|Maven Clinic|Senior Member Gro...|        New York, NY|https://boards.gr...|
|2024-06-22 12:42:...|Maven Clinic|Senior Member Gro...|        New York, NY|https://boards.gr...|
|2024-06-22 12:41:...| Lingraphica|Care Partner Succ...|700 Alexander Par...|https://jobs.smar...|
+--------------------+------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
df.describe()

DataFrame[summary: string, scraped_at: string, company_name: string, job_title: string, location: string, job_page_url: string]

In [12]:
from pyspark.sql.functions import col, to_timestamp
# Convert string to timestamp
df = df.withColumn("scraped_at", to_timestamp(col("scraped_at")))

In [15]:
df.describe()

DataFrame[summary: string, company_name: string, job_title: string, location: string, job_page_url: string]

In [14]:
df.show(5)

+--------------------+------------+--------------------+--------------------+--------------------+
|          scraped_at|company_name|           job_title|            location|        job_page_url|
+--------------------+------------+--------------------+--------------------+--------------------+
|2024-06-22 12:44:...|    Morguard|Golf Controller (...|551 Powerline Rd,...|https://jobs.smar...|
|2024-06-22 12:42:...|       MCPC |     Network Analyst|Sandusky, OH 4487...|https://jobs.smar...|
|2024-06-22 12:42:...|Maven Clinic|Senior Member Gro...|        New York, NY|https://boards.gr...|
|2024-06-22 12:42:...|Maven Clinic|Senior Member Gro...|        New York, NY|https://boards.gr...|
|2024-06-22 12:41:...| Lingraphica|Care Partner Succ...|700 Alexander Par...|https://jobs.smar...|
+--------------------+------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [30]:
df.limit(5).toPandas()

Unnamed: 0,scraped_at,company_name,job_title,location,job_page_url
0,2024-06-22 12:44:57.577981,Morguard,Golf Controller (ClubLink),"551 Powerline Rd, Pompano Beach, FL 33069, USA",https://jobs.smartrecruiters.com/Morguard/7439...
1,2024-06-22 12:42:51.158376,MCPC,Network Analyst,"Sandusky, OH 44870, USA",https://jobs.smartrecruiters.com/MCPc/74399999...
2,2024-06-22 12:42:18.237898,Maven Clinic,Senior Member Growth Marketing Manager (Payer),"New York, NY",https://boards.greenhouse.io/mavenclinic/jobs/...
3,2024-06-22 12:42:18.048890,Maven Clinic,Senior Member Growth Marketing Associate (Payer),"New York, NY",https://boards.greenhouse.io/mavenclinic/jobs/...
4,2024-06-22 12:41:22.926955,Lingraphica,Care Partner Success Specialist - Bilingual En...,"700 Alexander Park Dr, Princeton, NJ 08540, USA",https://jobs.smartrecruiters.com/Lingraphica/7...


In [34]:
df.describe()

DataFrame[summary: string, company_name: string, job_title: string, location: string, job_page_url: string]

In [35]:
filtered_df = df.where(df["company_name"] == 'Experian')
filtered_df.show()

+--------------------+------------+--------------------+--------------------+--------------------+
|          scraped_at|company_name|           job_title|            location|        job_page_url|
+--------------------+------------+--------------------+--------------------+--------------------+
|2024-06-21 17:31:...|    Experian|Data Acquisition ...|955 American Lane...|https://jobs.smar...|
|2024-06-21 17:31:...|    Experian|Procurement Contr...|475 Anton Blvd, C...|https://jobs.smar...|
|2024-06-21 17:31:...|    Experian|Revenue Cycle Sol...|., ., ., United S...|https://jobs.smar...|
|2024-06-21 17:31:...|    Experian|Revenue Cycle Sol...|., ., ., United S...|https://jobs.smar...|
|2024-06-21 15:35:...|    Experian|Analista de Desen...|Avenida das Naçõe...|https://jobs.smar...|
|2024-06-21 15:35:...|    Experian|Human Resources G...|475 Anton Blvd, C...|https://jobs.smar...|
|2024-06-21 15:35:...|    Experian|Service Operation...|Bogotá- Carrera 7...|https://jobs.smar...|
|2024-06-2

In [38]:
filtered_df.count()

24/06/22 14:05:37 WARN TaskSetManager: Stage 19 contains a task of very large size (1236 KiB). The maximum recommended task size is 1000 KiB.


508

In [39]:
grouped_df = df.groupBy("company_name").count().orderBy("count", ascending=False)
grouped_df.show()

24/06/22 14:05:41 WARN TaskSetManager: Stage 22 contains a task of very large size (1236 KiB). The maximum recommended task size is 1000 KiB.


+-------------------+-----+
|       company_name|count|
+-------------------+-----+
|        Bosch Group| 5762|
|    Publicis Groupe| 4438|
|                   | 1170|
|       Sopra Steria| 1153|
|         ServiceNow| 1046|
|               Visa|  964|
|                ITW|  591|
| Palo Alto Networks|  522|
|         Epic Games|  520|
|   Sonic Automotive|  514|
|           Experian|  508|
|                OKX|  488|
|    Western Digital|  487|
|       NBCUniversal|  476|
|WNS Global Services|  465|
|              Capco|  429|
|               Hibu|  409|
|          JD Sports|  384|
|              Block|  353|
|            Celonis|  330|
+-------------------+-----+
only showing top 20 rows



In [41]:
grouped_df = df.groupBy("job_title").count().orderBy("count", ascending=False)
grouped_df.show()

24/06/22 14:06:11 WARN TaskSetManager: Stage 28 contains a task of very large size (1236 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+-----+
|           job_title|count|
+--------------------+-----+
|                    | 1170|
|Outside Sales Rep...|  401|
|     Sales Assistant|  144|
|   Account Executive|  122|
|Senior Customer R...|  116|
|Educational Sales...|  105|
|Senior Software E...|   68|
|   Software Engineer|   54|
|     Project Manager|   49|
|        Art Director|   49|
|  Dynamic PC Support|   48|
|Staff Software En...|   46|
|Customer Service ...|   45|
|Strategic Develop...|   44|
|     Account Manager|   44|
|  Service Technician|   43|
|Sales Development...|   37|
|          Copywriter|   36|
|Business Developm...|   35|
|Outside Sales Acc...|   34|
+--------------------+-----+
only showing top 20 rows



# Spark with Titanic

In [4]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("titanic.csv"))

In [5]:
df.show(5)

+------+--------+--------------------+------+------+-----+-----+------+--------+-------+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|ticket|    fare|  cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+------+--------+-------+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|    29|    0|    0| 24160|211.3375|     B5|       S|   2|NULL|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|113781|151.5500|C22 C26|       S|  11|NULL|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|     2|    1|    2|113781|151.5500|C22 C26|       S|NULL|NULL|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|    30|    1|    2|113781|151.5500|C22 C26|       S|NULL| 135|Montreal, PQ / Ch...|
|     1|       0|Allison, Mrs. Hud...|female|    25|    1|    2|113781|151.5500|C22 C26|       S|

In [6]:
df.columns

['pclass',
 'survived',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked',
 'boat',
 'body',
 'home.dest']

In [7]:
df.dtypes

[('pclass', 'string'),
 ('survived', 'string'),
 ('name', 'string'),
 ('sex', 'string'),
 ('age', 'string'),
 ('sibsp', 'string'),
 ('parch', 'string'),
 ('ticket', 'string'),
 ('fare', 'string'),
 ('cabin', 'string'),
 ('embarked', 'string'),
 ('boat', 'string'),
 ('body', 'string'),
 ('home.dest', 'string')]

In [8]:
dataset = df.select(
    col('survived').cast('float'),
    col('pclass').cast('float'),
    col('sex'),
    col('age').cast('float'),
    col('fare').cast('float'),
    col('embarked')
)
dataset.show()

+--------+------+------+------+--------+--------+
|survived|pclass|   sex|   age|    fare|embarked|
+--------+------+------+------+--------+--------+
|     1.0|   1.0|female|  29.0|211.3375|       S|
|     1.0|   1.0|  male|0.9167|  151.55|       S|
|     0.0|   1.0|female|   2.0|  151.55|       S|
|     0.0|   1.0|  male|  30.0|  151.55|       S|
|     0.0|   1.0|female|  25.0|  151.55|       S|
|     1.0|   1.0|  male|  48.0|   26.55|       S|
|     1.0|   1.0|female|  63.0| 77.9583|       S|
|     0.0|   1.0|  male|  39.0|     0.0|       S|
|     1.0|   1.0|female|  53.0| 51.4792|       S|
|     0.0|   1.0|  male|  71.0| 49.5042|       C|
|     0.0|   1.0|  male|  47.0| 227.525|       C|
|     1.0|   1.0|female|  18.0| 227.525|       C|
|     1.0|   1.0|female|  24.0|    69.3|       C|
|     1.0|   1.0|female|  26.0|   78.85|       S|
|     1.0|   1.0|  male|  80.0|    30.0|       S|
|     0.0|   1.0|  male|  NULL|  25.925|       S|
|     0.0|   1.0|  male|  24.0|247.5208|       C|


In [21]:
from pyspark.sql.functions import isnull, when, count, col

[count(when(isnull(c), 1)).alias(c) for c in dataset.columns]

[Column<'count(CASE WHEN (survived IS NULL) THEN 1 END) AS survived'>,
 Column<'count(CASE WHEN (pclass IS NULL) THEN 1 END) AS pclass'>,
 Column<'count(CASE WHEN (sex IS NULL) THEN 1 END) AS sex'>,
 Column<'count(CASE WHEN (age IS NULL) THEN 1 END) AS age'>,
 Column<'count(CASE WHEN (fare IS NULL) THEN 1 END) AS fare'>,
 Column<'count(CASE WHEN (embarked IS NULL) THEN 1 END) AS embarked'>]

In [9]:
dataset.columns

['survived', 'pclass', 'sex', 'age', 'fare', 'embarked']

In [10]:
type(dataset.columns)

list

In [17]:
count(when(isnull('embarked'), 'embarked')).alias('embarked')

Column<'count(CASE WHEN (embarked IS NULL) THEN embarked END) AS embarked'>

In [18]:
when(isnull('embarked'), 'embarked')

Column<'CASE WHEN (embarked IS NULL) THEN embarked END'>

In [None]:
count(when(isnull(), c)).alias(c)

In [11]:
dataset.select([count(when(isnull(c), 1)).alias(c+"_nulls") for c in dataset.columns]).show()

NameError: name 'count' is not defined

In [None]:
dataset.dropna(how='any').show()

+--------+------+------+------+--------+--------+
|survived|pclass|   sex|   age|    fare|embarked|
+--------+------+------+------+--------+--------+
|     1.0|   1.0|female|  29.0|211.3375|       S|
|     1.0|   1.0|  male|0.9167|  151.55|       S|
|     0.0|   1.0|female|   2.0|  151.55|       S|
|     0.0|   1.0|  male|  30.0|  151.55|       S|
|     0.0|   1.0|female|  25.0|  151.55|       S|
|     1.0|   1.0|  male|  48.0|   26.55|       S|
|     1.0|   1.0|female|  63.0| 77.9583|       S|
|     0.0|   1.0|  male|  39.0|     0.0|       S|
|     1.0|   1.0|female|  53.0| 51.4792|       S|
|     0.0|   1.0|  male|  71.0| 49.5042|       C|
|     0.0|   1.0|  male|  47.0| 227.525|       C|
|     1.0|   1.0|female|  18.0| 227.525|       C|
|     1.0|   1.0|female|  24.0|    69.3|       C|
|     1.0|   1.0|female|  26.0|   78.85|       S|
|     1.0|   1.0|  male|  80.0|    30.0|       S|
|     0.0|   1.0|  male|  24.0|247.5208|       C|
|     1.0|   1.0|female|  50.0|247.5208|       C|


In [12]:
dataset = dataset.dropna(how='any')

In [13]:
from pyspark.ml.feature import StringIndexer
dataset = StringIndexer(
    inputCol='sex',
    outputCol='gender',
    handleInvalid='keep').fit(dataset).transform(dataset)
dataset = StringIndexer(
    inputCol='embarked',
    outputCol='boarded',
    handleInvalid='keep').fit(dataset).transform(dataset)
dataset.show()

+--------+------+------+------+--------+--------+------+-------+
|survived|pclass|   sex|   age|    fare|embarked|gender|boarded|
+--------+------+------+------+--------+--------+------+-------+
|     1.0|   1.0|female|  29.0|211.3375|       S|   1.0|    0.0|
|     1.0|   1.0|  male|0.9167|  151.55|       S|   0.0|    0.0|
|     0.0|   1.0|female|   2.0|  151.55|       S|   1.0|    0.0|
|     0.0|   1.0|  male|  30.0|  151.55|       S|   0.0|    0.0|
|     0.0|   1.0|female|  25.0|  151.55|       S|   1.0|    0.0|
|     1.0|   1.0|  male|  48.0|   26.55|       S|   0.0|    0.0|
|     1.0|   1.0|female|  63.0| 77.9583|       S|   1.0|    0.0|
|     0.0|   1.0|  male|  39.0|     0.0|       S|   0.0|    0.0|
|     1.0|   1.0|female|  53.0| 51.4792|       S|   1.0|    0.0|
|     0.0|   1.0|  male|  71.0| 49.5042|       C|   0.0|    1.0|
|     0.0|   1.0|  male|  47.0| 227.525|       C|   0.0|    1.0|
|     1.0|   1.0|female|  18.0| 227.525|       C|   1.0|    1.0|
|     1.0|   1.0|female| 

In [14]:
dataset = dataset.drop('sex').drop('embarked')
dataset.show()

+--------+------+------+--------+------+-------+
|survived|pclass|   age|    fare|gender|boarded|
+--------+------+------+--------+------+-------+
|     1.0|   1.0|  29.0|211.3375|   1.0|    0.0|
|     1.0|   1.0|0.9167|  151.55|   0.0|    0.0|
|     0.0|   1.0|   2.0|  151.55|   1.0|    0.0|
|     0.0|   1.0|  30.0|  151.55|   0.0|    0.0|
|     0.0|   1.0|  25.0|  151.55|   1.0|    0.0|
|     1.0|   1.0|  48.0|   26.55|   0.0|    0.0|
|     1.0|   1.0|  63.0| 77.9583|   1.0|    0.0|
|     0.0|   1.0|  39.0|     0.0|   0.0|    0.0|
|     1.0|   1.0|  53.0| 51.4792|   1.0|    0.0|
|     0.0|   1.0|  71.0| 49.5042|   0.0|    1.0|
|     0.0|   1.0|  47.0| 227.525|   0.0|    1.0|
|     1.0|   1.0|  18.0| 227.525|   1.0|    1.0|
|     1.0|   1.0|  24.0|    69.3|   1.0|    1.0|
|     1.0|   1.0|  26.0|   78.85|   1.0|    0.0|
|     1.0|   1.0|  80.0|    30.0|   0.0|    0.0|
|     0.0|   1.0|  24.0|247.5208|   0.0|    1.0|
|     1.0|   1.0|  50.0|247.5208|   1.0|    1.0|
|     1.0|   1.0|  3

In [39]:
short = dataset.where(dataset['age']==39.0).limit(2)

In [40]:
required_features = [
    'pclass',
    'age',
    'fare',
    'gender',
    'boarded'
]
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

In [41]:
short_transformed_data = assembler.transform(short)

In [30]:
transformed_data.where(transformed_data['age']==39.0).limit(2).select('features').collect()

[Row(features=SparseVector(5, {0: 1.0, 1: 39.0})),
 Row(features=DenseVector([1.0, 39.0, 83.1583, 1.0, 1.0]))]

In [17]:
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [47]:
training_data.limit(5).toPandas()

Unnamed: 0,survived,pclass,age,fare,gender,boarded,features
0,0.0,1.0,2.0,151.550003,1.0,0.0,"[1.0, 2.0, 151.5500030517578, 1.0, 0.0]"
1,0.0,1.0,17.0,47.099998,0.0,0.0,"[1.0, 17.0, 47.099998474121094, 0.0, 0.0]"
2,0.0,1.0,18.0,108.900002,0.0,1.0,"[1.0, 18.0, 108.9000015258789, 0.0, 1.0]"
3,0.0,1.0,19.0,53.099998,0.0,0.0,"[1.0, 19.0, 53.099998474121094, 0.0, 0.0]"
4,0.0,1.0,19.0,263.0,0.0,0.0,"[1.0, 19.0, 263.0, 0.0, 0.0]"


In [48]:
test_data.limit(5).toPandas()

Unnamed: 0,survived,pclass,age,fare,gender,boarded,features
0,0.0,1.0,27.0,136.779205,0.0,1.0,"[1.0, 27.0, 136.77920532226562, 0.0, 1.0]"
1,0.0,1.0,28.0,47.099998,0.0,0.0,"[1.0, 28.0, 47.099998474121094, 0.0, 0.0]"
2,0.0,1.0,29.0,66.599998,0.0,0.0,"[1.0, 29.0, 66.5999984741211, 0.0, 0.0]"
3,0.0,1.0,31.0,50.4958,0.0,0.0,"[1.0, 31.0, 50.49580001831055, 0.0, 0.0]"
4,0.0,1.0,39.0,0.0,0.0,0.0,"(1.0, 39.0, 0.0, 0.0, 0.0)"


In [42]:
short_transformed_data.show()

+--------+------+----+-------+------+-------+--------------------+
|survived|pclass| age|   fare|gender|boarded|            features|
+--------+------+----+-------+------+-------+--------------------+
|     0.0|   1.0|39.0|    0.0|   0.0|    0.0|(5,[0,1],[1.0,39.0])|
|     1.0|   1.0|39.0|83.1583|   1.0|    1.0|[1.0,39.0,83.1583...|
+--------+------+----+-------+------+-------+--------------------+



In [43]:
short_transformed_data.toPandas()

Unnamed: 0,survived,pclass,age,fare,gender,boarded,features
0,0.0,1.0,39.0,0.0,0.0,0.0,"(1.0, 39.0, 0.0, 0.0, 0.0)"
1,1.0,1.0,39.0,83.158302,1.0,1.0,"[1.0, 39.0, 83.1583023071289, 1.0, 1.0]"


In [18]:
test_data.show()

+--------+------+----+--------+------+-------+--------------------+
|survived|pclass| age|    fare|gender|boarded|            features|
+--------+------+----+--------+------+-------+--------------------+
|     0.0|   1.0|27.0|136.7792|   0.0|    1.0|[1.0,27.0,136.779...|
|     0.0|   1.0|28.0|    47.1|   0.0|    0.0|[1.0,28.0,47.0999...|
|     0.0|   1.0|29.0|    66.6|   0.0|    0.0|[1.0,29.0,66.5999...|
|     0.0|   1.0|31.0| 50.4958|   0.0|    0.0|[1.0,31.0,50.4958...|
|     0.0|   1.0|39.0|     0.0|   0.0|    0.0|(5,[0,1],[1.0,39.0])|
|     0.0|   1.0|40.0| 27.7208|   0.0|    1.0|[1.0,40.0,27.7208...|
|     0.0|   1.0|41.0|    30.5|   0.0|    0.0|[1.0,41.0,30.5,0....|
|     0.0|   1.0|42.0|   26.55|   0.0|    0.0|[1.0,42.0,26.5499...|
|     0.0|   1.0|50.0| 28.7125|   1.0|    1.0|[1.0,50.0,28.7124...|
|     0.0|   1.0|54.0| 51.8625|   0.0|    0.0|[1.0,54.0,51.8624...|
|     0.0|   1.0|56.0| 30.6958|   0.0|    1.0|[1.0,56.0,30.6958...|
|     0.0|   1.0|57.0|164.8667|   0.0|    0.0|[1

In [44]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol='survived', 
                            featuresCol='features',
                            maxDepth=5)

In [45]:
model = rf.fit(training_data)

In [49]:
predictions = model.transform(test_data)

In [62]:
predictions.columns

['survived',
 'pclass',
 'age',
 'fare',
 'gender',
 'boarded',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [61]:
pddf = predictions.sample(False, 0.1, seed=0).toPandas()


In [63]:
pddf['prob'] = pddf['probability'].apply(lambda x: x[1])

In [66]:
pddf[['prob', 'prediction', 'survived']]

Unnamed: 0,prob,prediction,survived
0,0.349235,0.0,0.0
1,0.204371,0.0,0.0
2,0.210871,0.0,0.0
3,0.142735,0.0,0.0
4,0.239361,0.0,0.0
5,0.145365,0.0,0.0
6,0.167293,0.0,0.0
7,0.139809,0.0,0.0
8,0.153344,0.0,0.0
9,0.449721,0.0,0.0


24/06/23 00:14:21 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 284569 ms exceeds timeout 120000 ms
24/06/23 00:14:21 WARN SparkContext: Killing executors is not supported by current scheduler.
24/06/23 00:14:26 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [51]:
# Evaluate our model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol='survived', 
    predictionCol='prediction', 
    metricName='accuracy')

In [52]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.7733990147783252
