**Starting Spark Session**

In [0]:
import pandas as pd

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ds_a24eliak').getOrCreate()

**Extracting data from Trafikverket's API**

In [0]:
#Importing necessary libraries for the API
import requests
import json

#Setting up the API key
api_key = 'API_KEY'

#Accessing the data from Trafikverket's API
url = 'https://api.trafikinfo.trafikverket.se/v2/data.json'

> TrainAnnouncement

In [0]:
# Defining the data from Trafikverket's API and filtering on only departures, the dates for the advertised time at the location, and the location signatures for the bigger train stations within western Sweden

xml = """
<REQUEST>
  <LOGIN authenticationkey = "API_KEY" />
  <QUERY objecttype="TrainAnnouncement" schemaversion="1.8" limit="20000">
    <FILTER>

      <EQ name="ActivityType" value="Avgang" />  

    <OR>
        <EQ name="LocationSignature" value="G" />
        <EQ name="LocationSignature" value="Bsc" />
        <EQ name="LocationSignature" value="F" />
        <EQ name="LocationSignature" value="Hr" />
        <EQ name="LocationSignature" value="Thn" />
        <EQ name="LocationSignature" value="Mdn" />
        <EQ name="LocationSignature" value="Mdö" />
        <EQ name="LocationSignature" value="A" />
        <EQ name="LocationSignature" value="Sk" />
        <EQ name="LocationSignature" value="Vg" />
        <EQ name="LocationSignature" value="Uv" />
      </OR>  

    </FILTER>

    <INCLUDE>AdvertisedTimeAtLocation</INCLUDE>
    <INCLUDE>EstimatedTimeAtLocation</INCLUDE>
    <INCLUDE>Canceled</INCLUDE>
    <INCLUDE>DepartureDateOTN</INCLUDE>
    <INCLUDE>Deviation</INCLUDE>
    <INCLUDE>FromLocation</INCLUDE>
    <INCLUDE>LocationSignature</INCLUDE>
    <INCLUDE>ModifiedTime</INCLUDE>
    <INCLUDE>PlannedEstimatedTimeAtLocation</INCLUDE>
    <INCLUDE>ProductInformation</INCLUDE>
    <INCLUDE>ScheduledDepartureDateTime</INCLUDE>
    <INCLUDE>TimeAtLocation</INCLUDE>
    <INCLUDE>TrainOwner</INCLUDE>

  </QUERY>
</REQUEST>
"""

In [0]:
#Defining the types, url, receiving the response 
headers = {'Content-Type': 'application/xml'}
url = 'https://api.trafikinfo.trafikverket.se/v2/data.json'

response = requests.post(url, data = xml, headers = headers)

data = response.json()

#Flattening the structure of JSON that is retrieved from the API and converting it to a dataframe in Pandas in order to view it better
df_pandas_1 = pd.json_normalize(data['RESPONSE']['RESULT'][0]['TrainAnnouncement'])

#Therafter converting the dataframe that exist in Pandas into a dataframe in Spark for processing in memory
dfspark_1 = spark.createDataFrame(df_pandas_1)

#Finally, displaying the Spark dataframe for the TrainAnnouncement dataset
dfspark_1.display()

> ReasonCode

In [0]:
# Retrieving the second part of the dataset that is needed to get the deviations, which is found in ReasonCode
xml2 = """
<REQUEST>
  <LOGIN authenticationkey = "API_KEY" />
  <QUERY objecttype="ReasonCode" schemaversion="1" limit="780">
    
    <INCLUDE>Code</INCLUDE>
    <INCLUDE>GroupDescription</INCLUDE>
    <INCLUDE>Level1Description</INCLUDE>
    <INCLUDE>Level2Description</INCLUDE>
    <INCLUDE>Level3Description</INCLUDE>
  
  </QUERY>
</REQUEST>
"""


In [0]:
#Defining the types, url, receiving the response
headers = {'Content-Type': 'application/xml'}
url = "https://api.trafikinfo.trafikverket.se/v2/data.json"

response = requests.post(url, data = xml2, headers = headers)
data = response.json()

#Flattening the retrieved structure of JSON to a dataframe in Pandas for ReasonCode
df_pandas_2 = pd.json_normalize(data['RESPONSE']['RESULT'][0]['ReasonCode'])

#Convert the existed dataframe in Pandas to a Spark dataframe for processing in memory 
dfspark_2 = spark.createDataFrame(df_pandas_2)

#Finally, displaying the Spark dataframe for the ReasonCode dataset
dfspark_2.display()

**Transforming the data**

In [0]:
#Flattening the nested columns that are arrays of objects existing in TrainAnnouncement and ReasonCode as a JSON structure
#The following columns are: Deviation, FromLocation, ProductInformation, ToLocation

from pyspark.sql.functions import col

dfspark = dfspark_1.withColumn('Deviation_Code', col('Deviation')[0]['Code']) \
                    .withColumn('Deviation_Description', col('Deviation')[0] ['Description']) \
                    \
                    .withColumn('FromLocation_LocationName', col('FromLocation')[0]['LocationName']) \
                    .withColumn('FromLocation_Order', col('FromLocation')[0]['Order']) \
                    .withColumn('FromLocation_Priority', col('FromLocation')[0]['Priority']) \
                    .withColumn('ProductInformation_Code', col('ProductInformation')[0]['Code']) \
                    .withColumn('ProductInformation_Description', col('ProductInformation')[0]['Description']) \
    
display(dfspark)

In [0]:
#Splitting the data format into separate columns defined as year, month, day number, and time of the day
#Following columns are: AdvertisedTimeAtLocation, AdvertisedTime, TimeAtLocation, ActtualTime

from pyspark.sql.functions import to_timestamp, year, month, dayofmonth, date_format 

dfspark = dfspark.withColumn('AdvertisedTime', to_timestamp('AdvertisedTimeAtLocation')) \
                 .withColumn('Adv_Year', year(col('AdvertisedTime'))) \
                 .withColumn('Adv_Month', month(col('AdvertisedTime'))) \
                 .withColumn('Adv_Day', dayofmonth(col('AdvertisedTime'))) \
                 .withColumn('Adv_TimeHour', date_format(col('AdvertisedTime'), 'HH:mm'))

dfspark = dfspark.withColumn('ActualTime', to_timestamp('TimeAtLocation')) \
                 .withColumn('Act_Year', year(col('ActualTime'))) \
                 .withColumn('Act_Month', month(col('ActualTime'))) \
                 .withColumn('Act_Day', dayofmonth(col('ActualTime'))) \
                 .withColumn('Act_TimeHour', date_format(col('ActualTime'), 'HH:mm'))    

In [0]:
#Joining both the first dataset of TrainAnnouncement and the second dataset ReasonCode by "Code" which is located in both datasets
#DeviationCode in TrainAnnouncement is Code in ReasonCode
dfspark = dfspark.join(dfspark_2, dfspark['Deviation_Code'] == dfspark_2['Code'], 'left')

In [0]:
#Removing the original nested properties/columns which have now been flattened
dfspark = dfspark.drop('Deviation', 'FromLocation', 'ProductInformation', 'Code', 'ScheduledDepartureDateTime', 'DepartureDateOTN') 

In [0]:
#Counting and separating the punctual trains and delayed trains and creating new columns, punctuality less or equal to 5
from pyspark.sql.functions import when, unix_timestamp

dfspark = dfspark.withColumn(
    'DelayMinutes',
    (unix_timestamp(col('ActualTime')) - unix_timestamp(col('AdvertisedTime'))) / 60
                            )

dfspark = dfspark.withColumn(
    'Punctuality',
    when(col('DelayMinutes') <= 5, 'Punctual').otherwise('Delayed')
                            )


In [0]:
dfspark.display()

In [0]:
#Removing unnecessary, and redundant columns

dfspark = dfspark.drop('AdvertisedTimeAtLocation', 'EstimatedTimeAtLocation', \
                       'FromLocation_Order', 'FromLocation_Priority', \
                        'ProductInformation_Code', 'ScheduledDepartureDateTime', \
                        'ModifiedTime', 'DepartureDateOTN', 'Level2Description', 'TimeAtLocation', 'FromLocation_LocationName')

## Exploratory Data Analysis

In [0]:
#Checking the schema of the dataset and their datatypes
dfspark.printSchema()

root
 |-- Canceled: boolean (nullable = true)
 |-- LocationSignature: string (nullable = true)
 |-- TrainOwner: string (nullable = true)
 |-- Deviation_Code: string (nullable = true)
 |-- Deviation_Description: string (nullable = true)
 |-- ProductInformation_Description: string (nullable = true)
 |-- AdvertisedTime: timestamp (nullable = true)
 |-- Adv_Year: integer (nullable = true)
 |-- Adv_Month: integer (nullable = true)
 |-- Adv_Day: integer (nullable = true)
 |-- Adv_TimeHour: string (nullable = true)
 |-- ActualTime: timestamp (nullable = true)
 |-- Act_Year: integer (nullable = true)
 |-- Act_Month: integer (nullable = true)
 |-- Act_Day: integer (nullable = true)
 |-- Act_TimeHour: string (nullable = true)
 |-- GroupDescription: string (nullable = true)
 |-- Level1Description: string (nullable = true)
 |-- Level3Description: string (nullable = true)
 |-- DelayMinutes: double (nullable = true)
 |-- Punctuality: string (nullable = false)



In [0]:
dfspark.columns

In [0]:
#Renaming columns into a better format with lowercases
dfspark = dfspark.withColumnRenamed('Canceled','canceled') \
                 .withColumnRenamed('LocationSignature','location') \
                 .withColumnRenamed('TrainOwner','train_owner') \
                 .withColumnRenamed('Deviation_Code','deviation_code') \
                 .withColumnRenamed('Deviation_Description','deviation_description') \
                 .withColumnRenamed('ProductInformation_Description','train') \
                 .withColumnRenamed('AdvertisedTime','advertised_time') \
                 .withColumnRenamed('Adv_Year','adv_year') \
                 .withColumnRenamed('Adv_Month','adv_month') \
                 .withColumnRenamed('Adv_Day','adv_day') \
                 .withColumnRenamed('Adv_TimeHour','adv_time_hour') \
                 .withColumnRenamed('ActualTime','actual_time') \
                 .withColumnRenamed('Act_Year','act_year') \
                 .withColumnRenamed('Act_Month','act_month') \
                 .withColumnRenamed('Act_Day','act_day') \
                 .withColumnRenamed('Act_TimeHour','act_time_hour') \
                 .withColumnRenamed('GroupDescription','group_description') \
                 .withColumnRenamed('Level1Description','level_description') \
                 .withColumnRenamed('Level3Description','deviation_information') \
                 .withColumnRenamed('DelayMinutes','delay_min') \
                 .withColumnRenamed('Punctuality','punctuality') 

In [0]:
dfspark.display()

In [0]:
#Dropping unnecessary columns
dfspark = dfspark.drop('level_description', 'group_description')

In [0]:
#Customizing the order for better and finalized view
dfspark = dfspark.select('advertised_time', 'actual_time', 'train_owner', \
                         'train', 'location', 'punctuality', 'delay_min', \
                         'canceled', 'deviation_code', 'deviation_description', 
                         'deviation_information', \
                         'adv_year', 'act_year', 'adv_month', 'act_month', \
                         'adv_day', 'act_day', 'adv_time_hour', 'act_time_hour')

In [0]:
dfspark.columns

['advertised_time',
 'actual_time',
 'train_owner',
 'train',
 'location',
 'punctuality',
 'delay_min',
 'canceled',
 'deviation_code',
 'deviation_description',
 'deviation_information',
 'adv_year',
 'act_year',
 'adv_month',
 'act_month',
 'adv_day',
 'act_day',
 'adv_time_hour',
 'act_time_hour']

In [0]:
dfspark.dtypes

In [0]:
#Changing the location signatures/train stations into their actual names
dfspark = dfspark.withColumn('location',
                             when(col('location') == 'A', 'Alingsas')\
                            .when(col('location') == 'Bsc', 'Boras')\
                            .when(col('location') == 'F', 'Falkoping')\
                            .when(col('location') == 'G', 'Gothenburg')\
                            .when(col('location') == 'Mdn', 'Molndal')\
                            .when(col('location') == 'Sk', 'Skovde')\
                            .when(col('location') == 'Thn', 'Trollhattan')\
                            .when(col('location') == 'Uv', 'Uddevalla')\
                            .when(col('location') == 'Vg', 'Vanersborg'))

In [0]:
#Checking and counting the null values that exist within the dataset
from pyspark.sql.functions import sum

dfspark.select([sum(col(c).isNull()\
        .cast('int')).alias(c)\
        for c in dfspark.columns]).display()

In [0]:
#Imputing missing values that has null as below
#Fill the actual time values wih the the advertised time values
#Fill the train owner with "Other"
#Fill the train as "Other"
#Fill the delay min as 6 if its punctuality is delayed, and 5 if its punctuality is punctual
#Fill the deviation code, description, and information with "None" since there is no deviation
#Fill actual year with the advertised, month as when the data is retrieved, actual day as advertised day, and actual time hour as the advertised time hour

from pyspark.sql.functions import coalesce, lit, when

dfspark = (
    dfspark.withColumn('actual_time', coalesce(col('actual_time'),\
                        col('advertised_time')))\
            .withColumn('train_owner', coalesce(col('train_owner'),\
                                                lit('Other')))\
            .withColumn('train', coalesce(col('train'), lit('Other')))\
            .withColumn('delay_min', coalesce(\
                    when(col('delay_min').isNull() & \
                        (col('punctuality') == 'Delayed'), lit(6))\
                    .when(col('delay_min').isNull() & \
                        (col('punctuality') == 'Punctual'),\
                                    lit(5)), col('delay_min')))\
            .withColumn('deviation_code', coalesce(col('deviation_code'), lit('None')))\
            .withColumn('deviation_description', coalesce(col('deviation_description'), lit('None')))\
            .withColumn('deviation_information', coalesce(col('deviation_information'), lit('None')))\
            .withColumn('act_year', coalesce(col('act_year'),\
                    col('adv_year')))\
            .withColumn('act_month', coalesce(col('act_month'),\
                    col('adv_month')))\
            .withColumn('act_day', coalesce(col('act_day'),\
                    col('adv_day')))\
            .withColumn('act_time_hour', coalesce(col       ('act_time_hour'), col('adv_time_hour')))\
            .withColumn('location', coalesce(col('location'), lit('Other')))
        )
        

In [0]:
#Checking and counting the null values again in order to see if they have been replaced
from pyspark.sql.functions import sum

dfspark.select([sum(col(c).isNull()\
        .cast('int')).alias(c)\
        for c in dfspark.columns]).display()

In [0]:
dfspark.display()

In [0]:
#Checking the trains that were canceled or not in the train stations
dfspark.groupBy('location','canceled').count().display()

In [0]:
#Checking the different trains that are operating and driving in western Sweden and counting the amount they appear
dfspark.groupBy('train').count().display()

In [0]:
#Checking the trains that are actuaally on time (punctual) or delayed (late)
dfspark.groupBy('location', 'punctuality').count().display()

In [0]:
#Checking and comparing the trains to see who which train was most punctual
dfspark.groupBy('train', 'punctuality').count()\
        .where(col('punctuality') == 'Punctual').display()

In [0]:
#Checking the average delay for every station
dfspark.groupBy('location').avg('delay_min').display()

In [0]:
#Checking how many trains that were on time or at maximum, 5 minutes delayed which still counts as punctual 
dfspark.filter(col('delay_min') <= 5).count()

2353

In [0]:
#Checking how many trains that were delayed more than 5 minutes which is not punctual
dfspark.filter(col('delay_min') > 5).count()

16036

In [0]:
#Checking the average delay in minutes for the trains
from pyspark.sql.functions import avg

dfspark.select(avg('delay_min').alias('avg_delay_in_minutes')).show()

In [0]:
#Looking through the different deviations
dfspark.groupBy('deviation_description').count().show()

In [0]:
#Overview of the punctual trains
dfspark.filter(col('punctuality').isin('Punctual')).display()

In [0]:
#Overview of the delayed trains
dfspark.filter(col('punctuality').isin('Delayed')).display()

In [0]:
#Looks like it might be class imbalance within the data since there are more delays than punctual trains, controlling if that is the case
dfspark.groupBy('punctuality').count().display()

In [0]:
#Handling the imbalance by assigning some weights to the different classes with known values from before when exploring the data

dfspark = dfspark.withColumn('weighted_class',\
                         when(col('punctuality') == 'Delayed', lit(1))\
                         .when(col('punctuality') == 'Punctual', lit(16036 / 2353))
                            )

In [0]:
dfspark.groupBy('weighted_class').count().display()

**Pre-process the data before modeling**

In [0]:
#Dropping some columns that are not needed for feeding the data into the machine learning models
dfspark = dfspark.drop('actual_time', 'advertised_time', 'train',\
                        'adv_year', 'adv_month',\
                        'act_year', 'act_month', 'act_day', 'act_time_hour',\
                        'delay_min')

In [0]:
dfspark.columns

['train_owner',
 'location',
 'punctuality',
 'canceled',
 'deviation_code',
 'deviation_description',
 'deviation_information',
 'adv_day',
 'adv_time_hour',
 'weighted_class']

In [0]:
dfspark.printSchema()

>Environment changes, switching pre-processing technique to Scikit-learn

In [0]:
#Converting the Spark dataframe into Pandas dataframe since environment issues was faced
new_pandas = dfspark.toPandas()

In [0]:
#Dropping deviation description and information since it contains several values that are unique
new_pandas = new_pandas.drop(columns = ['deviation_description', 'deviation_information'])

In [0]:
new_pandas.head(10)

**Encoding categorical features**

In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

In [0]:
#Encoding the punctuality column into label_punctuality
enc_lab = LabelEncoder()
new_pandas['label_punctuality'] = enc_lab.fit_transform(new_pandas['punctuality'])

In [0]:
#Delayed as 0, and punctual as 1 within the current dataset

In [0]:
#Dropping the original punctuality column
new_pandas = new_pandas.drop(columns = ['punctuality'])

In [0]:
enc_onh = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')

In [0]:
#Converting the categorical columns that exist currently
col_categorical = ['train_owner', 'location', 'deviation_code']

enc_onh = OneHotEncoder(handle_unknown = 'ignore')

#Fitting the columns and thereafter transforming them
enc_array = enc_onh.fit_transform(new_pandas[col_categorical])

#Retrieve the new names for the encoded columns
col_names = enc_onh.get_feature_names_out(col_categorical)

#Converting it to a dataframe in Pandas
encd_df = pd.DataFrame(enc_array.toarray(), columns = col_names, index = new_pandas.index)

#Merging and dropping the old categoricals and adding the new encoded
new_pandas = pd.concat([new_pandas.drop(columns = col_categorical), encd_df], axis=1)


In [0]:
#Converting canceled trains into 0 and 1 where true is becoming 1 and false is becoming 0
new_pandas['canceled_number'] = new_pandas['canceled'].astype(int)

In [0]:
#Dropping the previous column that was a boolean
new_pandas = new_pandas.drop(columns = ['canceled'])

In [0]:
#Since the adv_time_hour is an object and not a numeric value, that one has to also be converted

new_pandas['adv_time_hour_numerical'] = new_pandas['adv_time_hour']\
    .str.split(':').apply(lambda x: int(x[0]) + int(x[1])/60)

In [0]:
#Dropping the old column that is not going to be used
new_pandas = new_pandas.drop(columns = ['adv_time_hour'])

In [0]:
#Converting back to Spark
dfspark_x = spark.createDataFrame(new_pandas)

In [0]:
dfspark_x.display()

adv_day,weighted_class,label_punctuality,train_owner_MTRX,train_owner_Other,train_owner_SJ,train_owner_TÅGAB,train_owner_VASTTRAF,train_owner_VY,train_owner_Ö-TÅG,location_Alingsas,location_Boras,location_Falkoping,location_Gothenburg,location_Molndal,location_Other,location_Skovde,location_Trollhattan,location_Uddevalla,location_Vanersborg,deviation_code_ANA003,deviation_code_ANA005,deviation_code_ANA006,deviation_code_ANA007,deviation_code_ANA011,deviation_code_ANA016,deviation_code_ANA018,deviation_code_ANA023,deviation_code_ANA027,deviation_code_ANA028,deviation_code_ANA030,deviation_code_ANA031,deviation_code_ANA034,deviation_code_ANA044,deviation_code_ANA050,deviation_code_ANA055,deviation_code_ANA063,deviation_code_ANA064,deviation_code_ANA065,deviation_code_ANA071,deviation_code_ANA073,deviation_code_ANA074,deviation_code_ANA083,deviation_code_ANA090,deviation_code_ANA274,deviation_code_None,canceled_number,adv_time_hour_numerical
7,1.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,22.05
7,6.815129621759456,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,22.716666666666665
7,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,22.966666666666665
8,6.815129621759456,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2.716666666666667
8,6.815129621759456,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,3.1
8,6.815129621759456,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,3.2
8,6.815129621759456,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.4833333333333334
8,6.815129621759456,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.716666666666667
8,6.815129621759456,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,3.9
8,6.815129621759456,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,3.95


>Continuing the pre-processing and modeling in a local Spark and Jupyter Notebook environment since issues was faced during the implementation in Databricks

In [0]:
#Stopping the Spark session for the project
spark.stop()