In [10]:
import pandas as pd
import numpy as np
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Obtain the Spark session for this notebook under the application name "US_Accidents".
spark = SparkSession.builder.appName("US_Accidents").getOrCreate()

24/09/13 17:44:55 WARN Utils: Your hostname, Eileanors-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.0.110 instead (on interface en0)
24/09/13 17:44:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/13 17:44:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Explicit schema for the accidents CSV (46 columns, all nullable).
#
# Measured quantities (coordinates, distance, weather readings) are typed as
# DoubleType. A bare DecimalType() defaults to precision=10, scale=0, i.e. it
# has no room for digits after the decimal point, so it cannot represent
# fractional values such as latitude/longitude faithfully.
schema = StructType([
    # Identification
    StructField('ID', StringType(), True),
    StructField('Source', StringType(), True),
    StructField('Severity', IntegerType(), True),
    # Time span of the record
    StructField('Start_Time', TimestampType(), True),
    StructField('End_Time', TimestampType(), True),
    # Coordinates and distance
    StructField('Start_Lat', DoubleType(), True),
    StructField('Start_Lng', DoubleType(), True),
    StructField('End_Lat', DoubleType(), True),
    StructField('End_Lng', DoubleType(), True),
    StructField('Distance(mi)', DoubleType(), True),
    # Free text and address fields
    StructField('Description', StringType(), True),
    StructField('Street', StringType(), True),
    StructField('City', StringType(), True),
    StructField('County', StringType(), True),
    StructField('State', StringType(), True),
    StructField('Zipcode', StringType(), True),  # kept as string: values may contain "-"
    StructField('Country', StringType(), True),
    StructField('Timezone', StringType(), True),
    StructField('Airport_Code', StringType(), True),
    # Weather readings
    StructField('Weather_Timestamp', TimestampType(), True),
    StructField('Temperature(F)', DoubleType(), True),
    StructField('Wind_Chill(F)', DoubleType(), True),
    StructField('Humidity(%)', DoubleType(), True),
    StructField('Pressure(in)', DoubleType(), True),
    StructField('Visibility(mi)', DoubleType(), True),
    StructField('Wind_Direction', StringType(), True),
    StructField('Wind_Speed(mph)', DoubleType(), True),
    StructField('Precipitation(in)', DoubleType(), True),
    StructField('Weather_Condition', StringType(), True),
    # Boolean flags
    StructField('Amenity', BooleanType(), True),
    StructField('Bump', BooleanType(), True),
    StructField('Crossing', BooleanType(), True),
    StructField('Give_way', BooleanType(), True),
    StructField('Junction', BooleanType(), True),
    StructField('No_Exit', BooleanType(), True),
    StructField('Railway', BooleanType(), True),
    StructField('Roundabout', BooleanType(), True),
    StructField('Station', BooleanType(), True),
    StructField('Stop', BooleanType(), True),
    StructField('Traffic_Calming', BooleanType(), True),
    StructField('Traffic_Signal', BooleanType(), True),
    StructField('Turning_Loop', BooleanType(), True),
    # Day/night indicators
    StructField('Sunrise_Sunset', StringType(), True),
    StructField('Civil_Twilight', StringType(), True),
    StructField('Nautical_Twilight', StringType(), True),
    StructField('Astronomical_Twilight', StringType(), True),
])

# Load the CSV through the DataFrameReader: the first line is a header row,
# and column types come from the explicit schema rather than being inferred.
file = "US_Accidents_March23.csv"
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(file)
)

In [4]:
# Shape of the raw DataFrame: row count (triggers a Spark job) and column count.
rows, cols = df.count(), len(df.columns)
print(f"DataFrame Rows count : {rows}")
print(f"DataFrame Columns count : {cols}")



DataFrame Rows count : 7728394
DataFrame Columns count : 46


                                                                                

In [5]:
# Cleaning
# TODO: if Zipcode should become an integer, consider dropping everything after the "-"
#       (only worthwhile if numerically close zip codes are also geographically close).
# TODO: confirm the NAs are missing at random before relying on a blanket drop.
df_nona = df.dropna()  # keep only rows that have no null in any column

# Shape of the NA-free DataFrame: row count and column count.
rows, cols = df_nona.count(), len(df_nona.columns)
print(f"DataFrame Rows count : {rows}")
print(f"DataFrame Columns count : {cols}")

24/09/13 17:45:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

DataFrame Rows count : 3554549
DataFrame Columns count : 46


                                                                                

In [14]:
# Compute summary statistics for the raw DataFrame and print them as a table.
summary_stats = df.summary()
summary_stats.show()

                                                                                

+-------+--------+-------+------------------+------------------+------------------+-----------------+-----------------+-----------------+--------------------+------------------+----------+---------+-------+------------------+-------+----------+------------+------------------+------------------+------------------+------------------+------------------+--------------+-----------------+-------------------+------------------+--------------+--------------+-----------------+---------------------+
|summary|      ID| Source|          Severity|         Start_Lat|         Start_Lng|          End_Lat|          End_Lng|     Distance(mi)|         Description|            Street|      City|   County|  State|           Zipcode|Country|  Timezone|Airport_Code|    Temperature(F)|     Wind_Chill(F)|       Humidity(%)|      Pressure(in)|    Visibility(mi)|Wind_Direction|  Wind_Speed(mph)|  Precipitation(in)| Weather_Condition|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|
+-------+-

In [36]:
# Null counts for every column of df (time columns included).
# count(lit(1)) counts all rows, whereas count(c) ignores NULLs in column c,
# so the difference is the number of NULLs in c. Deriving the total inside the
# same aggregation avoids a hard-coded row count that would silently go stale
# if the input file, schema, or any upstream filtering changes.
df.agg(*[
    (count(lit(1)) - count(c)).alias(c)
    for c in df.columns
]).show()



+---+------+--------+----------+--------+---------+---------+-------+-------+------------+-----------+------+----+------+-----+-------+-------+--------+------------+-----------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+
| ID|Source|Severity|Start_Time|End_Time|Start_Lat|Start_Lng|End_Lat|End_Lng|Distance(mi)|Description|Street|City|County|State|Zipcode|Country|Timezone|Airport_Code|Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity|Bump|Crossing|Give_way|Junction|No_Exit|Railway|Roundabout|Station|Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twil

                                                                                