In [None]:
# set Java environment
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"



In [None]:
# Stop a Spark session left over from a previous run of this notebook, if any.
try:
    spark.stop()
except NameError:
    # 'spark' has not been defined yet (fresh kernel): nothing to stop.
    # Only NameError is caught so that real failures are not silently hidden.
    pass

In [None]:
# Import statements
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, min, max, count, round, lit, when
from pyspark.sql.types import IntegerType




In [None]:
# Build (or reuse) the notebook's Spark session, requesting 8g of driver memory.
spark = (
    SparkSession.builder
    .appName("MasterDataSet")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Upload data: files.upload() is Colab's interactive upload helper.
# The recorded output of this cell shows housingDataComplete.csv being saved.
from google.colab import files
uploaded = files.upload()

Saving housingDataComplete.csv to housingDataComplete.csv


In [None]:
# Second upload; the recorded output shows zip_rail_proximity_features.csv being saved.
# Rebinding `uploaded` replaces the return value of the previous upload call.
uploaded = files.upload()

Saving zip_rail_proximity_features.csv to zip_rail_proximity_features.csv


In [None]:
# Third upload; the recorded output shows USA_ZIP_Codes.csv being saved.
uploaded = files.upload()

Saving USA_ZIP_Codes.csv to USA_ZIP_Codes.csv


In [None]:
# Both inputs have a header row; column types are inferred by Spark.
csv_read_options = {"header": True, "inferSchema": True}

# NOTE(review): no upload cell visible in this notebook saves
# Combined_CBPData_2000-23.csv -- confirm it is present before running.
CBP_df = spark.read.csv("Combined_CBPData_2000-23.csv", **csv_read_options)
ziprail_df = spark.read.csv("zip_rail_proximity_features.csv", **csv_read_options)

In [None]:
# Print the inferred schemas of both input frames to check column names and types.
CBP_df.printSchema()
ziprail_df.printSchema()

root
 |-- ZIP: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

root
 |-- ZIP: integer (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- StationName: string (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)



In [None]:
# Keep only the join key and the two rail-proximity features.
rail_feature_columns = ["ZIP", "Rail_Distance_Miles", "Adjacency_Indicator"]
ziprail_df = ziprail_df.select(*rail_feature_columns)

In [None]:
# Confirm that only the three selected columns remain.
ziprail_df.printSchema()

root
 |-- ZIP: integer (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)



In [None]:
# Inner join on ZIP: only CBP rows whose ZIP also appears in the rail table are kept.
combined_df = CBP_df.join(ziprail_df, "ZIP", "inner")

In [None]:
# Inspect the schema of the CBP/rail join result.
combined_df.printSchema()

root
 |-- ZIP: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)



In [None]:
# Preview the first five joined rows.
combined_df.show(5)

+-----+------+------------------+----+--------+---------+--------------------+-------------------+
|  ZIP| NAICS|EstablishmentCount|Year|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|
+-----+------+------------------+----+--------+---------+--------------------+-------------------+
|85001|------|                49|2000| 33.4484| -112.074|0.023546974696654167|                  1|
|85001|23----|                 3|2000| 33.4484| -112.074|0.023546974696654167|                  1|
|85001|233210|                 1|2000| 33.4484| -112.074|0.023546974696654167|                  1|
|85001|233320|                 1|2000| 33.4484| -112.074|0.023546974696654167|                  1|
|85001|235610|                 1|2000| 33.4484| -112.074|0.023546974696654167|                  1|
+-----+------+------------------+----+--------+---------+--------------------+-------------------+
only showing top 5 rows



In [None]:
# 0/1 flags derived from the Year column.
year_col = col("Year")

post_impl_flag = when(year_col >= 2018, 1).otherwise(0)            # 2018 and later
covid_flag = when(year_col.isin([2020, 2021]), 1).otherwise(0)     # 2020-2021
crisis_2008_flag = when(year_col.isin([2008, 2009]), 1).otherwise(0)  # 2008-2009

az_panel = (
    combined_df
    .withColumn("Post_Impl_Indicator", post_impl_flag)
    .withColumn("COVID_Indicator", covid_flag)
    .withColumn("2008_Indicator", crisis_2008_flag)
)

In [None]:
# Check that the three indicator columns were added.
az_panel.printSchema()

root
 |-- ZIP: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = false)
 |-- COVID_Indicator: integer (nullable = false)
 |-- 2008_Indicator: integer (nullable = false)



In [None]:
# Preview the first five rows including the indicator columns.
az_panel.show(5)

+-----+------+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+
|  ZIP| NAICS|EstablishmentCount|Year|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|Post_Impl_Indicator|COVID_Indicator|2008_Indicator|
+-----+------+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+
|85001|------|                49|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|
|85001|23----|                 3|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|
|85001|233210|                 1|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|
|85001|233320|                 1|2000| 33.4484| -112.074|0.023546974696654167|                  1|  

In [None]:
from pyspark.sql.functions import col, length, substring

# Keep two kinds of rows:
#   * the all-dash code "------"
#   * 6-character codes whose characters 3-6 are dashes (e.g. "23----")
naics_code = col("NAICS")
is_all_dash_code = naics_code == "------"
is_two_digit_code = (length(naics_code) == 6) & (substring(naics_code, 3, 4) == "----")

condensed_panel_df = az_panel.where(is_all_dash_code | is_two_digit_code)


In [None]:
# Preview the filtered rows (show() with no argument prints up to 20 rows).
condensed_panel_df.show()

+-----+------+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+
|  ZIP| NAICS|EstablishmentCount|Year|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|Post_Impl_Indicator|COVID_Indicator|2008_Indicator|
+-----+------+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+
|85001|------|                49|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|
|85001|23----|                 3|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|
|85001|31----|                 2|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|
|85001|42----|                 4|2000| 33.4484| -112.074|0.023546974696654167|                  1|  

In [None]:
# NAICS_Clean: "Total" for the all-dash code, otherwise the first two characters of NAICS.
naics_code = col("NAICS")
naics_clean_expr = when(naics_code == "------", lit("Total")).otherwise(
    substring(naics_code, 1, 2)
)
cleaned_panel_df = condensed_panel_df.withColumn("NAICS_Clean", naics_clean_expr)

In [None]:
# Replace the raw NAICS column with the cleaned one (the renamed column ends up last).
final_cleaned_cbp_df = (
    cleaned_panel_df
    .drop("NAICS")
    .withColumnRenamed("NAICS_Clean", "NAICS")
)

In [None]:
# Preview the frame after NAICS was replaced by its cleaned form.
final_cleaned_cbp_df.show()

+-----+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+-----+
|  ZIP|EstablishmentCount|Year|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|Post_Impl_Indicator|COVID_Indicator|2008_Indicator|NAICS|
+-----+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+-----+
|85001|                49|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|Total|
|85001|                 3|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|   23|
|85001|                 2|2000| 33.4484| -112.074|0.023546974696654167|                  1|                  0|              0|             0|   31|
|85001|                 4|2000| 33.4484| -112.074|0.023546974696654167|                  1|               

In [None]:
from pyspark.sql.functions import col, length, substring, concat, lit, when

# Map each cleaned NAICS value to a sector label.
# Rules are (condition, label) pairs applied in order; the conditions do not overlap.
naics_col = col("NAICS")
sector_rules = [
    # The special 'Total' record
    (naics_col == "Total", "Total_Establishments"),
    # 2-digit NAICS codes
    (naics_col == "11", "Ag_Forestry_Fishing"),
    (naics_col == "21", "Mining"),
    (naics_col == "22", "Utilities"),
    (naics_col == "23", "Construction"),
    # Manufacturing range
    (naics_col.isin(["31", "32", "33"]), "Manufacturing"),
    (naics_col == "42", "Wholesale_Trade"),
    # Retail Trade range.
    # NOTE: the label is spelled "etail_Trade" here; a later cell rewrites it to
    # "Retail_Trade" with regexp_replace, so it is kept unchanged in this cell.
    (naics_col.isin(["44", "45"]), "etail_Trade"),
    # Transportation range
    (naics_col.isin(["48", "49"]), "Transport_Warehousing"),
    (naics_col == "51", "Information"),
    (naics_col == "52", "Finance_Insurance"),
    (naics_col == "53", "Real_Estate_Leasing"),
    (naics_col == "54", "Professional_Services"),
    (naics_col == "55", "Management_Companies"),
    (naics_col == "56", "Admin_Waste_Support"),
    (naics_col == "61", "Educational_Services"),
    (naics_col == "62", "Health_Social_Assistance"),
    (naics_col == "71", "Arts_Entertainment_Rec"),
    (naics_col == "72", "Accommodation_Food_Services"),
    (naics_col == "81", "Other_Services"),
    (naics_col == "92", "Public_Administration"),
]

# Fold the rules into a single when(...).when(...)... expression.
first_condition, first_label = sector_rules[0]
sector_expr = when(first_condition, first_label)
for condition, label in sector_rules[1:]:
    sector_expr = sector_expr.when(condition, label)

# Safety net for any unexpected code: prefix it with "99_Other_".
sector_expr = sector_expr.otherwise(concat(lit("99_Other_"), naics_col))

final_classified_cbp_df = final_cleaned_cbp_df.withColumn("Establishment_Sector", sector_expr)

In [None]:
# Check the schema and preview five rows after adding Establishment_Sector.
final_classified_cbp_df.printSchema()
final_classified_cbp_df.show(5)

root
 |-- ZIP: integer (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = false)
 |-- COVID_Indicator: integer (nullable = false)
 |-- 2008_Indicator: integer (nullable = false)
 |-- NAICS: string (nullable = true)
 |-- Establishment_Sector: string (nullable = true)

+-----+------------------+----+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+-----+--------------------+
|  ZIP|EstablishmentCount|Year|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|Post_Impl_Indicator|COVID_Indicator|2008_Indicator|NAICS|Establishment_Sector|
+-----+------------------+----+--------+---------+--------------------+-------------------+----------------

In [None]:
# Load the housing data (header row present, column types inferred).
housing_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("housingDataComplete.csv")
)

In [None]:
# Inspect the housing schema and preview five rows.
housing_df.printSchema()
housing_df.show(5)

root
 |-- ZIP: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Annual Change (%): double (nullable = true)
 |-- HPI: double (nullable = true)
 |-- HPI with 1990 base: double (nullable = true)
 |-- HPI with 2000 base: double (nullable = true)
 |-- Median_Value: double (nullable = true)

+-----+----+-----------------+------+------------------+------------------+------------------+
|  ZIP|Year|Annual Change (%)|   HPI|HPI with 1990 base|HPI with 2000 base|      Median_Value|
+-----+----+-----------------+------+------------------+------------------+------------------+
|85003|2000|             12.6|198.34|            211.88|             100.0| 146280.1475486276|
|85003|2001|            11.57|221.28|            236.39|            111.57|163394.25037429325|
|85003|2002|             4.58|231.43|            247.22|            116.68|174426.95779830744|
|85003|2003|             7.22|248.15|            265.08|            125.11| 189942.6267839241|
|85003|2004|            13.5

In [None]:
# Keep the join keys plus the two housing measures used downstream.
housing_columns = ["ZIP", "Year", "HPI", "Median_Value"]
housing_df = housing_df.select(*housing_columns)

In [None]:
# Full outer join on (ZIP, Year): rows present on only one side are kept,
# with nulls in the columns that come from the other side.
# Note that this rebinds `combined_df`, which earlier held the CBP/rail join.
combined_df = housing_df.join(final_classified_cbp_df, ["ZIP", "Year"], "full_outer")

In [None]:
# Inspect the schema and preview five rows of the housing/CBP join.
combined_df.printSchema()
combined_df.show(5)

root
 |-- ZIP: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- Establishment_Sector: string (nullable = true)

+-----+----+----+------------+------------------+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+-----+--------------------+
|  ZIP|Year| HPI|Median_Value|EstablishmentCount|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|Post_Impl_Indicator|COVID_Indicator|2008_Indicator|NAICS|Establishment_Sec

In [None]:
# Store ZIP as a string and confirm the new type in the schema.
zip_as_string = col("ZIP").astype("string")
combined_df = combined_df.withColumn("ZIP", zip_as_string)
combined_df.printSchema()

root
 |-- ZIP: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- Establishment_Sector: string (nullable = true)



In [None]:
# Load the ZIP-code reference table (header row present, column types inferred).
zip_area_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("USA_ZIP_Codes.csv")
)

In [None]:
# Inspect the full ZIP reference schema and preview five rows.
zip_area_df.printSchema()
zip_area_df.show(5)

root
 |-- FID: integer (nullable = true)
 |-- ObjectID: integer (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- PO_NAME: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- POP2010: integer (nullable = true)
 |-- POP10_SQMI: double (nullable = true)
 |-- POP2012: integer (nullable = true)
 |-- POP12_SQMI: double (nullable = true)
 |-- WHITE: integer (nullable = true)
 |-- BLACK: integer (nullable = true)
 |-- AMERI_ES: integer (nullable = true)
 |-- ASIAN: integer (nullable = true)
 |-- HAWN_PI: integer (nullable = true)
 |-- HISPANIC: integer (nullable = true)
 |-- OTHER: integer (nullable = true)
 |-- MULT_RACE: integer (nullable = true)
 |-- MALES: integer (nullable = true)
 |-- FEMALES: integer (nullable = true)
 |-- AGE_UNDER5: integer (nullable = true)
 |-- AGE_5_9: integer (nullable = true)
 |-- AGE_10_14: integer (nullable = true)
 |-- AGE_15_19: integer (nullable = true)
 |-- AGE_20_24: integer (nullable = true)
 |-- AGE_25_34: integer (nullable = tru

In [None]:
# Only the ZIP key and its SQMI value are needed from the reference table.
zip_area_columns = ["ZIP", "SQMI"]
zip_area_df = zip_area_df.select(*zip_area_columns)

In [None]:
# Confirm the two remaining columns and preview five rows.
zip_area_df.printSchema()
zip_area_df.show(5)

root
 |-- ZIP: integer (nullable = true)
 |-- SQMI: double (nullable = true)

+-----+-----+
|  ZIP| SQMI|
+-----+-----+
|99565| 12.5|
|73737|333.3|
|99648| 45.0|
|73739| 61.4|
|99661|461.0|
+-----+-----+
only showing top 5 rows



In [None]:
# Cast ZIP to string so it has the same type as the string ZIP key in combined_df.
zip_as_string = col("ZIP").astype("string")
zip_area_df = zip_area_df.withColumn("ZIP", zip_as_string)

# Copy of the table without rows whose SQMI is null.
# NOTE(review): the join cell that follows uses `zip_area_df`, not this cleaned
# frame -- confirm which one is intended.
zip_area_df_cleaned = zip_area_df.dropna(subset=["SQMI"])

zip_area_df.printSchema()

root
 |-- ZIP: string (nullable = true)
 |-- SQMI: double (nullable = true)



In [None]:
# Left join: every row of combined_df is kept; SQMI is null where the ZIP has no match.
master_with_area_df = combined_df.join(zip_area_df, "ZIP", "left")

In [None]:
# Check that SQMI was appended and preview five rows.
master_with_area_df.printSchema()
master_with_area_df.show(5)

root
 |-- ZIP: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- Establishment_Sector: string (nullable = true)
 |-- SQMI: double (nullable = true)

+-----+----+----+------------+------------------+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+-----+--------------------+----+
|  ZIP|Year| HPI|Median_Value|EstablishmentCount|latitude|longitude| Rail_Distance_Miles|Adjacency_Indicator|Post_Impl_Indicator|COVID_Indicato

In [None]:
# Business_Density = EstablishmentCount / SQMI, forced to 0.0 when SQMI is at most
# 0.001 so that a near-zero divisor is never used.
area_col = col("SQMI")
density_expr = when(area_col <= lit(0.001), lit(0.0)).otherwise(
    col("EstablishmentCount") / area_col
)
master_with_density_df = master_with_area_df.withColumn("Business_Density", density_expr)

In [None]:
from pyspark.sql.functions import log

# Natural log of (1 + density); the +1 keeps a density of 0 defined (log(1) = 0).
log_density_expr = log(lit(1) + col("Business_Density"))
final_master_df = master_with_density_df.withColumn("Log_Business_Density", log_density_expr)

In [None]:
# Check the two density columns in the schema and preview five rows.
final_master_df.printSchema()
final_master_df.show(5)

root
 |-- ZIP: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- Establishment_Sector: string (nullable = true)
 |-- SQMI: double (nullable = true)
 |-- Business_Density: double (nullable = true)
 |-- Log_Business_Density: double (nullable = true)

+-----+----+----+------------+------------------+--------+---------+--------------------+-------------------+-------------------+---------------+--------------+-----+--------------------+----+----------------+--------------------+
|  ZI

In [None]:
# Phase flags derived from Year.
year_col = col("Year")
pre_impl_flag = when(year_col <= 2007, 1).otherwise(0)
# between() is inclusive on both ends: 2008 <= Year <= 2018.
# NOTE(review): Post_Impl_Indicator (set in an earlier cell) is 1 for Year >= 2018,
# so 2018 is flagged by both indicators -- confirm the overlap is intended.
impl_phase_flag = when(year_col.between(2008, 2018), 1).otherwise(0)

final_master_df_cleaned = (
    final_master_df
    .withColumn("Pre_Impl_Indicator", pre_impl_flag)
    .withColumn("Impl_phase_Indicator", impl_phase_flag)
)


In [None]:
# Check the two new phase indicators in the schema and preview five rows.
final_master_df_cleaned.printSchema()
final_master_df_cleaned.show(5)

root
 |-- ZIP: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- Establishment_Sector: string (nullable = true)
 |-- SQMI: double (nullable = true)
 |-- Business_Density: double (nullable = true)
 |-- Log_Business_Density: double (nullable = true)
 |-- Pre_Impl_Indicator: integer (nullable = false)
 |-- Impl_phase_Indicator: integer (nullable = false)

+-----+----+----+------------+------------------+--------+---------+--------------------+-------------------+-------------------+-

# File write: change df

In [None]:
# Columns carried into the analysis panel, in output order.
panel_columns = [
    "ZIP",
    "Year",
    "HPI",
    "Median_Value",
    "EstablishmentCount",
    "Establishment_Sector",
    "Business_Density",
    "Log_Business_Density",
    "Pre_Impl_Indicator",
    "Impl_phase_Indicator",
    "Post_Impl_Indicator",
    "COVID_Indicator",
    "2008_Indicator",
    "Adjacency_Indicator",
    "Rail_Distance_Miles",
]
final_panel = final_master_df_cleaned.select(*panel_columns)


In [None]:
# Inspect the selected panel columns and preview five rows.
final_panel.printSchema()
final_panel.show(5)

root
 |-- ZIP: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Establishment_Sector: string (nullable = true)
 |-- Business_Density: double (nullable = true)
 |-- Log_Business_Density: double (nullable = true)
 |-- Pre_Impl_Indicator: integer (nullable = false)
 |-- Impl_phase_Indicator: integer (nullable = false)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)

+-----+----+----+------------+------------------+--------------------+----------------+--------------------+------------------+--------------------+-------------------+---------------+--------------+-------------------+--------------------+
|  ZIP|Year| HPI|Median_Value|Establishment

In [None]:
def _log_if_positive(column_name):
    """Return log(column) where the value is non-null and > 0, and null otherwise."""
    value = col(column_name)
    is_loggable = value.isNotNull() & (value > lit(0))
    return when(is_loggable, log(value)).otherwise(lit(None))


# Natural-log versions of the two housing measures.
final_panel = (
    final_panel
    .withColumn("Log_Median_Value", _log_if_positive("Median_Value"))
    .withColumn("Log_HPI", _log_if_positive("HPI"))
)

In [None]:

# Check the two log columns in the schema and preview five rows.
final_panel.printSchema()
final_panel.show(5)

root
 |-- ZIP: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Establishment_Sector: string (nullable = true)
 |-- Business_Density: double (nullable = true)
 |-- Log_Business_Density: double (nullable = true)
 |-- Pre_Impl_Indicator: integer (nullable = false)
 |-- Impl_phase_Indicator: integer (nullable = false)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- 2008_Indicator: integer (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Log_Median_Value: double (nullable = true)
 |-- Log_HPI: double (nullable = true)

+-----+----+----+------------+------------------+--------------------+----------------+--------------------+------------------+--------------------+-------------------+---------------+-------------

In [None]:
# Keep rows whose ZIP is at least 85003.
# NOTE(review): ZIP is a string column at this point, so this comparison with an
# integer relies on Spark's implicit cast; consider making the cast explicit.
zip_at_least_85003 = col("ZIP") >= 85003
final_panel_cleaned = final_panel.where(zip_at_least_85003)

In [None]:
# Final column set: the log-transformed measures replace HPI, Median_Value and
# Business_Density, which are not selected here.
model_columns = [
    "ZIP",
    "Year",
    "Log_HPI",
    "Log_Median_Value",
    "EstablishmentCount",
    "Establishment_Sector",
    "Log_Business_Density",
    "Pre_Impl_Indicator",
    "Impl_phase_Indicator",
    "Post_Impl_Indicator",
    "COVID_Indicator",
    "2008_Indicator",
    "Adjacency_Indicator",
    "Rail_Distance_Miles",
]
final_panel = final_panel_cleaned.select(*model_columns)

In [None]:

# Exclude the Total_Establishments rows.
# Rows whose Establishment_Sector is null are dropped as well, because a
# comparison with null does not evaluate to true.
is_not_total = col("Establishment_Sector") != 'Total_Establishments'
final_panel = final_panel.filter(is_not_total)

In [None]:
# Fix column naming: replace the name that starts with a digit ('2008_Indicator')
# with 'Crisis2008_Indicator'.
final_panel = final_panel.withColumnRenamed('2008_Indicator', 'Crisis2008_Indicator')

In [None]:
# Ensure data types are correct: cast each of these columns to integer in place.
from pyspark.sql.types import IntegerType

integer_columns = [
    'Year',
    'ZIP',
    'COVID_Indicator',
    'Crisis2008_Indicator',
    'Adjacency_Indicator',
    'Post_Impl_Indicator',
]
for column_name in integer_columns:
    final_panel = final_panel.withColumn(column_name, final_panel[column_name].cast(IntegerType()))

In [None]:
from pyspark.sql.functions import regexp_replace

# Repair the misspelled sector label "etail_Trade" produced by the classification cell.
# The pattern is anchored (^...$) so that only the exact misspelled value matches.
# An unanchored 'etail_Trade' also matches inside the corrected "Retail_Trade", so
# re-running this cell would turn it into "RRetail_Trade".
final_panel = final_panel.withColumn(
    'Establishment_Sector',
    regexp_replace('Establishment_Sector', '^etail_Trade$', 'Retail_Trade')
)

In [None]:
# Final check of the schema and a five-row preview before writing.
# NOTE(review): the stored output below still lists HPI, Median_Value and
# Business_Density, which the earlier select cell does not keep -- it appears to
# come from a run in a different cell order; re-run from a fresh kernel to refresh it.
final_panel.printSchema()
final_panel.show(5)

root
 |-- ZIP: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- HPI: double (nullable = true)
 |-- Median_Value: double (nullable = true)
 |-- EstablishmentCount: integer (nullable = true)
 |-- Establishment_Sector: string (nullable = true)
 |-- Business_Density: double (nullable = true)
 |-- Log_Business_Density: double (nullable = true)
 |-- Pre_Impl_Indicator: integer (nullable = false)
 |-- Impl_phase_Indicator: integer (nullable = false)
 |-- Post_Impl_Indicator: integer (nullable = true)
 |-- COVID_Indicator: integer (nullable = true)
 |-- Crisis2008_Indicator: integer (nullable = true)
 |-- Adjacency_Indicator: integer (nullable = true)
 |-- Rail_Distance_Miles: double (nullable = true)
 |-- Log_Median_Value: double (nullable = true)
 |-- Log_HPI: double (nullable = true)

+-----+----+----+------------+------------------+--------------------+----------------+--------------------+------------------+--------------------+-------------------+---------------+------

In [None]:
# Write the panel as CSV with a header row, replacing any previous output at this path.
# coalesce(1) reduces the frame to a single partition before writing.
# NOTE(review): the path reads "lighRail" -- presumably "lightRail" was meant; left
# unchanged because downstream consumers may rely on the current name.
output_path = "Final_lighRail_dataset.csv"
(
    final_panel
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

In [None]:
# Shut down the Spark session now that the output has been written.
spark.stop()