In [1]:
# NOTE(review): pandas is not referenced in the cells visible here -- confirm
# whether a later cell needs it.
import pandas as pd
import findspark
# findspark.init() is called before the pyspark imports below; presumably it is
# what makes the pyspark package importable in this environment -- TODO confirm.
findspark.init()
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

In [2]:
# Create the Spark session used by every cell below (or reuse the one that is
# already running), registered under the application name "Project-4".
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Project-4")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/07 19:15:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the stroke dataset from a local CSV file into a Spark DataFrame.
# The header row supplies the column names and the column types are inferred.
# Because the bmi column contains literal "N/A" strings, it is inferred as a
# string column rather than a numeric one.
csv_reader = (
    spark.read
    .option("inferSchema", "True")
    .option("delimiter", ",")
    .option("header", True)
)
df = csv_reader.csv("healthcare-dataset-stroke-data.csv")

# Print the first rows without truncating long cell values.
df.show(truncate=False)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|id   |gender|age |hypertension|heart_disease|ever_married|work_type    |Residence_type|avg_glucose_level|bmi |smoking_status |stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|9046 |Male  |67.0|0           |1            |Yes         |Private      |Urban         |228.69           |36.6|formerly smoked|1     |
|51676|Female|61.0|0           |0            |Yes         |Self-employed|Rural         |202.21           |N/A |never smoked   |1     |
|31112|Male  |80.0|0           |1            |Yes         |Private      |Rural         |105.92           |32.5|never smoked   |1     |
|60182|Female|49.0|0           |0            |Yes         |Private      |Urban         |171.23           |34.4|smokes         |1     |
|1665 |Female|79.0|1           |0            |Yes      

In [4]:
# Number of rows loaded from the CSV.
df.count()

5110

In [5]:
# Column names, taken from the CSV header row.
df.columns

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [6]:
# Print the inferred column types; note in the output that bmi is a string.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [7]:
# Convert bmi from string to double. Values that are not numeric text (the
# "N/A" entries) come out of the cast as null.
bmi_as_double = col("bmi").cast(DoubleType())
df = df.withColumn("bmi", bmi_as_double)

# Replace the null bmi values with 28.89.
# NOTE(review): 28.89 is presumably the mean bmi of the dataset -- confirm.
df2 = df.fillna(value=28.89, subset=["bmi"])
df2.show()

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+-----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level|  bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+-----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69| 36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21|28.89|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92| 32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23| 34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|  

In [8]:
# Column names of the frame produced by the bmi cast and null fill.
df2.columns

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [9]:
# Keep only the rows whose gender is not "Other".
# The predicate is written with col("gender") so that it is resolved against
# df2 itself, instead of borrowing a column reference from the separate `df`
# frame, which only works while both frames share the same lineage.
df2 = df2.filter(col("gender") != "Other")

In [10]:
# Remove the "id" column, then preview the remaining columns.
columns_to_drop = ["id"]
df2 = df2.drop(*columns_to_drop)
df2.show()

+------+----+------------+-------------+------------+-------------+--------------+-----------------+-----+---------------+------+
|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level|  bmi| smoking_status|stroke|
+------+----+------------+-------------+------------+-------------+--------------+-----------------+-----+---------------+------+
|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69| 36.6|formerly smoked|     1|
|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21|28.89|   never smoked|     1|
|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92| 32.5|   never smoked|     1|
|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23| 34.4|         smokes|     1|
|Female|79.0|           1|            0|         Yes|Self-employed|         Rural|        

In [11]:
# Relabel every "Never_worked" occurrence in work_type as "children".
relabelled_work_type = regexp_replace(col("work_type"), "Never_worked", "children")
df2 = df2.withColumn("work_type", relabelled_work_type)

In [12]:
# List the distinct work_type labels present in df2.
work_type_values = df2.select("work_type").distinct()
work_type_values.collect()

[Row(work_type='Self-employed'),
 Row(work_type='Private'),
 Row(work_type='children'),
 Row(work_type='Govt_job')]

In [13]:
# Preview the cleaned frame.
df2.show()

+------+----+------------+-------------+------------+-------------+--------------+-----------------+-----+---------------+------+
|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level|  bmi| smoking_status|stroke|
+------+----+------------+-------------+------------+-------------+--------------+-----------------+-----+---------------+------+
|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69| 36.6|formerly smoked|     1|
|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21|28.89|   never smoked|     1|
|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92| 32.5|   never smoked|     1|
|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23| 34.4|         smokes|     1|
|Female|79.0|           1|            0|         Yes|Self-employed|         Rural|        

In [14]:
# Export the cleaned data as comma-delimited CSV with a header row.
csv_file_path = "post_clean.csv"

# Spark's default save mode raises an error when the target path already
# exists, so this cell would fail the second time the notebook is run.
# Overwriting keeps the notebook re-runnable from a fresh kernel.
(
    df2.write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", ",")
    .csv(csv_file_path)
)

In [15]:
#df_without_na = df.filter(df.bmi != "N/A")
#df_without_na.show()
#df_without_na.count()

In [16]:
#drop_id.write.csv("", header=True)

23/06/07 19:15:41 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
