# Data Processing Using PySpark

In [1]:
from pyspark.sql import SparkSession
Spark = SparkSession.builder.appName("Demonstration").getOrCreate()

In [2]:
Spark

In [3]:
df = Spark.read.csv("Dataset/stroke.csv",header=True,inferSchema=True)
print("Total Records: {}\nTotal Columns: {}".format(df.count(), len(df.columns)))

Total Records: 5110
Total Columns: 12


In [4]:
df.show(5)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [5]:
from pyspark.sql.functions import col,isnan, when, count

df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+------+
| id|gender|age|hypertension|heart_disease|ever_married|work_type|Residence_type|avg_glucose_level|bmi|smoking_status|stroke|
+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+------+
|  0|     0|  0|           0|            0|           0|        0|             0|                0|  0|             0|     0|
+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+------+



In [6]:
value = str(round(df.agg({'bmi':'mean'}).collect()[0][0],2))
df = df.na.replace('N/A', value,'bmi')

from pyspark.sql.types import IntegerType
df = df.withColumn("bmi", df["bmi"].cast(IntegerType()))

df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: integer (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [7]:
columns = [x[0] for x in df.dtypes if (x[1] == 'string')]

print("**********\n")

for x in columns:
    print(x)
    print([i[x] for i in df.select(x).distinct().collect()])
    print("\n**********\n")

**********

gender
['Female', 'Other', 'Male']

**********

ever_married
['No', 'Yes']

**********

work_type
['Never_worked', 'Self-employed', 'Private', 'children', 'Govt_job']

**********

Residence_type
['Urban', 'Rural']

**********

smoking_status
['smokes', 'Unknown', 'never smoked', 'formerly smoked']

**********



In [8]:
value = [df.groupby("gender").count().orderBy("count", ascending=False).first()[0]][0]
df = df.na.replace('Other', value,'gender')

value = [df.groupby("smoking_status").count().orderBy("count", ascending=False).first()[0]][0]
df = df.na.replace('Unknown', value,'smoking_status')

In [9]:
columns = ['summary'] + [x[0] for x in df.dtypes if (x[1] != 'string') and (len(df.select(x[0]).distinct().collect()) != 2)]
columns.remove('id')

data_summary = df.describe().select(columns)

data_summary.show()

+-------+------------------+------------------+-----------------+
|summary|               age| avg_glucose_level|              bmi|
+-------+------------------+------------------+-----------------+
|  count|              5110|              5110|             5110|
|   mean|43.226614481409015|106.14767710371804|28.43091976516634|
| stddev| 22.61264672311348| 45.28356015058193|7.688482598387539|
|    min|              0.08|             55.12|               10|
|    max|              82.0|            271.74|               97|
+-------+------------------+------------------+-----------------+



In [10]:
columns = [x[0] for x in df.dtypes if (x[1] == 'string')]

for i in columns:
    
    print(i)
    df.groupBy(i).count().orderBy('count').show()
    print()

gender
+------+-----+
|gender|count|
+------+-----+
|  Male| 2115|
|Female| 2995|
+------+-----+


ever_married
+------------+-----+
|ever_married|count|
+------------+-----+
|          No| 1757|
|         Yes| 3353|
+------------+-----+


work_type
+-------------+-----+
|    work_type|count|
+-------------+-----+
| Never_worked|   22|
|     Govt_job|  657|
|     children|  687|
|Self-employed|  819|
|      Private| 2925|
+-------------+-----+


Residence_type
+--------------+-----+
|Residence_type|count|
+--------------+-----+
|         Rural| 2514|
|         Urban| 2596|
+--------------+-----+


smoking_status
+---------------+-----+
| smoking_status|count|
+---------------+-----+
|         smokes|  789|
|formerly smoked|  885|
|   never smoked| 3436|
+---------------+-----+


