In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.types import DoubleType,IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [2]:
# Obtain the Spark session used throughout this notebook, registered under
# the application name "KMeans".
spark = (
    SparkSession.builder
    .appName("KMeans")
    .getOrCreate()
)

In [3]:
# Load the diabetes encounters CSV: the first row supplies the column names
# and the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dataset_diabetes/diabetic_data.csv")
)
# Print the leading rows of the raw frame as a text table.
df.show()

+------------+-----------+---------------+------+--------+------+-----------------+------------------------+-------------------+----------------+----------+--------------------+------------------+--------------+---------------+-----------------+----------------+----------------+------+------+------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+-----------+----------+
|encounter_id|patient_nbr|           race|gender|     age|weight|admission_type_id|discharge_disposition_id|admission_source_id|time_in_hospital|payer_code|   medical_specialty|num_lab_procedures|num_procedures|num_medications|number_outpatient|number_emergency|number_inpatient|diag_1|diag_2|diag_3|number_diagn

In [4]:
# Narrow the raw frame down to the five columns that get clustered.
diabetes_df = df.select(
    [
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
        "time_in_hospital",
        "num_lab_procedures",
    ]
)
# Peek at the first row of the selection.
diabetes_df.first()

Row(admission_type_id=6, discharge_disposition_id=25, admission_source_id=1, time_in_hospital=1, num_lab_procedures=41)

In [5]:
# Pack every selected column into one vector column named "features".
assembler = (
    VectorAssembler()
    .setInputCols(diabetes_df.columns)
    .setOutputCol("features")
)
# `data` keeps the original columns and gains the assembled vector column.
data = assembler.transform(diabetes_df)

In [6]:
# Show the assembled DataFrame's repr (column names and types only).
data

DataFrame[admission_type_id: int, discharge_disposition_id: int, admission_source_id: int, time_in_hospital: int, num_lab_procedures: int, features: vector]

In [7]:
# Configure a two-cluster k-means estimator with a fixed seed.
# NOTE(review): the features are passed in unscaled, so the wide-ranging
# num_lab_procedures column presumably dominates the distance metric --
# confirm this is intended.
kmeans = KMeans(k=2, seed=1)

# Fit the estimator on the assembled data.
model = kmeans.fit(data)

In [8]:
# Label every row with the cluster the fitted model assigns to it.
predictions = model.transform(data)

# Fetch the centroids, keep a list copy in `ctr` for later cells, and print
# one centroid per line.
centers = model.clusterCenters()
ctr = list(centers)
for center in ctr:
    print(center)

[ 2.28821023  3.58880208  5.55565814  3.50350379 24.50454545]
[ 1.83652522  3.80564795  5.89549105  5.02929812 56.2879918 ]


In [9]:
# Collect the labelled rows into a local pandas DataFrame.
# NOTE: toPandas() materialises every row in the driver's memory.
pandasDF = predictions.toPandas()

# Tabulate the centroids with one column per clustered feature.
# NOTE: this rebinds `centers`, which until now held the list returned by
# model.clusterCenters().
centers = pd.DataFrame(ctr, columns=diabetes_df.columns)

In [10]:
# Show the collected pandas frame; pandas abbreviates the middle rows.
pandasDF

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,features,prediction
0,6,25,1,1,41,"[6.0, 25.0, 1.0, 1.0, 41.0]",1
1,1,1,7,3,59,"[1.0, 1.0, 7.0, 3.0, 59.0]",1
2,1,1,7,2,11,"[1.0, 1.0, 7.0, 2.0, 11.0]",0
3,1,1,7,2,44,"[1.0, 1.0, 7.0, 2.0, 44.0]",1
4,1,1,7,1,51,"[1.0, 1.0, 7.0, 1.0, 51.0]",1
...,...,...,...,...,...,...,...
101761,1,3,7,3,51,"[1.0, 3.0, 7.0, 3.0, 51.0]",1
101762,1,4,5,5,33,"[1.0, 4.0, 5.0, 5.0, 33.0]",0
101763,1,1,7,1,53,"[1.0, 1.0, 7.0, 1.0, 53.0]",1
101764,2,3,7,10,45,"[2.0, 3.0, 7.0, 10.0, 45.0]",1
