In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
# Load the flu dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("Flu_Classification.csv")
# Preview the first rows to check the column names and raw value formats.
df.head()

Unnamed: 0,Age,Temperature,Sex,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Fatigue,Cancer,Diagnosis
0,67.0,38.11,F,unknown,Yes,Yes,unknown,No,unknown,No,unknown,H1N1
1,29.0,0.0,M,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,H1N1
2,22.0,0.0,F,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,H1N1
3,20.0,36.56,F,unknown,Yes,Yes,unknown,No,unknown,Yes,unknown,H1N1
4,21.0,0.0,M,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,H1N1


In [3]:
# (rows, columns) of the loaded table.
df.shape

(1485, 12)

In [4]:
# Converting string categorical variables into numeric categorical variables

def convert(df, n, k):
    """Replace a positional range of columns with integer category codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to encode. It is modified in place.
    n, k : int
        Start (inclusive) and stop (exclusive) column positions, used as a
        plain slice over the frame's columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after encoding.
    """
    # Resolve the positional range to column labels once, up front.
    selected = df.columns[n:k]

    for label in selected:
        # Cast to the 'category' dtype, then keep only the integer codes.
        as_category = df[label].astype('category')
        df[label] = as_category.cat.codes

    return df

In [5]:
# Kmeans Clustering
def kmeans(df, n, random_state=None):
    """Cluster the rows of ``df`` with k-means and attach the results as columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. Every column except the three result columns named
        below is passed to ``KMeans``. At least two feature columns are
        needed, because the centroid coordinates reported are those of the
        first two feature columns.
    n : int
        Number of clusters.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default (``None``) keeps the original
        unseeded behaviour; pass an integer to get repeatable labels.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with three columns added
        (or overwritten): ``cluster`` (label per row), ``cen_x`` and
        ``cen_y`` (the row's centroid coordinate along the first and second
        feature column respectively).
    """
    result_cols = ['cluster', 'cen_x', 'cen_y']
    # These columns are always overwritten below, so they are outputs, never
    # inputs. Excluding them keeps a second call on an already-labelled frame
    # from clustering on its own previous labels.
    features = df.drop(columns=result_cols, errors='ignore')

    # Named `model` so the estimator does not shadow this function's name.
    model = KMeans(n_clusters=n, random_state=random_state)
    df['cluster'] = model.fit_predict(features)

    # Build the label -> coordinate lookups from however many centroids there
    # are, instead of assuming exactly two clusters.
    centroids = model.cluster_centers_
    cen_x = {label: center[0] for label, center in enumerate(centroids)}
    cen_y = {label: center[1] for label, center in enumerate(centroids)}

    df['cen_x'] = df['cluster'].map(cen_x)
    df['cen_y'] = df['cluster'].map(cen_y)

    return df

In [6]:
# Encode columns at positions 2..11 as integer category codes. convert()
# rewrites `df` in place, so later cells that display `df` see the codes.
df = convert(df, 2, 12)
# Cluster on the first 11 columns only; the last column is left out.
# .copy() gives kmeans() a standalone frame to add its result columns to;
# assigning new columns onto a bare .iloc slice can raise pandas'
# SettingWithCopyWarning.
df_cluster = df.iloc[:, 0:11].copy()
# NOTE: kmeans() is called without a seed here, so which group receives
# label 0 and which receives label 1 may differ from run to run.
df_cluster = kmeans(df_cluster, 2)

df_cluster.head()

Unnamed: 0,Age,Temperature,Sex,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Fatigue,Cancer,cluster,cen_x,cen_y
0,67.0,38.11,0,2,1,1,2,0,2,0,1,1,28.967043,38.06746
1,29.0,0.0,1,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
2,22.0,0.0,0,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
3,20.0,36.56,0,2,1,1,2,0,2,1,1,1,28.967043,38.06746
4,21.0,0.0,1,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14


In [7]:
# NOTE(review): everything in this cell is commented out. It reads as an
# unfinished attempt to plot the clusters after a 2-component PCA. As written
# it would not run if uncommented: `centroids` and `y_kmeans` are not defined
# at notebook level in the visible cells, `df['time']` does not match any
# column shown in the outputs above, `fig1.show` is missing its call
# parentheses, and `hex()` does not accept a list.
# pca = PCA(n_components=2)
# components = pca.fit_transform(df_cluster)

# fig1 = px.scatter(components, x=0, y=1, color=df['time'])
# fig1.add_scatter(x = centroids[:, 0], y =  centroids[:, 1])
# fig1.show

# plt.scatter(components[:,0], components[:,1])
# plt.scatter(centroids[:, 0], centroids[:, 1])
# plt.show()

# plt.scatter(df_cluster[y_kmeans==0, components[:,0]], df_cluster[y_kmeans==0, components[:,1]], s=100, c='red', label ='Cluster 1')
# plt.scatter(df_cluster[y_kmeans==1, components[:,0]], df_cluster[y_kmeans==1, components[:,1]], s=100, c='blue', label ='Cluster 2')
# plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label = 'Centroids')
# plt.show()

# colors = hex(['#DF2020', '#81DF20'])
# df_cluster['c'] = df_cluster.cluster.map({0:colors[0], 1:colors[1]})
# plt.scatter(components[:,0], centroids[:, 1], c=df_cluster.c, alpha = 0.6, s=10)


In [8]:
# Preview `df` again: convert() rewrote the columns from position 2 onward in
# place, so the former string columns now display as integer codes.
df.head()

Unnamed: 0,Age,Temperature,Sex,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Fatigue,Cancer,Diagnosis
0,67.0,38.11,0,2,1,1,2,0,2,0,1,1
1,29.0,0.0,1,2,2,2,2,2,2,2,1,1
2,22.0,0.0,0,2,2,2,2,2,2,2,1,1
3,20.0,36.56,0,2,1,1,2,0,2,1,1,1
4,21.0,0.0,1,2,2,2,2,2,2,2,1,1


In [10]:
# Show (as a table, not a plot) the rows assigned to cluster label 0,
# i.e. the first of the two clusters.
# NOTE(review): in the saved output every visible row of this group has
# Temperature 0.0, which suggests the split is driven by missing temperature
# readings rather than by symptoms -- confirm before interpreting.
df_cluster[df_cluster['cluster'] == 0]

Unnamed: 0,Age,Temperature,Sex,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Fatigue,Cancer,cluster,cen_x,cen_y
1,29.00,0.0,1,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
2,22.00,0.0,0,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
4,21.00,0.0,1,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
5,22.00,0.0,0,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
6,19.00,0.0,0,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,1.40,0.0,2,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
1481,0.83,0.0,2,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
1482,3.00,0.0,2,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14
1483,4.00,0.0,2,2,2,2,2,2,2,2,1,0,23.833692,3.552714e-14


In [11]:
# Show (as a table, not a plot) the rows assigned to cluster label 1,
# i.e. the second of the two clusters.
df_cluster[df_cluster['cluster'] == 1]

Unnamed: 0,Age,Temperature,Sex,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Fatigue,Cancer,cluster,cen_x,cen_y
0,67.0,38.11,0,2,1,1,2,0,2,0,1,1,28.967043,38.067456
3,20.0,36.56,0,2,1,1,2,0,2,1,1,1,28.967043,38.067456
7,60.0,38.17,1,2,0,1,2,0,2,0,1,1,28.967043,38.067456
8,46.0,38.61,0,2,1,1,2,0,2,0,1,1,28.967043,38.067456
10,53.0,36.22,1,2,0,0,2,0,2,0,1,1,28.967043,38.067456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,79.0,37.80,1,2,1,2,2,2,2,2,1,1,28.967043,38.067456
1430,50.0,39.00,1,2,1,1,2,2,2,2,1,1,28.967043,38.067456
1433,23.0,38.70,0,2,1,2,1,2,0,2,1,1,28.967043,38.067456
1436,61.0,39.00,1,2,1,1,1,2,2,2,1,1,28.967043,38.067456
