-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering_missing.py
47 lines (32 loc) · 1.53 KB
/
clustering_missing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
import numpy as np
def cluster_missing(X_train, n_clusters=15):
    """Impute missing values in place using per-cluster statistics.

    Rows are clustered with MiniBatchKMeans on the feature columns that
    contain no missing values (columns whose name ends in "cat" are one-hot
    encoded first). Every missing entry is then replaced by a statistic of
    the cluster its row belongs to: the most frequent observed value for
    columns ending in "cat", the median of the observed values otherwise.

    When a cluster has no observed value for a column, the column-wide
    statistic is used instead. A column with no observed values at all is
    left unchanged because there is nothing to derive a replacement from.

    Args:
        X_train: DataFrame to impute. The "id" and "target" columns, when
            present, are excluded from clustering and never modified.
            The frame is modified in place.
        n_clusters: Number of k-means clusters to fit. Defaults to 15.

    Returns:
        The same ``X_train`` object that was passed in.
    """
    # Pick the bookkeeping columns by name rather than inferring them from
    # the column count, so "target" can never end up as a clustering feature.
    non_features = [name for name in ("id", "target") if name in X_train.columns]
    X = X_train.drop(non_features, axis=1)

    na_count = X.isnull().sum()
    na_columns = list(na_count[na_count > 0].index)
    if not na_columns:
        # Nothing to impute: skip the clustering step entirely.
        return X_train

    # Cluster only on columns that are fully observed.
    X_no_missing = X.drop(na_columns, axis=1)
    cat_columns_no_missing = [
        name for name in X_no_missing.columns if str(name).endswith("cat")
    ]
    X_no_missing_dummies = pd.get_dummies(
        X_no_missing, columns=cat_columns_no_missing
    )

    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters, random_state=0, batch_size=2000
    )
    kmeans.fit(X_no_missing_dummies)
    # X is a fresh frame returned by drop(), so this does not touch X_train.
    X["cluster"] = kmeans.labels_

    for col in na_columns:
        known = X[["cluster", col]].dropna()
        if known.empty:
            # Column is missing everywhere; no statistic can be computed.
            continue

        if str(col).endswith("cat"):
            per_cluster = known.groupby("cluster")[col].agg(_most_frequent)
            fallback = _most_frequent(known[col])
        else:
            per_cluster = known.groupby("cluster")[col].median()
            fallback = known[col].median()

        # Clusters absent from per_cluster (no observed value for this
        # column) map to NaN and receive the column-wide fallback.
        fill = X["cluster"].map(per_cluster).fillna(fallback)
        missing = X[col].isnull()
        # Assign positionally so the result does not depend on index labels.
        X_train.loc[missing, col] = fill[missing].to_numpy()

    return X_train


def _most_frequent(values):
    """Return the most frequent entry of a non-empty Series."""
    return values.value_counts().index[0]