## K-means應用：信用卡客戶分群

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

df = pd.read_csv('customer.csv')
dict1 = {'男':1, '女':2}
df['性別'].replace(dict1, inplace=True)
df

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100)
0,1,34,78,1
1,1,20,61,49
2,1,29,28,82
3,2,31,81,93
4,1,48,60,49
...,...,...,...,...
195,2,33,86,95
196,1,35,28,61
197,2,68,48,48
198,1,38,71,75


In [2]:
km = KMeans(n_clusters=3)
km.fit(df)
km.labels_

array([2, 0, 0, 1, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 1,
       2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 1, 1, 0, 0, 2, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0, 1, 1,
       0, 1, 0, 2, 2, 1, 0, 0, 0, 0, 1, 2, 0, 0, 2, 0, 0, 2, 2, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 2, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 2, 2, 0, 0, 0, 0, 2, 0, 2, 1, 1,
       0, 0, 1, 0, 2, 0, 1, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 2, 2, 0, 1, 2, 0, 1, 0, 0,
       1, 1])

In [3]:
df['類別'] = km.labels_
df.head()

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,2
1,1,20,61,49,0
2,1,29,28,82,0
3,2,31,81,93,1
4,1,48,60,49,0


In [None]:
df2 = df[df['類別']==0]
df3 = df2.iloc[0:30, :]
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
3,2,31,81,93,0
9,2,32,103,69,0
10,2,29,98,88,0
21,1,32,126,74,0
36,1,28,77,97,0
37,1,32,73,73,0
42,1,39,78,88,0
45,1,30,137,83,0
51,1,28,87,75,0
58,1,40,71,95,0


In [5]:
df2 = df[df['類別']==1]
df3 = df2.iloc[0:30, :]
df3

NameError: name 'df' is not defined

In [None]:
df2 = df[df['類別']==2]
df3 = df2.iloc[0:30, :]
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
1,1,20,61,49,2
2,1,29,28,82,2
4,1,48,60,49,2
5,2,42,34,17,2
6,1,26,54,54,2
8,1,48,39,36,2
11,2,21,33,81,2
12,1,59,54,47,2
13,2,23,62,41,2
15,1,68,63,43,2


In [5]:
for n in range(2,15): 
  km = KMeans(n_clusters=n)
  km.fit(df)
  metric = calinski_harabasz_score(df, km.labels_) 
  print('群組數量：{}，評分：{}'.format(n, metric))

群組數量：2，評分：89.2106762819744
群組數量：3，評分：113.75242426357462
群組數量：4，評分：128.00296114672895
群組數量：5，評分：150.97200777880911
群組數量：6，評分：166.62762062886958
群組數量：7，評分：162.02591505102095
群組數量：8，評分：163.43776856785078
群組數量：9，評分：156.06100951882274
群組數量：10，評分：154.40361742356706
群組數量：11，評分：150.0028116968461
群組數量：12，評分：146.70282500939192
群組數量：13，評分：145.05700238137842
群組數量：14，評分：142.11662540494783


## DBSCAN應用：信用卡客戶分群

In [6]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('customer.csv')
dict1 = {'男':1, '女':2}
df['性別'].replace(dict1, inplace=True)
df

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100)
0,1,34,78,1
1,1,20,61,49
2,1,29,28,82
3,2,31,81,93
4,1,48,60,49
...,...,...,...,...
195,2,33,86,95
196,1,35,28,61
197,2,68,48,48
198,1,38,71,75


In [7]:
scaler = StandardScaler()
scaler.fit(df)
dfScaled = scaler.transform(df)
dbscan = DBSCAN(eps=0.82, min_samples=5)
dbs = dbscan.fit(dfScaled)
dbs.labels_

array([ 0,  1,  2,  3,  0,  3,  1,  0,  0,  3,  3,  3,  0,  3,  4,  0,  0,
        3,  0,  3, -1, -1,  4,  0,  4, -1,  0,  3,  1, -1,  2,  3,  0,  0,
        0,  1,  1,  1,  3,  3,  3,  0,  1,  3,  1, -1,  3,  3,  3,  0,  3,
        1,  3,  3, -1,  3,  0, -1,  1,  3,  0, -1,  3,  3,  3,  3,  3,  3,
       -1, -1,  4,  1,  1,  3,  3,  2,  3,  4,  1,  3,  3,  0,  3,  0,  0,
        1,  3,  3,  0,  3,  3,  3,  3,  1,  3,  3,  3,  3,  4,  3,  0,  2,
        3, -1,  3,  0,  3,  2,  1,  0,  3,  1,  3,  1,  0,  0,  3,  1,  3,
        3, -1,  3,  3,  0,  3,  3,  3,  0,  3,  2,  2,  0,  3,  3,  0,  3,
        0, -1,  1,  4,  3,  3,  1,  3,  4,  2,  3, -1,  3,  4,  0,  4,  3,
        3,  3,  3,  3,  3,  0,  3,  3,  3,  0,  3, -1,  0,  3,  1,  0,  3,
        3,  3,  3,  3,  0,  3,  3,  1,  0,  3,  0, -1, -1,  4,  3,  3, -1,
        1,  3,  0,  4,  3,  3,  4,  3,  3,  2,  3,  1,  1], dtype=int64)

In [8]:
df['類別'] = dbs.labels_
df.head()

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,0
1,1,20,61,49,1
2,1,29,28,82,2
3,2,31,81,93,3
4,1,48,60,49,0


In [9]:
df2 = df[df['類別']==0]
df3 = df2.iloc[0:30, :]
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,0
4,1,48,60,49,0
7,1,43,78,17,0
8,1,48,39,36,0
12,1,59,54,47,0
15,1,68,63,43,0
16,1,57,54,51,0
18,1,42,86,20,0
23,1,49,62,56,0
26,1,53,46,46,0


In [10]:
df2 = df[df['類別']==1]
df3 = df2.iloc[0:30, :]
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
1,1,20,61,49,1
6,1,26,54,54,1
28,1,26,62,55,1
35,1,18,59,41,1
36,1,28,77,97,1
37,1,32,73,73,1
42,1,39,78,88,1
44,1,24,60,52,1
51,1,28,87,75,1
58,1,40,71,95,1


In [11]:
df2 = df[df['類別']==-1]
df3 = df2.iloc[0:30, :]
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
20,2,45,126,28,-1
21,1,32,126,74,-1
25,2,20,16,6,-1
29,1,19,15,39,-1
45,1,30,137,83,-1
54,1,64,19,3,-1
57,1,59,71,11,-1
61,2,35,19,99,-1
68,1,67,19,14,-1
69,1,33,113,8,-1
