#### Case Study
##### _Clustering_
1. Lakukan clustering pada data `abalone_cluster.csv`
2. Gunakan model berikut sebagai percobaan (dengan mencoba 2 _cluster_):
 * _K-Means_
 * _Agglomerative Hierarchical Clustering_
3. Lakukan evaluasi dari model tersebut dengan _silhouette score_.


##### _Association rules_
1. _Load_ data pada `flowers.txt`
2. Ubah data sehingga memiliki format _one hot encoding_
3. Dapatkan _frequent itemsets_ dengan _support_ di atas 20%
4. Buat _rules_ dengan _metric_
 * _Confidence_ > 0.6
 * _Lift_ > 1.5

## Clustering

### Lakukan clustering pada data abalone_cluster.csv


In [21]:
import pandas as pd
import numpy as np

In [28]:
data = pd.read_csv('..//..//data//input//abalone_cluster.csv')
data.head(10)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,ring
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


### 2.Gunakan model berikut sebagai percobaan (dengan mencoba 2 cluster):
#### K-Means
#### Agglomerative Hierarchical Clustering


In [35]:
### saya tambahkan MinMaxScaler sebelum clustering

In [29]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(data)
data_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns)
data_scaled.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,ring
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0.5
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0.214286
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,0.285714
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,0.321429
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,0.214286


In [30]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering



In [None]:
###Agglomerative Hierarchical Clustering

In [33]:
agglo = AgglomerativeClustering(n_clusters=5)
data['cluster1'] = agglo.fit(data_scaled)
data.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,ring,cluster1,cluster2
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=4)
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=4)
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=4)
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=4)
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=4)


In [None]:
### K-Means

In [34]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)
kmeans.fit(data_scaled)
data['cluster2'] = kmeans.fit(data_scaled)
data.head()



Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,ring,cluster1,cluster2
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=5)
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=5)
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=5)
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=5)
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,AgglomerativeClustering(n_clusters=5),KMeans(n_clusters=5)


### 3.Lakukan evaluasi dari model tersebut dengan silhouette score


In [36]:
from sklearn.metrics import silhouette_score


In [37]:
silhouette_score(data_scaled, agglo.fit_predict(data_scaled))

0.30728395051936375

In [38]:
silhouette_score(data_scaled, kmeans.fit_predict(data_scaled))



0.34727562515166743

### Association Rules

#### 1.Load data pada flowers.txt

In [42]:
import pandas as pd
import numpy as np


In [45]:
with open('..//..//data//input//flowers.txt') as f:
    transactions = f.read()
print(transactions)

Bougenvile, Dandelion, Lavender
Orchid, Tulip, Bougenvile, Lotus
Lotus
Orchid, Tulip, Sakura, Lotus, Dandelion, Lavender
Orchid, Rose, Tulip, Bougenvile, Lavender
Sunflower, Sakura, Bougenvile, Dandelion
Jasmine, Rose, Tulip, Sunflower, Lavender
Orchid, Jasmine, Rose, Sunflower, Sakura, Bougenvile, Lotus, Dandelion
Orchid, Tulip, Sunflower, Lotus, Lavender
Orchid, Jasmine, Sunflower, Bougenvile
Orchid, Jasmine, Tulip, Sunflower, Bougenvile, Lotus, Lavender
Orchid, Jasmine, Rose, Dandelion
Jasmine, Rose, Sunflower, Bougenvile, Lotus, Lavender
Orchid, Jasmine, Tulip, Sakura, Lavender
Jasmine, Rose, Lotus, Dandelion
Jasmine, Tulip, Sunflower, Sakura, Dandelion, Lavender
Orchid, Jasmine, Sunflower, Sakura, Lavender
Jasmine, Rose, Sunflower, Sakura, Bougenvile, Lotus, Dandelion
Tulip, Sakura, Bougenvile, Lotus, Dandelion
Sakura, Lotus, Dandelion, Lavender
Jasmine, Rose, Tulip, Lotus, Dandelion
Jasmine, Tulip, Sunflower, Bougenvile, Lavender
Jasmine, Sunflower, Bougenvile, Lavender
Orchid, T

### 2.Ubah data sehingga memiliki format one hot encoding

In [47]:
transactions = transactions.split('\n')
transactions = [trx.split(',')for trx in transactions]
transactions

[['Bougenvile', ' Dandelion', ' Lavender'],
 ['Orchid', ' Tulip', ' Bougenvile', ' Lotus'],
 ['Lotus'],
 ['Orchid', ' Tulip', ' Sakura', ' Lotus', ' Dandelion', ' Lavender'],
 ['Orchid', ' Rose', ' Tulip', ' Bougenvile', ' Lavender'],
 ['Sunflower', ' Sakura', ' Bougenvile', ' Dandelion'],
 ['Jasmine', ' Rose', ' Tulip', ' Sunflower', ' Lavender'],
 ['Orchid',
  ' Jasmine',
  ' Rose',
  ' Sunflower',
  ' Sakura',
  ' Bougenvile',
  ' Lotus',
  ' Dandelion'],
 ['Orchid', ' Tulip', ' Sunflower', ' Lotus', ' Lavender'],
 ['Orchid', ' Jasmine', ' Sunflower', ' Bougenvile'],
 ['Orchid',
  ' Jasmine',
  ' Tulip',
  ' Sunflower',
  ' Bougenvile',
  ' Lotus',
  ' Lavender'],
 ['Orchid', ' Jasmine', ' Rose', ' Dandelion'],
 ['Jasmine', ' Rose', ' Sunflower', ' Bougenvile', ' Lotus', ' Lavender'],
 ['Orchid', ' Jasmine', ' Tulip', ' Sakura', ' Lavender'],
 ['Jasmine', ' Rose', ' Lotus', ' Dandelion'],
 ['Jasmine', ' Tulip', ' Sunflower', ' Sakura', ' Dandelion', ' Lavender'],
 ['Orchid', ' Jasmi

In [48]:
#membuat item menjadi angka binary/ one-hot coding
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

vector = mlb.fit_transform(transactions)
items = mlb.classes_

data = pd.DataFrame(vector, columns=items)
data

Unnamed: 0,Bougenvile,Dandelion,Jasmine,Lavender,Lotus,Rose,Sakura,Sunflower,Tulip,Bougenvile.1,Jasmine.1,Lotus.1,Orchid,Rose.1,Sakura.1,Sunflower.1,Tulip.1
0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0
4,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0
71,0,1,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0
72,1,0,1,1,1,0,1,0,0,0,0,0,1,0,0,0,0
73,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0


### 3.Dapatkan frequent itemsets dengan support di atas 20%

In [49]:
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(data, min_support=0.2, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.493333,( Bougenvile)
1,0.546667,( Dandelion)
2,0.28,( Jasmine)
3,0.453333,( Lavender)
4,0.466667,( Lotus)
5,0.346667,( Rose)
6,0.546667,( Sakura)
7,0.48,( Sunflower)
8,0.48,( Tulip)
9,0.226667,(Jasmine)


In [50]:
type(frequent_itemsets)

pandas.core.frame.DataFrame

In [51]:
#mengurutkan dari yang terbesar ke yang terkecil
frequent_itemsets.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
1,0.546667,( Dandelion)
6,0.546667,( Sakura)
10,0.506667,(Orchid)
0,0.493333,( Bougenvile)
8,0.48,( Tulip)
7,0.48,( Sunflower)
4,0.466667,( Lotus)
3,0.453333,( Lavender)
22,0.346667,"( Dandelion, Sakura)"
5,0.346667,( Rose)


## 4.Buat rules dengan metric
### 1.Confidence > 0.6
### 2.Lift > 1.5

In [52]:
from mlxtend.frequent_patterns import association_rules

In [53]:
rules1 = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
rules1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( Rose),( Dandelion),0.346667,0.546667,0.253333,0.730769,1.336773,0.063822,1.68381,0.385607
1,( Dandelion),( Sakura),0.546667,0.546667,0.346667,0.634146,1.160024,0.047822,1.239111,0.304299
2,( Sakura),( Dandelion),0.546667,0.546667,0.346667,0.634146,1.160024,0.047822,1.239111,0.304299
3,( Jasmine),(Orchid),0.28,0.506667,0.28,1.0,1.973684,0.138133,inf,0.685185
4,( Rose),( Lotus),0.346667,0.466667,0.24,0.692308,1.483516,0.078222,1.733333,0.498866
5,( Lotus),( Tulip),0.466667,0.48,0.28,0.6,1.25,0.056,1.3,0.375
6,( Lotus),(Orchid),0.466667,0.506667,0.28,0.6,1.184211,0.043556,1.233333,0.291667
7,( Rose),(Orchid),0.346667,0.506667,0.253333,0.730769,1.442308,0.077689,1.832381,0.469388
8,(Orchid),( Sunflower),0.506667,0.48,0.32,0.631579,1.315789,0.0768,1.411429,0.486486
9,( Sunflower),(Orchid),0.48,0.506667,0.32,0.666667,1.315789,0.0768,1.48,0.461538


In [54]:
rules1 = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
rules1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( Jasmine),(Orchid),0.28,0.506667,0.28,1.0,1.973684,0.138133,inf,0.685185


In [56]:
rules2 = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Orchid),( Jasmine),0.506667,0.28,0.28,0.552632,1.973684,0.138133,1.609412,1.0
1,( Jasmine),(Orchid),0.28,0.506667,0.28,1.0,1.973684,0.138133,inf,0.685185
2,"(Orchid, Dandelion)",( Rose),0.293333,0.346667,0.2,0.681818,1.966783,0.098311,2.053333,0.695597
3,"( Rose, Dandelion)",(Orchid),0.253333,0.506667,0.2,0.789474,1.558172,0.071644,2.343333,0.479762
4,(Orchid),"( Rose, Dandelion)",0.506667,0.253333,0.2,0.394737,1.558172,0.071644,1.233623,0.726126
5,( Rose),"(Orchid, Dandelion)",0.346667,0.293333,0.2,0.576923,1.966783,0.098311,1.670303,0.752381


In [57]:
sorted_lift_rules = rules2.sort_values(by='lift', ascending=False)


In [58]:
sorted_lift_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Orchid),( Jasmine),0.506667,0.28,0.28,0.552632,1.973684,0.138133,1.609412,1.0
1,( Jasmine),(Orchid),0.28,0.506667,0.28,1.0,1.973684,0.138133,inf,0.685185
2,"(Orchid, Dandelion)",( Rose),0.293333,0.346667,0.2,0.681818,1.966783,0.098311,2.053333,0.695597
5,( Rose),"(Orchid, Dandelion)",0.346667,0.293333,0.2,0.576923,1.966783,0.098311,1.670303,0.752381
3,"( Rose, Dandelion)",(Orchid),0.253333,0.506667,0.2,0.789474,1.558172,0.071644,2.343333,0.479762
4,(Orchid),"( Rose, Dandelion)",0.506667,0.253333,0.2,0.394737,1.558172,0.071644,1.233623,0.726126
