# คำชี้แจง
ในสัปดาห์นี้ เราจะฝึกใช้ library scikit-learn และ umap-learn เพื่อทำการลดมิติข้อมูลสำหรับแสดงผลและตีความ

บน Colab ไม่มี umap-learn ดังนั้นต้อง install ก่อน

In [None]:
!pip install umap-learn

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs read later in this
# notebook live under /content/drive/MyDrive/Datasets.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS

# ข้อมูลความแข็งแรงของคอนกรีต

In [None]:
# Load the concrete-strength table; the first CSV row supplies the column names.
data = pd.read_csv(
    '/content/drive/MyDrive/Datasets/ConcreteStrength.csv',
    index_col=None,
    header=0,
)
# Keep only the text before the first '(' in every header and trim whitespace.
data.columns = [name.partition('(')[0].strip() for name in data.columns]
data.head()

## ใช้เฉพาะตัวแปร input
ทำการ standardize

In [None]:
# Every column except the last one is kept as an input variable.
input_data = data.iloc[:, :-1]
# z-score per column: subtract the column mean, divide by the column SD.
std_data = input_data.sub(input_data.mean(), axis='columns').div(input_data.std(), axis='columns')

std_data.head()

# เริ่มจาก PCA บนข้อมูลทุกจุด

In [None]:
# Fit a PCA on the standardized inputs and project the same rows onto it.
pca = PCA(random_state=25)
pca.fit(std_data)
pca_embed = pca.transform(std_data)

component_ids = range(1, pca.n_components_ + 1)
cumulative = np.cumsum(pca.explained_variance_ratio_)

fig = plt.figure(figsize=(10, 4))

# Left panel: explained variance ratio of each component.
ax = fig.add_subplot(1, 2, 1)
ax.bar(component_ids, pca.explained_variance_ratio_)
ax.set_xlabel('PCA component')
ax.set_ylabel('explained variance ratio')

# Right panel: running total with a dashed reference line at 0.95.
ax = fig.add_subplot(1, 2, 2)
ax.bar(component_ids, cumulative)
ax.plot([1, std_data.shape[1]], [0.95, 0.95], '--', color='tab:orange')
ax.set_xlabel('PCA component')
ax.set_ylabel('explained variance ratio')
ax.set_title('cumulative')

fig.tight_layout()
plt.show()

### ดู embedding ของข้อมูลบน PC1, PC2, PC3

In [None]:
fig = plt.figure(figsize=(10, 5))

# PC1 on the x-axis against PC2 (left panel) and PC3 (right panel).
for panel, pc_index in enumerate((1, 2), start=1):
    ax = fig.add_subplot(1, 2, panel)
    ax.scatter(pca_embed[:, 0], pca_embed[:, pc_index])
    ax.set_xlabel('PCA component 1')
    ax.set_ylabel('PCA component ' + str(pc_index + 1))

fig.tight_layout()
plt.show()

### ระบายสีด้วยค่าตัวแปรต่าง ๆ เพื่อทำความเข้าใจ

In [None]:
# One PC1/PC2 panel per variable, each coloured by that variable's raw value.
color_by = ['Concrete compressive strength', 'Age', 'Cement', 'Water', 'Fly Ash', 'Fine Aggregate']

fig = plt.figure(figsize=(15, 10))

for panel, feature in enumerate(color_by, start=1):
    ax = fig.add_subplot(2, 3, panel)
    ax.scatter(pca_embed[:, 0], pca_embed[:, 1], c=data[feature])
    ax.set_xlabel('PCA component 1')
    ax.set_ylabel('PCA component 2')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

## สนใจเฉพาะส่วนผสมของคอนกรีต
ตัดตัวแปรอายุออก

In [None]:
# Drop the last standardized column (Age, according to the section text) and
# keep a single row per distinct mix.
mix_data = std_data.iloc[:, :-1].drop_duplicates()
print(std_data.shape[0], mix_data.shape[0])

# Refit the PCA on the reduced table and project the same rows.
pca = PCA(random_state = 25).fit(mix_data)
pca_embed = pca.transform(mix_data)

plt.figure(figsize = (10, 4))

# Left panel: explained variance ratio of each component.
plt.subplot(1, 2, 1)
plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_)
plt.xlabel('PCA component')
plt.ylabel('explained variance ratio')

# Right panel: running total with a dashed reference line at 0.95.
plt.subplot(1, 2, 2)
cumulative = np.cumsum(pca.explained_variance_ratio_)
plt.bar(range(1, pca.n_components_ + 1), cumulative)
# The reference line spans the components that were actually fitted; using
# std_data.shape[1] here would overshoot by one because a column was dropped.
plt.plot([1, pca.n_components_], [0.95, 0.95], '--', color = 'tab:orange')
plt.xlabel('PCA component')
plt.ylabel('explained variance ratio')
plt.title('cumulative')

plt.tight_layout()
plt.show()

In [None]:
# PC1/PC2 of the de-duplicated mixes, one panel per colouring variable.
color_by = ['Concrete compressive strength', 'Age', 'Cement', 'Water', 'Fly Ash', 'Fine Aggregate']

fig = plt.figure(figsize=(15, 10))

for panel, feature in enumerate(color_by, start=1):
    ax = fig.add_subplot(2, 3, panel)
    # Pick the colour values for exactly the rows that survived drop_duplicates.
    ax.scatter(pca_embed[:, 0], pca_embed[:, 1], c=data.loc[mix_data.index, feature])
    ax.set_xlabel('PCA component 1')
    ax.set_ylabel('PCA component 2')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

### PC1 ดูจะให้น้ำหนักกับ Cement และ Fly Ash

In [None]:
# Weights of the first principal component, one entry per input column.
print(pca.components_[0, :])

In [None]:
# Bar chart of the PC1 weights, labelled with the mix_data column names.
fig, ax = plt.subplots(figsize=(7, 4))
positions = range(mix_data.shape[1])
ax.bar(positions, pca.components_[0])
ax.set_xticks(positions)
ax.set_xticklabels(mix_data.columns, rotation=90)
ax.set_ylabel('PC1 loading')
plt.show()

### PC2 ให้ความสำคัญกับ Blast Furnace Slag และ Water

In [None]:
# Bar chart of the PC2 weights, labelled with the mix_data column names.
n_features = mix_data.shape[1]
fig = plt.figure(figsize=(7, 4))
ax = fig.add_subplot(1, 1, 1)
ax.bar(range(n_features), pca.components_[1])
ax.set_xticks(range(n_features))
ax.set_xticklabels(mix_data.columns, rotation=90)
ax.set_ylabel('PC2 loading')
plt.show()

## จริง ๆ แล้ว PCA ต้องการแค่ centering (mean = 0) แต่ไม่ต้องสเกลก็ได้

In [None]:
# Centre each input column on zero (no scaling), then drop the last column and
# remove duplicate rows, mirroring how mix_data was built.
center_data = (
    input_data.sub(input_data.mean(), axis='columns')
    .iloc[:, :-1]
    .drop_duplicates()
)

In [None]:
# PCA on the centred-but-unscaled mix data.
pca = PCA(random_state=25)
pca.fit(center_data)
pca_embed = pca.transform(center_data)

ratios = pca.explained_variance_ratio_
component_ids = range(1, pca.n_components_ + 1)
cumulative = np.cumsum(ratios)

fig = plt.figure(figsize=(10, 4))

# Left panel: per-component explained variance ratio.
left = fig.add_subplot(1, 2, 1)
left.bar(component_ids, ratios)
left.set_xlabel('PCA component')
left.set_ylabel('explained variance ratio')

# Right panel: cumulative ratio with a dashed line at 0.95.
right = fig.add_subplot(1, 2, 2)
right.bar(component_ids, cumulative)
right.plot([1, pca.n_components_], [0.95, 0.95], '--', color='tab:orange')
right.set_xlabel('PCA component')
right.set_ylabel('explained variance ratio')
right.set_title('cumulative')

fig.tight_layout()
plt.show()

### ตรวจสอบ loading ใหม่

In [None]:
# Heat map of all component weights: rows are components, columns are features.
fig, ax = plt.subplots(figsize=(7, 5))
# Fixed colour range of [-1, 1] so that zero sits at the centre of the RdBu map.
heat = ax.imshow(pca.components_, cmap='RdBu', vmin=-1, vmax=1)
ax.set_xticks(range(pca.components_.shape[1]))
ax.set_xticklabels(center_data.columns, rotation=90)
ax.set_yticks(range(pca.n_components_))
ax.set_yticklabels([f'PC{rank}' for rank in range(1, pca.n_components_ + 1)])
fig.colorbar(heat, ax=ax)
plt.show()

### เปรียบเทียบกับ variance ในข้อมูล
ดูสาเหตุที่ PCA ใหม่ไม่สนใจ Water และ Superplasticizer

In [None]:
# Standard deviation of each centred (unscaled) column.
fig, ax = plt.subplots(figsize=(7, 4))
positions = range(center_data.shape[1])
ax.bar(positions, center_data.std())
ax.set_xticks(positions)
ax.set_xticklabels(center_data.columns, rotation=90)
ax.set_ylabel('SD')
plt.show()

# ใช้ t-SNE
ระบายสีด้วย compressive strength

In [None]:
perplexities = [5, 15, 25, 50]
# Top row is coloured by the first column listed, bottom row by the second.
color_columns = ['Concrete compressive strength', 'Age']

fig = plt.figure(figsize=(15, 7))

for column_pos, k in enumerate(perplexities, start=1):
    # One t-SNE run per perplexity; both rows reuse the same embedding.
    tsne_embed = TSNE(n_components=2, perplexity=k, random_state=25).fit_transform(std_data)

    for row, color_column in enumerate(color_columns):
        ax = fig.add_subplot(2, 4, column_pos + 4 * row)
        ax.scatter(tsne_embed[:, 0], tsne_embed[:, 1], c=data[color_column])
        ax.set_xlabel('t-SNE 1')
        ax.set_ylabel('t-SNE 2')
        ax.set_title('perplexity = ' + str(k))

fig.tight_layout()
plt.show()

ถ้าตั้งค่า perplexity ต่ำเกินไป ข้อมูลจะแตกเป็นกลุ่มย่อย ๆ โดยทั่วไปนิยมลองใช้ค่าระหว่าง 5 - 50

สำหรับกรณีนี้ 25 หรือ 50 ดูใช้ได้

# ใช้ UMAP

In [None]:
neighbor_counts = [5, 15, 25, 50]
# Top row is coloured by the first column listed, bottom row by the second.
color_columns = ['Concrete compressive strength', 'Age']

fig = plt.figure(figsize=(15, 7))

for column_pos, n in enumerate(neighbor_counts, start=1):
    # One UMAP run per n_neighbors value; both rows reuse the same embedding.
    reducer = umap.UMAP(n_components=2, n_neighbors=n, random_state=25)
    umap_embed = reducer.fit_transform(std_data)

    for row, color_column in enumerate(color_columns):
        ax = fig.add_subplot(2, 4, column_pos + 4 * row)
        ax.scatter(umap_embed[:, 0], umap_embed[:, 1], c=data[color_column])
        ax.set_xlabel('UMAP 1')
        ax.set_ylabel('UMAP 2')
        ax.set_title('n_neighbors = ' + str(n))

fig.tight_layout()
plt.show()

## ปรับค่า min_dist เพื่อเพิ่มหรือลดการกระจายตัวของจุด (สำหรับดูด้วยตา)

In [None]:
dists = [0.1, 0.5, 1.0]

fig = plt.figure(figsize=(12, 4))

# n_neighbors stays at 50; only min_dist changes from panel to panel.
for panel, d in enumerate(dists, start=1):
    reducer = umap.UMAP(n_components=2, n_neighbors=50, min_dist=d, random_state=25)
    umap_embed = reducer.fit_transform(std_data)

    ax = fig.add_subplot(1, 3, panel)
    ax.scatter(umap_embed[:, 0], umap_embed[:, 1], c=data['Concrete compressive strength'])
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title('min_dist = ' + str(d))

fig.tight_layout()
plt.show()

## ดูการกระจายตัวของตัวแปรต่าง ๆ บน UMAP

In [None]:
# A single UMAP embedding, redrawn once per column of `data` with that column as colour.
reducer = umap.UMAP(n_components=2, n_neighbors=50, min_dist=0.5, random_state=25)
umap_embed = reducer.fit_transform(std_data)

fig = plt.figure(figsize=(15, 6))

for panel, feature in enumerate(data.columns, start=1):
    ax = fig.add_subplot(2, 5, panel)
    ax.scatter(umap_embed[:, 0], umap_embed[:, 1], c=data[feature])
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

# ข้อมูลราคาบ้าน

In [None]:
# Load the house-price table; the first CSV column becomes the row index.
data = pd.read_csv(
    '/content/drive/MyDrive/Datasets/HousePrices.csv',
    index_col=0,
    header=0,
)
data.head()

## จัดการเรื่อง missing data

In [None]:
# Number of missing cells per column, and the names of the columns that have any.
missing_count = data.isna().sum()
with_missing = missing_count[missing_count > 0].index

fig, ax = plt.subplots()
rows = range(len(with_missing))
ax.barh(rows, missing_count[with_missing])
ax.set_yticks(rows)
ax.set_yticklabels(with_missing)
ax.set_xlabel('missing value count')
plt.show()

ตัดตัวแปรที่หายไปมาก ๆ ออก

In [None]:
# Discard every column with more than 500 missing values.
too_sparse = missing_count[missing_count > 500].index
filtered_data = data.drop(columns=too_sparse)
print(f'from {data.shape[1]} to {filtered_data.shape[1]} features')

เติมค่าด้วย mode หรือ mean ตามชนิดของตัวแปร

In [None]:
# Work on a copy: GarageType gaps get the most frequent category, and
# LotFrontage gaps get the column mean, after which the column is rounded.
imputed_data = filtered_data.assign(
    GarageType=filtered_data['GarageType'].fillna(filtered_data['GarageType'].mode()[0]),
    LotFrontage=filtered_data['LotFrontage'].fillna(filtered_data['LotFrontage'].mean()).round(),
)

### ดึงเฉพาะ feature ที่เป็นตัวเลขออกมา

In [None]:
# Keep the numeric columns and leave out the last of them.
# NOTE(review): only GarageType and LotFrontage are imputed earlier — confirm no
# other numeric column still holds NaN before this is passed to PCA.
numeric_data = imputed_data.select_dtypes(include='number').iloc[:, :-1]
# Column-wise z-score.
std_data = numeric_data.sub(numeric_data.mean(), axis='columns').div(numeric_data.std(), axis='columns')

## ใช้ PCA

In [None]:
# Fit a PCA on the standardized numeric house features and project them.
pca = PCA(random_state=25)
pca.fit(std_data)
pca_embed = pca.transform(std_data)

component_ids = range(1, pca.n_components_ + 1)
cumulative = np.cumsum(pca.explained_variance_ratio_)

fig = plt.figure(figsize=(10, 4))

# Left panel: explained variance ratio per component.
ax_each = fig.add_subplot(1, 2, 1)
ax_each.bar(component_ids, pca.explained_variance_ratio_)
ax_each.set_xlabel('PCA component')
ax_each.set_ylabel('explained variance ratio')

# Right panel: cumulative ratio with a dashed reference line at 0.95.
ax_cum = fig.add_subplot(1, 2, 2)
ax_cum.bar(component_ids, cumulative)
ax_cum.plot([1, std_data.shape[1]], [0.95, 0.95], '--', color='tab:orange')
ax_cum.set_xlabel('PCA component')
ax_cum.set_ylabel('explained variance ratio')
ax_cum.set_title('cumulative')

fig.tight_layout()
plt.show()

### ดู loading ของ PC1 ที่ variance สูง

In [None]:
# PC1 weights of the house-price PCA, one bar per numeric feature.
fig, ax = plt.subplots(figsize=(14, 4))
positions = range(numeric_data.shape[1])
ax.bar(positions, pca.components_[0])
ax.set_xticks(positions)
ax.set_xticklabels(numeric_data.columns, rotation=90)
ax.set_ylabel('PC1 loading')
plt.show()

### ระบายสีด้วยตัวแปรต่าง ๆ

In [None]:
# PC1/PC2 scatter of the house data, one panel per colouring variable.
plt.figure(figsize = (12, 3))

for i, feature in enumerate(['SalePrice', 'OverallQual', 'GarageArea', 'Fence'], start = 1):
    color_values = data[feature]
    if not pd.api.types.is_numeric_dtype(color_values):
        # scatter can only colour-map numbers, so a text/categorical column is
        # replaced by integer category codes (missing values become -1).
        # NOTE(review): 'Fence' is presumably such a column — confirm against the CSV.
        color_values = color_values.astype('category').cat.codes

    plt.subplot(1, 4, i)
    plt.scatter(pca_embed[:, 0], pca_embed[:, 1], c = color_values);
    plt.xlabel('PCA component 1'); plt.ylabel('PCA component 2')
    plt.title(feature)

plt.tight_layout()
plt.show()

## ใช้ t-SNE

In [None]:
perplexities = [5, 15, 25, 50]

fig = plt.figure(figsize=(12, 3))

# One t-SNE embedding per perplexity, coloured by SalePrice.
for panel, k in enumerate(perplexities, start=1):
    tsne_embed = TSNE(n_components=2, perplexity=k, random_state=25).fit_transform(std_data)

    ax = fig.add_subplot(1, 4, panel)
    ax.scatter(tsne_embed[:, 0], tsne_embed[:, 1], c=data['SalePrice'])
    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')
    ax.set_title('perplexity = ' + str(k))

fig.tight_layout()
plt.show()

## ใช้ UMAP

In [None]:
neighbor_counts = [5, 15, 25, 50]

fig = plt.figure(figsize=(12, 3))

# One UMAP embedding per n_neighbors value (min_dist fixed at 0.5), coloured by SalePrice.
for panel, n in enumerate(neighbor_counts, start=1):
    reducer = umap.UMAP(n_components=2, n_neighbors=n, min_dist=0.5, random_state=25)
    umap_embed = reducer.fit_transform(std_data)

    ax = fig.add_subplot(1, 4, panel)
    ax.scatter(umap_embed[:, 0], umap_embed[:, 1], c=data['SalePrice'])
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title('n_neighbors = ' + str(n))

fig.tight_layout()
plt.show()

### ระบายสีด้วยตัวแปร 10 ตัวแรก

In [None]:
# A single UMAP embedding, redrawn for each of the first ten numeric features.
reducer = umap.UMAP(n_components=2, n_neighbors=25, min_dist=0.5, random_state=25)
umap_embed = reducer.fit_transform(std_data)

fig = plt.figure(figsize=(15, 6))

for panel, feature in enumerate(numeric_data.columns[:10], start=1):
    ax = fig.add_subplot(2, 5, panel)
    ax.scatter(umap_embed[:, 0], umap_embed[:, 1], c=numeric_data[feature])
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

# ข้อมูลกลุ่มลูกค้า (CustomerSegment)
จัดกลุ่มลูกค้าจากข้อมูลส่วนบุคคล
* ตัวแปร Work_Experience มีข้อมูลหายมากเกินไป
* ตัวแปร Profession ไม่สามารถ impute ได้

In [None]:
# Load the customer table (first CSV column is the index), drop the
# Work_Experience column, and keep only rows whose Profession is present.
data = pd.read_csv('/content/drive/MyDrive/Datasets/CustomerSegment.csv', index_col=0)
data = data.drop(columns='Work_Experience')
data = data.dropna(subset=['Profession'])
data.head()

## impute ข้อมูล

In [None]:
categorical_features = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']

imputed_data = data.copy()

# Categorical columns: fill gaps with each column's most frequent value.
category_modes = data[categorical_features].mode().iloc[0]
imputed_data[categorical_features] = imputed_data[categorical_features].fillna(category_modes)

# Family_Size: fill gaps with the mean rounded to the nearest whole number
# (a bare int cast would always round the mean down). Assigning the column
# directly, instead of through .loc[:, ...], lets the integer dtype take effect.
family_size_fill = int(round(data['Family_Size'].mean()))
imputed_data['Family_Size'] = imputed_data['Family_Size'].fillna(family_size_fill).astype(int)

In [None]:
# Remaining missing values per column after imputation.
imputed_data.isna().sum()

## แปลงข้อมูลตัวอักษรให้เป็นตัวเลข

In [None]:
# Start from the imputed table without the columns that are not encoded here.
numeric_data = imputed_data.drop(columns = ['Profession', 'Var_1', 'Segmentation'])

# Binary flags: 1 for 'Male' / 'Yes', 0 otherwise.
# The columns are assigned directly rather than through .loc[:, col]: on recent
# pandas the .loc form writes into the existing object-dtype column, so the
# result would stay object instead of becoming an integer column.
numeric_data['Gender'] = (imputed_data['Gender'] == 'Male').astype(int)
numeric_data['Ever_Married'] = (imputed_data['Ever_Married'] == 'Yes').astype(int)
numeric_data['Graduated'] = (imputed_data['Graduated'] == 'Yes').astype(int)

# Ordinal encoding of the spending score.
numeric_data['Spending_Score'] = imputed_data['Spending_Score'].map({'Low':0, 'Average':1, 'High':2})
numeric_data.head()

## ใช้ PCA

In [None]:
# Column-wise z-score of the encoded customer table.
std_data = numeric_data.sub(numeric_data.mean(), axis='columns').div(numeric_data.std(), axis='columns')

In [None]:
# Fit a PCA on the standardized customer features and project them.
pca = PCA(random_state=25)
pca.fit(std_data)
pca_embed = pca.transform(std_data)

ratios = pca.explained_variance_ratio_
component_ids = range(1, pca.n_components_ + 1)
cumulative = np.cumsum(ratios)

fig = plt.figure(figsize=(10, 4))

# Left panel: explained variance ratio per component.
left = fig.add_subplot(1, 2, 1)
left.bar(component_ids, ratios)
left.set_xlabel('PCA component')
left.set_ylabel('explained variance ratio')

# Right panel: cumulative ratio with a dashed reference line at 0.95.
right = fig.add_subplot(1, 2, 2)
right.bar(component_ids, cumulative)
right.plot([1, std_data.shape[1]], [0.95, 0.95], '--', color='tab:orange')
right.set_xlabel('PCA component')
right.set_ylabel('explained variance ratio')
right.set_title('cumulative')

fig.tight_layout()
plt.show()

In [None]:
# PC1 weights of the customer PCA, one bar per encoded feature.
fig, ax = plt.subplots(figsize=(7, 4))
positions = range(numeric_data.shape[1])
ax.bar(positions, pca.components_[0])
ax.set_xticks(positions)
ax.set_xticklabels(numeric_data.columns, rotation=90)
ax.set_ylabel('PC1 loading')
plt.show()

### ระบายสีด้วยตัวแปรต่าง ๆ

In [None]:
# PC1/PC2 scatter, one panel per encoded feature, coloured by that feature.
fig = plt.figure(figsize=(12, 6))

for panel, feature in enumerate(numeric_data.columns, start=1):
    ax = fig.add_subplot(2, 4, panel)
    ax.scatter(pca_embed[:, 0], pca_embed[:, 1], c=numeric_data[feature])
    ax.set_xlabel('PCA component 1')
    ax.set_ylabel('PCA component 2')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

## ใช้ UMAP

In [None]:
# A single UMAP embedding of the customers, redrawn once per encoded feature.
reducer = umap.UMAP(n_components=2, n_neighbors=25, min_dist=1, random_state=25)
umap_embed = reducer.fit_transform(std_data)

fig = plt.figure(figsize=(12, 6))

for panel, feature in enumerate(numeric_data.columns, start=1):
    ax = fig.add_subplot(2, 4, panel)
    ax.scatter(umap_embed[:, 0], umap_embed[:, 1], c=numeric_data[feature])
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

### ระบายสีด้วยอาชีพ

In [None]:
# PCA (left) and UMAP (right) embeddings with one scatter series per profession.
professions = sorted(imputed_data['Profession'].unique())

# (embedding, x label, y label, draw legend?) — only the PCA panel gets a legend.
panels = [
    (pca_embed, 'PCA component 1', 'PCA component 2', True),
    (umap_embed, 'UMAP1', 'UMAP2', False),
]

fig = plt.figure(figsize=(10, 5))

for position, (embed, x_label, y_label, with_legend) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)

    for prof in professions:
        mask = (imputed_data['Profession'] == prof).to_numpy()
        ax.scatter(embed[mask, 0], embed[mask, 1], label=prof, alpha=0.6)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    if with_legend:
        ax.legend()

fig.tight_layout()
plt.show()

### ระบายสีด้วยกลุ่มลูกค้า

In [None]:
# PCA (left) and UMAP (right) embeddings with one scatter series per customer segment.
segment_names = sorted(imputed_data['Segmentation'].unique())
# Boolean row mask for each segment, computed once and shared by both panels.
segment_masks = {seg: (imputed_data['Segmentation'] == seg).to_numpy() for seg in segment_names}

fig = plt.figure(figsize=(10, 5))

ax_pca = fig.add_subplot(1, 2, 1)
for seg, mask in segment_masks.items():
    ax_pca.scatter(pca_embed[mask, 0], pca_embed[mask, 1], label=seg, alpha=0.4)
ax_pca.set_xlabel('PCA component 1')
ax_pca.set_ylabel('PCA component 2')
ax_pca.legend()

# The UMAP panel is drawn without a legend.
ax_umap = fig.add_subplot(1, 2, 2)
for seg, mask in segment_masks.items():
    ax_umap.scatter(umap_embed[mask, 0], umap_embed[mask, 1], label=seg, alpha=0.4)
ax_umap.set_xlabel('UMAP1')
ax_umap.set_ylabel('UMAP2')

fig.tight_layout()
plt.show()

In [None]:
# Same segment plots, but drawn in reverse-sorted order with an explicit colour per segment.
colors = ['tab:red', 'tab:green', 'tab:orange', 'tab:blue']
segments = sorted(imputed_data['Segmentation'].unique(), reverse=True)

# (embedding, x label, y label, draw legend?) — only the PCA panel gets a legend.
panels = [
    (pca_embed, 'PCA component 1', 'PCA component 2', True),
    (umap_embed, 'UMAP1', 'UMAP2', False),
]

fig = plt.figure(figsize=(10, 5))

for position, (embed, x_label, y_label, with_legend) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)

    # zip stops at the shorter of the two lists.
    for seg, color in zip(segments, colors):
        mask = (imputed_data['Segmentation'] == seg).to_numpy()
        ax.scatter(embed[mask, 0], embed[mask, 1], label=seg, alpha=0.4, c=color)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    if with_legend:
        ax.legend()

fig.tight_layout()
plt.show()