## Exploratory data analysis (part 2)
Unsupervised learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn import datasets, cluster
from matplotlib import pyplot as plt
from scipy import ndimage
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:
# Synthetic 2-D data set: 150 shuffled points drawn around 3 blob centers
# (standard deviation 0.5), seeded so the sample is reproducible.
X, y = make_blobs(
    n_samples=150, n_features=2, centers=3,
    cluster_std=0.5, shuffle=True, random_state=0,
)

In [None]:
# Peek at the first five generated samples.
print(X[:5])

In [None]:
# Scatter of the raw (unlabelled) samples: first feature against second feature.
plt.scatter(X[:, 0], X[:, 1], s=50, marker='o', c='blue')

plt.title('Cluster')
plt.xlabel('X[:,0]')
plt.ylabel('X[:,1]')
plt.grid()
plt.show()

### K-Means
K-Means requires the number of clusters to be specified in advance.

In [None]:
# K-means with 3 clusters: random centroid initialisation, 10 initialisations,
# at most 300 iterations per run, tolerance 1e-4, fixed seed.
km = KMeans(n_clusters=3, init='random', n_init=10,
            max_iter=300, tol=1e-04, random_state=0)

# Fit the model and keep the cluster index assigned to every sample.
y_km = km.fit_predict(X)

In [None]:
# Plot each of the three K-means clusters in its own colour and give every
# cluster a distinct legend entry so they can be told apart.
for cluster_id, colour in enumerate(('lightgreen', 'orange', 'lightblue')):
    in_cluster = (y_km == cluster_id)
    plt.scatter(X[in_cluster, 0],
                X[in_cluster, 1],
                s=50,
                c=colour,
                marker='s',
                label=f'cluster{cluster_id + 1}')

# Overlay the fitted centroids as large red stars.
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            s=250,
            marker='*',
            c='red',
            label='centroids')

plt.legend()
plt.grid()
plt.show()


In [None]:
# Example of why the number of clusters must be known: this time the data is
# generated around 5 centers.
X, y = make_blobs(
    n_samples=150, n_features=2,
    centers=5,  # number of blob centers in the generated data
    cluster_std=0.5, shuffle=True, random_state=0,
)

In [None]:
# Scatter of the new, unlabelled samples (small markers).
plt.scatter(X[:, 0], X[:, 1], s=20, marker='o', c='blue')

plt.grid()
plt.show()

In [None]:
# Fit K-means with 4 clusters. n_init and random_state are set explicitly, as
# in the other KMeans calls in this notebook, so the result is reproducible
# and does not depend on the installed scikit-learn version's n_init default.
km = KMeans(n_clusters=4, n_init=10, random_state=0)

# Cluster index assigned to every sample.
y_km = km.fit_predict(X)

In [None]:
# One scatter call per K-means cluster, each with its own colour and legend
# label, drawn in cluster-index order.
cluster_styles = (
    ('lightgreen', 'cluster1'),
    ('orange', 'cluster2'),
    ('lightblue', 'cluster3'),
    ('pink', 'cluster4'),
)
for cluster_id, (colour, legend_label) in enumerate(cluster_styles):
    members = X[y_km == cluster_id]
    plt.scatter(members[:, 0], members[:, 1],
                s=50, c=colour, marker='s', label=legend_label)

# Fitted centroids as large red stars.
centres = km.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1],
            s=250, marker='*', c='red', label='centroids')

plt.legend()
plt.grid()
plt.show()


In [None]:
# Elbow method: fit K-means for k = 1..10 and record each fitted model's
# inertia_ value.
distortion = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', n_init=10).fit(X)
    distortion.append(km.inertia_)

distortion


In [None]:
# Distortion against the number of clusters; the "elbow" suggests a good k.
n_clusters_tried = range(1, 11)
plt.plot(n_clusters_tried, distortion, marker='o')
plt.title('Elbow method')
plt.xlabel('#Clusters')
plt.ylabel('distortions')
plt.grid()
plt.show()


### Clustering with Iris dataset

In [None]:
# Load the Iris data set, keep its feature matrix and show the first five rows.
iris = datasets.load_iris()
X_iris = iris['data']
X_iris[:5]

#### Elbow method

In [None]:
# Elbow method on Iris: record the inertia_ of K-means fits for k = 1..10.
# init='k-means++' (the default) selects the initial cluster centers in a
# smart way to speed up convergence.
distortion = []
for k in range(1, 11):
    km = cluster.KMeans(n_clusters=k, n_init=10, init='k-means++').fit(X_iris)
    distortion.append(km.inertia_)

In [None]:
# Distortion against the number of clusters for the Iris data.
n_clusters_tried = range(1, 11)
plt.plot(n_clusters_tried, distortion, marker='o')
plt.title('IRIS dataset')
plt.xlabel('#Clusters')
plt.ylabel('distortions')
plt.grid()
plt.show()

#### k-means clustering

In [None]:
# Fit a 3-cluster K-means model on the Iris features (fit() returns the
# estimator itself) and display the parameters it was configured with.
k_means = cluster.KMeans(n_clusters=3, n_init=10).fit(X_iris)
k_means.get_params()

In [None]:
# Cluster index assigned to each sample the model was fitted on
k_means.labels_

In [None]:
# Coordinates of the fitted cluster centroids (one row per cluster)
k_means.cluster_centers_

In [None]:
# Inertia: sum of squared distances of samples to their closest cluster
# center, weighted by the sample weights if provided.
k_means.inertia_

In [None]:
# Two hand-made samples with four feature values each, to be assigned to
# clusters by the fitted model.
unseen_data = np.array([
    [5.5, 2.5, 4.5, 1.5],
    [7.0, 3.0, 6.0, 2.0],
])


In [None]:
# Assign each unseen sample to its nearest fitted cluster and report the
# result, with a blank-ish separator line between the two samples.
predictions = k_means.predict(unseen_data)
for position, (sample, assigned) in enumerate(zip(unseen_data, predictions), start=1):
    if position > 1:
        print(' ')
    print(f'unseen_data {position}:', sample)
    print('predicted cluster:', assigned)

#### Clustering in map

In [None]:
# Read the map image from disk as an array and display it.
image_path = 'Data/europe.jpg'
my_image = plt.imread(image_path)

plt.imshow(my_image)
plt.show()


In [None]:
# Flatten the image to one row per pixel while keeping the last axis as the
# columns, then show the resulting shape.
x, y, z = my_image.shape
my_image_2d = my_image.reshape(-1, z)
my_image_2d.shape

In [None]:
# Cluster the pixels into 2 groups and keep the per-pixel cluster index and
# the centroid of each cluster.
kmeans = cluster.KMeans(n_clusters=2, n_init=10).fit(my_image_2d)
cluster_labels, cluster_centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Replace every pixel by the centroid of its cluster and restore the original
# image shape.
output = cluster_centroids[cluster_labels].reshape(x, y, z)

# Matplotlib's imread returns 0-255 integers for JPEG files, so the centroids
# are already on the 0-255 scale; multiplying by 255 again would wrap around
# when cast to uint8. Only rescale when the source image is float (0-1 range).
scale = 255 if np.issubdtype(my_image.dtype, np.floating) else 1
plt.imshow(np.clip(np.rint(output * scale), 0, 255).astype(np.uint8))

In [None]:
# Cluster the pixels into 3 groups and keep the per-pixel cluster index and
# the centroid of each cluster.
kmeans = cluster.KMeans(n_clusters=3, n_init=10).fit(my_image_2d)
cluster_labels, cluster_centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Replace every pixel by the centroid of its cluster and restore the original
# image shape.
my_image_3d = cluster_centroids[cluster_labels].reshape(x, y, z)

# Matplotlib's imread returns 0-255 integers for JPEG files, so the centroids
# are already on the 0-255 scale; multiplying by 255 again would wrap around
# when cast to uint8. Only rescale when the source image is float (0-1 range).
scale = 255 if np.issubdtype(my_image.dtype, np.floating) else 1
plt.imshow(np.clip(np.rint(my_image_3d * scale), 0, 255).astype(np.uint8))

In [None]:
def generate_2D_clusters(n_samples, n_centers):
    """Generate a synthetic two-feature data set of Gaussian blobs.

    Parameters
    ----------
    n_samples : int
        Number of points to generate, passed to ``make_blobs``.
    n_centers : int
        Number of blob centers, passed to ``make_blobs`` as ``centers``.

    Returns
    -------
    tuple
        The ``(X, y)`` pair returned by ``make_blobs``.
    """
    n_features = 2
    # `centers` is keyword-only in current scikit-learn, so every argument is
    # passed by name instead of by position.
    X, y = make_blobs(n_samples=n_samples,
                      n_features=n_features,
                      centers=n_centers,
                      cluster_std=0.5)
    return X, y

In [None]:
def predict_and_plot_2D_clusters(n_samples, n_clusters):
    """Generate 2-D blobs, cluster them with K-means and plot the result.

    Parameters
    ----------
    n_samples : int
        Number of points to generate.
    n_clusters : int
        Used both as the number of generated blob centers and as the number
        of K-means clusters.

    Returns
    -------
    None
        The function only draws and shows a matplotlib figure.
    """
    X, _ = generate_2D_clusters(n_samples, n_clusters)
    # n_init is set explicitly, as in the other KMeans calls in this notebook.
    km = KMeans(n_clusters=n_clusters, n_init=10)
    labels = km.fit_predict(X)

    # One scatter per predicted cluster, each with its own legend entry.
    for clus in range(n_clusters):
        members = (labels == clus)
        plt.scatter(X[members, 0],
                    X[members, 1],
                    label=f'cluster {clus + 1}')

    # Centroids are drawn once, after the clusters, so they stay on top and
    # appear only once in the legend.
    plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                s=250, marker='*', c='red', label='centroids')

    plt.xlabel('X[:,0]')
    plt.ylabel('X[:,1]')
    plt.title('Clusters')
    plt.legend()
    plt.grid()
    plt.show()
    


In [None]:
# predict_and_plot_2D_clusters(250, 6)

### Principal Component Analysis (PCA)
PCA is an unsupervised transformation used for dimensionality reduction.

In [None]:
# Feature matrix and class labels taken from the loaded Iris data set.
# (The stray bare `iris` expression was dropped: it had no effect.)
X_iris = iris.data
y_iris = iris.target

In [None]:
# Standardise the Iris features with StandardScaler before applying PCA.
iris_scaler = StandardScaler()
X_iris_stand = iris_scaler.fit_transform(X_iris)

In [None]:
# Compare the first ten rows before and after standardisation.
# (Heading typo 'Orgiginal' corrected in the printed output.)
print('Original Iris: \n', X_iris[:10])
print(' ')
print('Standardized Iris: \n', (X_iris_stand[:10]))

In [None]:
# Project the standardised Iris features onto their first two principal
# components; X_iris_PCA holds the transformed samples.
pca = PCA(n_components=2)
X_iris_PCA = pca.fit_transform(X_iris_stand)

In [None]:
# Show the two fitted principal components (the PCA was built with
# n_components=2, so components_ has exactly two rows).
first_component, second_component = pca.components_
print('PC1', first_component)
print(' ')
print('PC2', second_component)


In [None]:
# Put the PCA scores and the class labels side by side in one DataFrame.
df_iris_pca = pd.DataFrame(X_iris_PCA, columns=['PC 1', 'PC 2'])
df_iris_target = pd.DataFrame(y_iris)

df_iris_all = pd.concat([df_iris_pca, df_iris_target], axis=1)
df_iris_all

In [None]:
# One scatter per class actually present in the targets, so no empty scatter
# is drawn for a class id that does not exist; each class gets a legend entry.
for class_id in np.unique(y_iris):
    indices = (df_iris_target.iloc[:, 0] == class_id)
    plt.scatter(df_iris_all.loc[indices, 'PC 1'],
                df_iris_all.loc[indices, 'PC 2'],
                label=iris.target_names[class_id])

plt.title('Principal Component Analysis (PCA)')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.grid()
plt.show()




In [None]:
# Load the growth data from CSV and plot column x1 against column x2.
df_growth = pd.read_csv('Data/data_growth.csv')

plt.scatter(df_growth['x1'], df_growth['x2'])
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Growth data')
plt.show()

In [None]:
# Quick look at the two columns used in the plots.
for column_name in ('x1', 'x2'):
    print(df_growth[column_name])

In [None]:
# Standardise every column of the growth data with StandardScaler.
growth_scaler = StandardScaler()
df_growth_sd = growth_scaler.fit_transform(df_growth)

In [None]:
# Fit a two-component PCA on the standardised growth data and keep the
# transformed samples.
pca = PCA(n_components=2)
df_growth_pca= pca.fit_transform(df_growth_sd)

In [None]:
# Principal axes in feature space, one row per component: the directions of
# maximum variance in the data
pca.components_

In [None]:
# The singular values corresponding to the principal components
pca.singular_values_

In [None]:
# Fraction of the total variance explained by each principal component
pca.explained_variance_ratio_

In [None]:
# Growth data in its original coordinates (before PCA).
plt.scatter(df_growth['x1'], df_growth['x2'])
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Growth data')
plt.show()

In [None]:
# Growth data expressed in principal-component coordinates (after PCA).
pc1_scores = df_growth_pca[:, 0]
pc2_scores = df_growth_pca[:, 1]
plt.scatter(pc1_scores, pc2_scores)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Principal component analysis')
# Fixed y ticks from -1 to 1 in steps of 0.5.
plt.yticks(np.arange(-1, 1.5, 0.5))
plt.show()