# Homework Assignment 3: Bootstrap

In [1]:
import numpy as np
import pandas as pd

# Load the star dataset from CSV and preview its first rows.
df = pd.read_csv('star_dataset.csv')
df.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


The best partition from the previous work is determined by the following parameters.

In [2]:
# Number of clusters, passed to KMeans as `n_clusters`.
n_cluster = 4
# Seed passed to KMeans as `random_state`, fixing its random initialisation.
random_state = 9

In [3]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Only these four numeric features are used for clustering; the remaining
# columns of `df` are not passed to the model.
quantitative_columns = ['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)', 'Absolute magnitude(Mv)']
X = df[quantitative_columns]
# Rescale the selected features with StandardScaler before clustering.
X = StandardScaler().fit_transform(X)

# One k-means run (n_init=1) from a random initialisation, seeded by
# `random_state` so the partition can be reproduced.
# NOTE(review): `algorithm='full'` was deprecated in scikit-learn 1.1 in favour
# of 'lloyd' and is rejected by later releases — confirm against the installed
# version before re-running.
kmeans = KMeans(n_clusters=n_cluster, n_init=1, max_iter=500, init='random',
                tol=1e-4, algorithm='full', random_state=random_state)
kmeans.fit(X)
# Attach each row's cluster label to the dataframe as a new column.
df['cluster_id'] = kmeans.labels_

In [4]:
# Preview the first ten rows, which now include the `cluster_id` column.
df.head(10)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class,cluster_id
0,3068,0.0024,0.17,16.12,0,Red,M,2
1,3042,0.0005,0.1542,16.6,0,Red,M,2
2,2600,0.0003,0.102,18.7,0,Red,M,2
3,2800,0.0002,0.16,16.65,0,Red,M,2
4,1939,0.000138,0.103,20.06,0,Red,M,2
5,2840,0.00065,0.11,16.98,0,Red,M,2
6,2637,0.00073,0.127,17.22,0,Red,M,2
7,2600,0.0004,0.096,17.4,0,Red,M,2
8,2650,0.00069,0.11,17.45,0,Red,M,2
9,2700,0.00018,0.13,16.05,0,Red,M,2


### 1. Take a feature and find the 95% confidence interval for its grand mean using the bootstrap

In [5]:
def bootstrap(data, K, seed=None):
    """Return the means of ``K`` bootstrap resamples of ``data``.

    Each resample draws ``len(data)`` observations from ``data`` with
    replacement; the mean of every resample is collected.

    Parameters
    ----------
    data : array-like
        Observations to resample (converted with ``np.asarray``).
    K : int
        Number of bootstrap resamples to draw.
    seed : None, int or numpy.random.Generator, optional
        When ``None`` (the default) samples are drawn from NumPy's global
        random state, exactly as before, so ``np.random.seed`` still controls
        the result. Any other value is handed to ``np.random.default_rng``
        to make the call reproducible on its own.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the ``K`` resample means.

    Raises
    ------
    ValueError
        If ``data`` is empty (a resample of nothing has no mean).
    """
    data = np.asarray(data)
    N = len(data)
    if N == 0:
        raise ValueError('bootstrap requires at least one observation')

    # Both the `np.random` module and a Generator expose a compatible
    # `choice`, so the resampling line below works for either source.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    return np.asarray([
        data[sampler.choice(N, size=N, replace=True)].mean()
        for _ in range(K)
    ])

In [6]:
def confidence_interval(means, pivotal=True):
    """Return a 95% confidence interval from bootstrap means.

    Parameters
    ----------
    means : array-like
        Bootstrap estimates (for example the output of ``bootstrap``, or the
        difference of two such outputs).
    pivotal : bool, optional
        If true, use the normal approximation ``mean ± 1.96 * std`` of the
        bootstrap estimates. Otherwise use the 2.5th and 97.5th percentiles
        of the estimates.

    Returns
    -------
    list
        ``[left, right]`` with ``left <= right``. The bounds keep their sign,
        so an interval for a difference of means can be negative or can
        straddle zero.
    """
    means = np.asarray(means, dtype=float)
    if pivotal:
        center = means.mean()
        half_width = 1.96 * means.std()
        left, right = center - half_width, center + half_width
    else:
        left, right = np.percentile(means, [2.5, 97.5])
    # No abs() here: taking absolute values would turn e.g. [-5, 3] into
    # [3, 5] and hide whether the interval contains zero.
    return [left, right]

In [7]:
# Resample once and give the same bootstrap means to both interval methods,
# so the two intervals differ only by method and not by resampling noise.
temperature_means = bootstrap(df['Temperature (K)'], 1000)

print('mean: {:.2f}'.format(df['Temperature (K)'].mean()))
print('no-pivotal: [{:.2f}, {:.2f}]'.format(*confidence_interval(temperature_means, pivotal=False)))
print('pivotal: [{:.2f}, {:.2f}]'.format(*confidence_interval(temperature_means, pivotal=True)))

mean: 10497.46
no-pivotal: [9321.64, 11728.16]
pivotal: [9291.54, 11693.08]


### 2. Compare the within-cluster means for one of the features between two clusters using bootstrap

In [8]:
# Rows belonging to the two clusters whose mean temperatures are compared.
cluster_0 = df.loc[df['cluster_id'] == 0]
cluster_1 = df.loc[df['cluster_id'] == 1]

# Bootstrap each cluster's mean separately, then take the difference
# (cluster 0 minus cluster 1) replicate by replicate.
means_0 = bootstrap(cluster_0['Temperature (K)'], 1000)
means_1 = bootstrap(cluster_1['Temperature (K)'], 1000)
data = means_0 - means_1

# Report the percentile interval first, then the normal-approximation one.
for label, pivotal in (('no-pivotal', False), ('pivotal', True)):
    print('{}: [{:.2f}, {:.2f}]'.format(label, *confidence_interval(data, pivotal=pivotal)))

no-pivotal: [19221.00, 28277.35]
pivotal: [19222.36, 28084.46]


### 3. Take a cluster and compare the grand mean with the within-cluster mean for the feature using the bootstrap

In [9]:
# Here `cluster_0` is bound to the whole dataset, so its bootstrap means
# estimate the grand mean; `cluster_1` holds only the rows of cluster 1.
cluster_0 = df
cluster_1 = df.loc[df['cluster_id'] == 1]

# Difference of bootstrap means: whole dataset minus cluster 1.
grand_means = bootstrap(cluster_0['Temperature (K)'], 1000)
within_means = bootstrap(cluster_1['Temperature (K)'], 1000)
data = grand_means - within_means

# Report the percentile interval first, then the normal-approximation one.
for label, pivotal in (('no-pivotal', False), ('pivotal', True)):
    print('{}: [{:.2f}, {:.2f}]'.format(label, *confidence_interval(data, pivotal=pivotal)))

no-pivotal: [13537.26, 22080.68]
pivotal: [13608.97, 22216.79]
