In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

"""
STEP 0 
>> imports; def clean_census & other functions
"""

# default cleaning method until proven otherwise
def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
                >> if != False
                    > integer
                        >> returns the first {head} rows (using .head() method) 
                            > instead of enitre dataframe
        >> reset
            > default=True
                >> resets index after taking out rows
            > if set to False
                >> will not reset index
        >> set_index
            > default=False
            > if != False
                >> will set_index of new df to set_index
    output)
        >> dataframe cleaned like 2000 Census age&sex by 5-digit Zip Code (also how 2010 for same is cleaned)
    how)
        1. reads in csv , assumes it's large
        2. makes a copy for editing 
            > and potential future use
        3. locates readable column names  and non-readable names 
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        4. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th column (column where readable names were stored)
        
    '''
    # load data
    df = pd.read_csv( csv_path , low_memory=False )

    # and copy
    _df = df.copy()

    # reset column names to current 0th row values
    _df.columns = _df.iloc[0]
    # new 2000 dataframe without row where values are from
    clean_df = _df[1:]
    
    # default
    if reset==True:
        # reset index
        clean_df = clean_df.reset_index()
        
    # set_index
    if set_index:
        clean_df = clean_df.set_index(set_index)
    
    if head:
        # return first {head} rows of dataframe
        return clean_df.head(head)
    else:
        # return dataframe
        return clean_df

'''
STEP 1 
>> load data, reset; make copies/**sample
'''

def load_clean_frames(i=0,n=False):
    '''
    function) loads data
    
    input)
        >> i
            > if 0
                >> .reset_index() after deleting row contining column names
            > if 1
                >> do not .reset_index()
        >> head
            > default=False (ignore)
            > if != False
                >> must be int
                    > dataframe = dataframe.head(n)
    '''
    if i==0:
        # load with reset
        # 2011 
        twenty_eleven = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')
        # 2012
        twenty_twelve = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv')
        #2013
        twenty_thirteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv')
        # 2014
        twenty_fourteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv')
        # 2015
        twenty_fifteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv')
        #2016
        twenty_sixteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv')
        #2017
        twenty_seventeen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv')
    if i==1:
        # load without reset
        # 2011 
        twenty_eleven = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv',reset=False)
        # 2012
        twenty_twelve = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv',reset=False)
        #2013
        twenty_thirteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv',reset=False)
        # 2014
        twenty_fourteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv',reset=False)
        # 2015
        twenty_fifteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv',reset=False)
        #2016
        twenty_sixteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv',reset=False)
        #2017
        twenty_seventeen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv',reset=False)
    
    # default
    if n==False:
        # copy 
        # 2011 
        _y2k11 = twenty_eleven.copy()
        # 2012
        _y2k12 = twenty_twelve.copy()
        #2013
        _y2k13 = twenty_thirteen.copy()
        # 2014
        _y2k14 = twenty_fourteen.copy()
        # 2015
        _y2k15 = twenty_fifteen.copy()
        #2016
        _y2k16 = twenty_sixteen.copy()
        #2017
        _y2k17 = twenty_seventeen.copy()
        
    # non default, want only first n rows
    if n:
        # adjust frames to .head(n) 
        # 2011 
        _y2k11 = twenty_eleven.copy().head(n)
        # 2012
        _y2k12 = twenty_twelve.copy().head(n)
        #2013
        _y2k13 = twenty_thirteen.copy().head(n)
        # 2014
        _y2k14 = twenty_fourteen.copy().head(n)
        # 2015
        _y2k15 = twenty_fifteen.copy().head(n)
        #2016
        _y2k16 = twenty_sixteen.copy().head(n)
        #2017
        _y2k17 = twenty_seventeen.copy().head(n)
    
    # output list of copied frames
    return [_y2k11,_y2k12,_y2k13,_y2k14,_y2k15,_y2k16,_y2k17]

'''
STEP 2 
>> identify unique (mostly used in testing); 
>> convert DataFrame to numeric; convert Geography (Zip Codes) && Ids
'''

def test_non_unique(column_names):
    '''
    input) 
        >> list of column names {column_names}
            > columns to check for duplicate instances
    output)
        >> indexed list of names occouring more than once 
    '''
    # store first instance
    first_occour = []
    # store 2nd+ instance(s)
    non_unique = []
    # we're going to want index
    for i,_ in enumerate(column_names):
        # not first time
        if _ not in first_occour:
            first_occour.append(_)
        # if not first, tag&bag
        else:
            non_unique.append([i,_])
    # output index w/ non-first instances
    return non_unique


def to_numeric_but(dataframe,save_these_columns='none',e='coerce'):
    '''
    split into 2 df and rejoin after convert to int
    
    inputs:
        >> save_these_columns=number of columns to save
            > currently must include one end of df 
                >> might could run function multiple times to edit slices
                >> single number, not range (yet)
                    > if 'none', saves no columns
        >> dataframe
            > dataframe to shif to numeric (but)
        >> e
            > for pd.to_numeric, errors=e
    output:
        >> concatted pd.DataFrame of 
            > og columns you chose to save
            > columns converted to numeric
    '''
    # copy df for editing
    k = dataframe.copy()
    
    # split
    if save_these_columns != 'none':
        # columns to save
        save_k = k[k.columns[:save_these_columns]]
        # columns to edit
        switch_k = k[k.columns[save_these_columns:]]
    # don't split
    else:
        # k as is
        switch_k = k

    # edited columns  # coerce , ignore , raise
    swapped_k = switch_k.apply(pd.to_numeric, errors=e)
    
    # check saving columns
    if save_these_columns != 'none':
        # new (edited) dataframe (ogsave|swapped)
        new_k = pd.concat( [save_k,swapped_k] ,axis=1 )
    else:
        new_k = swapped_k

    return new_k


def geography_to_zipcode_ids_to_numeric(dataframe):
    '''
    convert 
        >> .Geography values 
            > like 'ZCTA5 00601' 
            > to int(00601)
        >> .Id values
            > like '8600000US00601' 
            > to int(860000000601)
        >> .Id2 values
            > like '00601'
            > to int(00601)
    '''
    # copy
    df = dataframe.copy()
    
    # set old Geography
    geo = df.Geography
    # set old Id
    _id = df.Id
    # set old Id2
    __id2 = df.Id2
    
    # make new 'Geography' values
    new_geos = [int(i[-5:]) for i in geo]
    # new 'Id' values
    new_id = [int(''.join(i.split('US'))) for i in _id]
    # new .Id2 instances
    new__id2 = [int(d) for d in __id2]
    
    # convert dataframe
    new_df = df.copy()
    new_df.Geography = new_geos
    new_df.Id = new_id
    new_df.Id2 = new__id2
    
    # return new df
    return new_df

'''
STEP 3
>> run KMeans on dataframe
'''

def kmeans_by(dataframe,n_clusters=10,converted=False):
    '''
    inputs:
        >> dataframe
            > dataframe to be edited
        >> n_clusters 
            > default = 10
            > number of clusters for KMeans
        >> converted
            > default = False
            > assumes data is not ready for KMeans 
                >> if True, assumes df is ready for KMeans
    output:
        > pd.Dataframe of 
    '''
    # copy data 
    d = dataframe.copy()  
    
    '''df conversion'''
    # default
    if converted!=True:
        # copy data for editing
        _data_ = d.copy()
        
        # convert first 3 columns ('Id', 'Id2', 'Geography')
        _data = geography_to_zipcode_ids_to_numeric(dataframe=_data_)
        
        # convert remainder of dataframe
        data = to_numeric_but(save_these_columns='none', dataframe=_data)
        print(len(data),len(data.columns))

    # dataframe has already been converted / otherwise
    if converted==True:
        data = d
    
    '''KMeans'''
    # fill NaN values
    t = data.copy().fillna(0)
    
    # Convert DataFrame to matrix
    mat = t.values
    
    # Using sklearn
    km = KMeans(n_clusters)
    # fit our values
    km.fit(mat)
    
    # Get cluster assignment labels
    labels = km.labels_
    
    # Format results as a DataFrame
    results = pd.DataFrame([t.index,labels])

    # display results
    return results

In [1]:
import random
import numpy as np
from scipy.spatial.distance import euclidean
from collections import defaultdict
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from itertools import combinations

In [2]:
def k_means(X, k=5, max_iter=1000):
    """Performs k means
    Args:
    - X - feature matrix
    - k - number of clusters
    - max_iter - maximum iterations
    Returns:
    - clusters - dict mapping cluster centers to observations
    """
    centers = [tuple(pt) for pt in random.sample(list(X), k)]
    for i in range(max_iter):
        clusters = defaultdict(list)

        for datapoint in X:
            distances = [euclidean(datapoint, center) for center in centers]
            center = centers[np.argmin(distances)]
            clusters[center].append(datapoint)

        new_centers = []
        for center, pts in clusters.items():
            new_center = np.mean(pts, axis=0)
            new_centers.append(tuple(new_center))

        if set(new_centers) == set(centers):
            break

        centers = new_centers

    return clusters


def sse(clusters):
    """Sum squared euclidean distance of all points to their cluster center"""
    sum_squared_residuals = 0
    for center, pts in clusters.items():
        for pt in pts:
            sum_squared_residuals += euclidean(pt, center)**2
    return sum_squared_residuals


def plot_k_sse(X, min_k, max_k):
    """Plots sse for values of k between min_k and max_k
    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k to plot sse for
    """
    k_values = range(min_k, max_k+1)
    sse_values = []
    for k in k_values:
        clusters = k_means(X, k=k)
        # sum squared euclidean; i>>c
        sse_values.append(sse(clusters))
    plt.plot(k_values, sse_values)
    plt.xlabel('k')
    plt.ylabel('sum squared error')
    plt.show()


def turn_clusters_into_labels(clusters):
    """Converts clusters dict returned by k_means into X, y (labels)
    Args:
    - clusters - dict mapping cluster centers to observations
    """
    labels = []
    new_X = []
    label = 0
    for cluster, pts in clusters.items():
        for pt in pts:
            new_X.append(pt)
            labels.append(label)
        label += 1
    return np.array(new_X), np.array(labels)


def plot_k_silhouette(X, min_k, max_k):
    """Plots sse for values of k between min_k and max_k
    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k to plot sse for
    """
    k_values = range(min_k, max_k+1)
    silhouette_scores = []
    for k in k_values:
        clusters = k_means(X, k=k)
        new_X, labels = turn_clusters_into_labels(clusters)
        silhouette_scores.append(silhouette_score(new_X, labels))

    plt.plot(k_values, silhouette_scores)
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.show()


def plot_all_2d(X, feature_names, k=3):
    """Generates all possible 2d plots of observations color coded by cluster ID"""
    pairs = list(combinations(range(X.shape[1]), 2))
    fig, axes = plt.subplots((len(pairs) // 2), 2)
    flattened_axes = [ax for ls in axes for ax in ls]

    for pair, ax in zip(pairs, flattened_axes):
        pair = np.array(pair)
        plot_data_2d(X[:, pair], feature_names[pair], ax, k=k)
    plt.show()


def plot_data_2d(X, plot_labels, ax, k=3):
    """Generates single 2d plot of observations color coded by cluster ID"""
    clusters = k_means(X, k=k)
    new_X, labels = turn_clusters_into_labels(clusters)
    ax.scatter(new_X[:, 0], new_X[:, 1], c=labels)
    ax.set_xlabel(plot_labels[0])
    ax.set_ylabel(plot_labels[1])


# if __name__ == '__main__':
#     iris = datasets.load_iris()
#     X = iris.data
#     plot_k_sse(X, 2, 10)
#     plot_k_silhouette(X, 2, 10)
#     plot_all_2d(X, np.array(iris.feature_names), k=5)
#     plt.close()

In [None]:
# load first 10,000 rows w/o reset
f = load_clean_frames( i=1  )

In [None]:
for h in f:
    print(h.info(),'\n')

In [None]:
p = 33120 /1000
p,int(p)

In [None]:
# store out
out = []

for i in range(len(f)):
    z = kmeans_by( dataframe=f[i] , n_clusters=int(p) )
    # df, zipcode instance, cluster
    out.append(z)

In [None]:
# 2011 
x = out[0]

In [None]:
x

In [None]:
x.items()

In [None]:
def sse(clusters):
    """Sum squared euclidean distance of all points to their cluster center"""
    sum_squared_residuals = 0
    for center, pts in clusters.items():
        for pt in pts:
            sum_squared_residuals += euclidean(pt, center)**2
    return sum_squared_residuals

In [None]:
sse(x)

In [None]:
# all
for i in out:
    print(sse(i))
    
# n=10000 , 10 clusters
# 330034929.0
# 329073883.0
# 330082783.0
# 327515094.0
# 330612055.0
# 331199375.0
# 327962433.0

In [None]:
# load w/o reset
f = load_clean_frames(1)

In [None]:
# copy for safeguard and hedge reload
frames = f.copy()

In [None]:
# extract copy of 2011 
y2k11 = frames.copy()  #[0]

In [None]:
# examine (2011)
y2k11.info()

In [None]:
nuy11 = test_non_unique(y2k11)
len(nuy11)

In [None]:
# convert all but first 3 columns to numeric
data = y2k11.copy()
k2011 = to_numeric_but(save_these_columns=3,dataframe=data)

In [None]:
k2011.info()

In [None]:
nonuni = test_non_unique(k2011)
len(nonuni)

In [None]:
nuy11[:2] , nonuni[:2]

In [None]:
len(set(k2011.columns[:10]))

In [None]:
for i in k2011.columns:
    if i not in y2k11.columns:
        print(i)

In [None]:
# now convert the first 3 columns
adjust_first_3 = k2011.copy()
_2011df_ = geography_to_zipcode_ids_to_numeric(adjust_first_3)

In [None]:
# rand_test = _2011df_.copy()
# rand_test['Margin of Error; SEX AND AGE - Total population'].apply(lambda x : 0 if x =='*****' else x)

In [None]:
_2011df_.info()

In [None]:
# _2011df_ = _2011df_.dropna()  # , how='any'

In [None]:
# no change
_2011df_.info()

In [None]:
# _2011df_.head(15)
# _2011df_ = _2011df_.apply(pd.to_numeric, errors='coerce')

In [None]:
# _2011df_['Margin of Error; SEX AND AGE - Total population'].apply(lambda x : 0 for x in i if x =='*****' else x)

In [None]:
x=-1
x_out=set()
star_out=set()
other_out=set()
for i in _2011df_.sample(100,axis=0).values:
    x+=1
    y=0
    for j in i:
        y+=1
        if j == '(X)':
            x_out.add((x-1,y))
        if j == '*****':
            star_out.add((x-1,y))
        if j == '**':
            star_out.add((x-1,y))

In [None]:
x_out

In [None]:
star_out

In [None]:
other_out

In [None]:
collectj = []
for i,j in x_out: 
    collectj.append(j)

In [None]:
collecti = []
for j,i in star_out: 
    collecti.append(i)


In [None]:
count_out=[]
for n in set(collectj):
    q = collectj.count(n)
    count_out.append((n,q))

In [None]:
count_in=[]
for m in set(collecti):
    p = collectj.count(m)
    count_in.append((m,p))

In [None]:
c = pd.DataFrame(data=count_out,columns=['column','count'])
c

In [None]:
r = pd.DataFrame(data=count_in,columns=['row','count'])
r


In [None]:
test = _2011df_.copy()

In [None]:
cols_to_drop = [_2011df_.columns[q-1] for q,b in count_out]
stars = [_2011df_.columns[q-1] for q,b in count_out]

In [None]:
len(cols_to_drop), len(stars)

In [None]:
# convert '*****' to 0
for i in stars: 
    # test.drop(stars,axis=1)
    test[i].apply(lambda x : 0 if x == '*****' else x)
# drop columns with '(X)'
# for cols in 
test = test.drop(cols_to_drop,axis=1)

In [None]:
test.info()  #['Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units']

In [None]:
# # ivalues = [i for i in test.values]
# # # [i for i in ivalues if i == '**']
# # for i in range(len(test.values)):
# #     if test.values[i].any() == '**':
# #         print(i,test.values[i])
# # sus=set()
# # for i in test:
# #     for x in test[i]:
# #         if x == '**':
# #             sus.add(i)

# for i in test.columns:
#     test[i].apply(lambda q: 0 if x == '**' else x)

- ***notes***:
    - that wasn't too hard
- ***actions***:
    - drop these columns

In [None]:
t = test.copy().fillna(0)
# Convert DataFrame to matrix
mat = t.values
# Using sklearn
km = KMeans(n_clusters=5)
# fit our values
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame([t.index,labels]).T

In [None]:
results

In [None]:
# copy df for editing
k11 = y2k11.copy()

'''split into 2 df and rejoin after convert to int'''
# df to save
save_k11 = k11.copy()
# columns to save
save_k11 = save_k11.copy()[save_k11.columns[:3]]

# df to edit
switch_k11 = k11.copy()
# columns to edit
switch_k11 = switch_k11.copy()[switch_k11.columns[3:]]

# edited columns
swapped_k11 = switch_k11.copy().apply(pd.to_numeric, errors='ignore')

# new (edited) dataframe
new_k11 = pd.concat([save_k11,swapped_k11],axis=1)

len(new_k11.columns)

In [None]:
# # pull column names
# k11_cols = y2k11.copy().columns[4:]
# for c in range(len(k11_cols)):
#     a = pd.Series(k11[k11_cols])
# #     k11.loc[[k11_cols][c]] = pd.to_numeric(a,errors='ignore')
# # pd.Series(k11[k11_cols[0]])

In [None]:
k11_cols = y2k11.copy().columns
__k11_cols__ = y2k11.copy().columns[4:]
# len(k11_cols),len(__k11_cols__)
print(f'{__k11_cols__[:3]}\n{k11_cols[:3]}')

In [None]:
for frame in frames:
    print(frame.info(),'\n')

In [None]:
copies = frames[:2].copy()
for df in copies:
    for column in df.columns: 
        df[column] = pd.to_numeric([df[column]], errors='ignore')

- ***NOTE***:
    - path to mvp
        - whiteboard_pics/acs_5yr_11-17_path-to-mvp.jpg

In [None]:
dataset_array = frames[0].values
dataset_array

In [None]:
'''
STEP 2 >> find all columns which coexist across all dataframes at current position
'''
columns_by_frame = [frame.columns for frame in frames]
count_columns_by_frame = [len(frame) for frame in columns_by_frame]

In [None]:
out = []
# for range of df with most columns
for count in range(len(max(count_columns_by_frame))):
    # if index of every frame is same as index of frame with most columns
    if [frame for frame in columns_by_frame][count] == frames[6][count]:
        out.append(count)
out

In [None]:
# collect all
years = frames  # [y2k11,y2k12,y2k13,y2k14,y2k15,y2k16,y2k17]
for year in years:
    print(len(year.columns),'\n',year.info(),'\n\n')

In [None]:
# X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])

# kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# kmeans.labels_

# # array([1, 1, 1, 0, 0, 0], dtype=int32)
# kmeans.predict([[0, 0], [12, 3]])
# # array([1, 0], dtype=int32)
# kmeans.cluster_centers_
# # array([[10.,  2.], [ 1.,  2.]])

In [None]:
# # Scikit learn plays really well with Pandas, so I suggest you use it. Here's an example:

# # In [1]: 
# # import pandas as pd
# # import numpy as np
# # from sklearn.cross_validation import train_test_split
# data = np.reshape(np.random.randn(20),(10,2)) # 10 training examples
# labels = np.random.randint(2, size=10) # 10 labels

# # In [2]: 
# X = pd.DataFrame(data)
# y = pd.Series(labels)

# # In [3]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, 
#                                                     random_state=0)

# # In [4]: X_test
# # Out[4]:

# #      0       1
# # 2   -1.39   -1.86
# # 8    0.48   -0.81
# # 4   -0.10   -1.83

# # In [5]: y_test
# # Out[5]:

# # 2    1
# # 8    1
# # 4    1

In [None]:
pizfsdazapizzaadsf = (2,1,0,4,32,7,2,9,5)
max(pizfsdazapizzaadsf)

In [None]:
x=[3,2,1,4,6,5]
print(x)

In [None]:
# copy data for editing
data = frames[1].copy()
len(data.columns)

In [None]:
# _2011df_ = geography_to_zipcode_ids_to_numeric(adjust_first_3)
non_copy = geography_to_zipcode_ids_to_numeric(data)
# len(_2011df_.columns)
len(non_copy.columns)

In [None]:
# convert all but first 3 columns to numeric
k2012 = to_numeric_but(save_these_columns=3,dataframe=data)
# len(k2012.columns)
# k2012 = to_numeric_but(save_these_columns=3,dataframe=data)
len(k2012.columns), len(non_copy.apply(pd.to_numeric,errors='coerce').columns)
# (407, 327)

- ***notes***:
    - above indicates 
        - may be better to convert first 3 columns first
        - then wouldn't have to save
- ***actions***:
    - reeval to_numeric_but
    - get this whole 407 columns thing back to 327 
    
```
# reeval to_numeric_but
f = load_clean_frames(i=1,n=1000)
# 2012
twelve = f[1].copy()
print(len(twelve.columns))
t = geography_to_zipcode_ids_to_numeric(twelve)
print(len(t.columns))
x = to_numeric_but(save_these_columns='none',dataframe=t)
print(len(x.columns))
>>327
>>327
>>327
```
- ***conclusion***:
    - swapped order in to_numeric_but
    - defaulted save_these_columns='none' argument (in to_numeric_but)

In [None]:
import random
import numpy as np
from scipy.spatial.distance import euclidean
from collections import defaultdict
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from itertools import combinations

def k_means(X, k=5, max_iter=1000):
    """Performs k means
    Args:
    - X - feature matrix
    - k - number of clusters
    - max_iter - maximum iterations
    Returns:
    - clusters - dict mapping cluster centers to observations
    """
    centers = [tuple(pt) for pt in random.sample(list(X), k)]
    for i in range(max_iter):
        clusters = defaultdict(list)

        for datapoint in X:
            distances = [euclidean(datapoint, center) for center in centers]
            center = centers[np.argmin(distances)]
            clusters[center].append(datapoint)

        new_centers = []
        for center, pts in clusters.items():
            new_center = np.mean(pts, axis=0)
            new_centers.append(tuple(new_center))

        if set(new_centers) == set(centers):
            break

        centers = new_centers

    return clusters


def sse(clusters):
    """Sum squared euclidean distance of all points to their cluster center"""
    sum_squared_residuals = 0
    for center, pts in clusters.items():
        for pt in pts:
            sum_squared_residuals += euclidean(pt, center)**2
    return sum_squared_residuals


def plot_k_sse(X, min_k, max_k):
    """Plots sse for values of k between min_k and max_k
    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k to plot sse for
    """
    k_values = range(min_k, max_k+1)
    sse_values = []
    for k in k_values:
        clusters = k_means(X, k=k)
        # sum squared euclidean; i>>c
        sse_values.append(sse(clusters))
    plt.plot(k_values, sse_values)
    plt.xlabel('k')
    plt.ylabel('sum squared error')
    plt.show()


def turn_clusters_into_labels(clusters):
    """Converts clusters dict returned by k_means into X, y (labels)
    Args:
    - clusters - dict mapping cluster centers to observations
    """
    labels = []
    new_X = []
    label = 0
    for cluster, pts in clusters.items():
        for pt in pts:
            new_X.append(pt)
            labels.append(label)
        label += 1
    return np.array(new_X), np.array(labels)


def plot_k_silhouette(X, min_k, max_k):
    """Plots sse for values of k between min_k and max_k
    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k to plot sse for
    """
    k_values = range(min_k, max_k+1)
    silhouette_scores = []
    for k in k_values:
        clusters = k_means(X, k=k)
        new_X, labels = turn_clusters_into_labels(clusters)
        silhouette_scores.append(silhouette_score(new_X, labels))

    plt.plot(k_values, silhouette_scores)
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.show()


def plot_all_2d(X, feature_names, k=3):
    """Generates all possible 2d plots of observations color coded by cluster ID"""
    pairs = list(combinations(range(X.shape[1]), 2))
    fig, axes = plt.subplots((len(pairs) // 2), 2)
    flattened_axes = [ax for ls in axes for ax in ls]

    for pair, ax in zip(pairs, flattened_axes):
        pair = np.array(pair)
        plot_data_2d(X[:, pair], feature_names[pair], ax, k=k)
    plt.show()


def plot_data_2d(X, plot_labels, ax, k=3):
    """Generates single 2d plot of observations color coded by cluster ID"""
    clusters = k_means(X, k=k)
    new_X, labels = turn_clusters_into_labels(clusters)
    ax.scatter(new_X[:, 0], new_X[:, 1], c=labels)
    ax.set_xlabel(plot_labels[0])
    ax.set_ylabel(plot_labels[1])


if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data
    plot_k_sse(X, 2, 10)
    plot_k_silhouette(X, 2, 10)
    plot_all_2d(X, np.array(iris.feature_names), k=5)
    plt.close()




In [None]:
iris = datasets.load_iris()
iris.data