In [None]:
import random
from collections import defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# from sklearn.model_selection import train_test_split

In [None]:
"""
STEP 0 >> imports; def clean_census & other functions
"""

# default cleaning method until proven otherwise
def clean_census_frame(csv_path, head=False, reset=True, set_index=False):
    """Load a Census CSV and promote its first data row to the column header.

    The raw file is expected to carry machine codes (e.g. ``HC01_VC03``) on
    the CSV header line and the readable names (e.g.
    ``Estimate; SEX AND AGE - Total population``) in the first data row.
    The readable names replace the codes as column labels, and the row they
    came from is removed from the data.

    Args:
        csv_path: Path to the CSV file; it is read with ``low_memory=False``.
        head: If truthy, only the first ``head`` rows are returned (via
            ``DataFrame.head``) instead of the entire frame.
        reset: If equal to ``True`` (the default), ``reset_index()`` is
            called after the name row is removed.  ``drop=True`` is not
            passed, so the previous index is kept as a leading ``index``
            column.
        set_index: If truthy, forwarded to ``DataFrame.set_index`` to choose
            the index of the returned frame.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    frame = pd.read_csv(csv_path, low_memory=False)

    # Row 0 holds the readable names: use it as the header, then discard it.
    frame.columns = frame.iloc[0]
    cleaned = frame.iloc[1:]

    # Kept as an equality test so that only values equal to True reset the index.
    if reset == True:  # noqa: E712
        cleaned = cleaned.reset_index()

    if set_index:
        cleaned = cleaned.set_index(set_index)

    return cleaned.head(head) if head else cleaned

'''
STEP 1 >> load data, reset; make copies
'''
def load_copy_data(i, year=11):
    """Load one ACS 5-year DP05 extract and return a copy of the cleaned frame.

    Args:
        i: Index-handling mode.
            0 -- ``reset_index()`` is applied after the row containing the
                 column names has been removed.
            1 -- the index is left as it is.
        year: Two-digit survey year inserted into the file name
            ``ACS_<year>_5YR_DP05_with_ann.csv``.  Defaults to 11 (2011).
            NOTE(review): extracts for 12 through 17 follow the same naming
            pattern; confirm the files exist before requesting them.

    Returns:
        A copy of the frame produced by ``clean_census_frame``.

    Raises:
        ValueError: If ``i`` is neither 0 nor 1.
    """
    # Validate first: an unknown mode used to surface as an UnboundLocalError.
    if i not in (0, 1):
        raise ValueError(
            'i must be 0 (reset index) or 1 (keep index), got {!r}'.format(i)
        )

    csv_path = (
        '../../data/American_Community_Survey/aff_download/'
        'ACS_{}_5YR_DP05_with_ann.csv'.format(year)
    )

    # Mode 0 resets the index, mode 1 keeps the one left after cleaning.
    frame = clean_census_frame(csv_path, reset=(i == 0))

    # Hand back a copy so callers can edit it without touching the loaded frame.
    return frame.copy()


def test_non_unique(column_names):
    '''
    input) 
        >> list of column names {column_names}
            > columns to check for duplicate instances
    output)
        >> indexed list of names occouring more than once 
    '''
    # store first instance
    first_occour = []
    # store 2nd+ instance(s)
    non_unique = []
    # we're going to want index
    for i,_ in enumerate(column_names):
        # not first time
        if _ not in first_occour:
            first_occour.append(_)
        # if not first, tag&bag
        else:
            non_unique.append([i,_])
    # output index w/ non-first instances
    return non_unique


def to_numeric_but(save_these_columns, dataframe):
    """Convert every column to numeric except a block at one end of the frame.

    The frame is split by position at ``save_these_columns``: the columns
    before the split are kept untouched, the columns from the split onward
    are passed through ``pd.to_numeric`` with ``errors='coerce'`` (values
    that cannot be parsed become NaN), and the two parts are joined again.

    Args:
        save_these_columns: Single integer split position (not a range).
            Columns ``[:save_these_columns]`` are kept as they are.
        dataframe: Frame to convert; it is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the kept columns followed by the
        converted columns, in the original order.
    """
    # Positional slicing: label-based selection would duplicate columns
    # whenever the frame carries repeated column names.
    kept = dataframe.iloc[:, :save_these_columns]
    to_convert = dataframe.iloc[:, save_these_columns:]

    converted = to_convert.apply(pd.to_numeric, errors='coerce')

    # Join the converted columns (not the untouched slice) back on.
    return pd.concat([kept, converted], axis=1)


def geography_to_zipcode_ids_to_numeric(dataframe):
    """Return a copy of ``dataframe`` with its three id columns cast to int.

    Conversions:
        Geography: ``'ZCTA5 00601'``    -> ``601`` (int of the last five characters)
        Id:        ``'8600000US00601'`` -> ``860000000601`` (``'US'`` removed)
        Id2:       ``'00601'``          -> ``601``

    Args:
        dataframe: Frame with string columns ``Geography``, ``Id`` and
            ``Id2``; it is not modified.

    Returns:
        A new ``pandas.DataFrame`` with those three columns replaced by
        integers and every other column unchanged.
    """
    zip_codes = [int(text[-5:]) for text in dataframe.Geography]
    long_ids = [int(text.replace('US', '')) for text in dataframe.Id]
    short_ids = [int(text) for text in dataframe.Id2]

    # assign() works on a copy and overwrites the columns in place.
    return dataframe.assign(Geography=zip_codes, Id=long_ids, Id2=short_ids)

In [None]:
def k_means(X, k=5, max_iter=1000):
    """Cluster the rows of ``X`` with a plain k-means loop.

    Initial centers are ``k`` rows drawn from ``X`` with ``random.sample``
    (so the module-level ``random`` state decides the start).  Each pass
    assigns every row to its nearest center by Euclidean distance and then
    replaces the centers with the mean of their assigned rows.  The loop
    stops when the set of centers no longer changes or after ``max_iter``
    passes.

    Args:
        X: Feature matrix; iterated row by row.
        k: Number of initial centers to draw.
        max_iter: Maximum number of assignment passes; must be at least 1.

    Returns:
        A ``defaultdict`` mapping each center (as a tuple) used in the last
        assignment pass to the list of rows assigned to it.  A center that
        attracts no rows gets no entry, so fewer than ``k`` clusters may be
        returned.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1, or (from
            ``random.sample``) if ``k`` exceeds the number of rows.
    """
    # Without at least one pass there is no assignment to return.
    if max_iter < 1:
        raise ValueError(
            'max_iter must be at least 1, got {!r}'.format(max_iter)
        )

    # Tuples are hashable, so centers can serve as dict keys.
    centers = [tuple(pt) for pt in random.sample(list(X), k)]
    for _ in range(max_iter):
        clusters = defaultdict(list)

        # assignment step: nearest center wins
        for datapoint in X:
            distances = [euclidean(datapoint, center) for center in centers]
            nearest = centers[np.argmin(distances)]
            clusters[nearest].append(datapoint)

        # update step: each center moves to the mean of its rows
        new_centers = [tuple(np.mean(pts, axis=0)) for pts in clusters.values()]

        if set(new_centers) == set(centers):
            break

        centers = new_centers

    return clusters


def sse(clusters):
    """Return the total squared Euclidean distance of points to their center.

    Args:
        clusters: Dict mapping each cluster center to the points assigned
            to it.

    Returns:
        The sum, over every point, of its squared distance to the center
        that keys its cluster (0 for an empty dict).
    """
    return sum(
        euclidean(pt, center) ** 2
        for center, pts in clusters.items()
        for pt in pts
    )


def plot_k_sse(X, min_k, max_k):
    """Plot the sum of squared errors for every k from min_k to max_k.

    k-means is run once per value of k, the resulting clusters are scored
    with ``sse``, and the scores are drawn against k and shown.

    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k (both inclusive) to plot sse for
    """
    k_values = range(min_k, max_k + 1)
    sse_values = [sse(k_means(X, k=k)) for k in k_values]

    plt.plot(k_values, sse_values)
    plt.xlabel('k')
    plt.ylabel('sum squared error')
    plt.show()


def turn_clusters_into_labels(clusters):
    """Flatten a clusters dict into an observation array and a label array.

    Clusters are numbered 0, 1, 2, ... in the dict's iteration order and
    every observation receives the number of the cluster it sits in.

    Args:
    - clusters - dict mapping cluster centers to observations

    Returns:
    - (new_X, labels) - numpy arrays of the observations and their labels,
      aligned element by element
    """
    labelled = [
        (pt, label)
        for label, pts in enumerate(clusters.values())
        for pt in pts
    ]
    new_X = [pt for pt, _ in labelled]
    labels = [label for _, label in labelled]
    return np.array(new_X), np.array(labels)


def plot_k_silhouette(X, min_k, max_k):
    """Plot the silhouette score for every k from min_k to max_k.

    k-means is run once per value of k, the clusters are converted to an
    (observations, labels) pair and scored with ``silhouette_score``; the
    scores are drawn against k and shown.

    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k (both inclusive) to score
    """
    k_values = range(min_k, max_k + 1)
    silhouette_scores = [
        silhouette_score(*turn_clusters_into_labels(k_means(X, k=k)))
        for k in k_values
    ]

    plt.plot(k_values, silhouette_scores)
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.show()


def plot_all_2d(X, feature_names, k=3):
    """Draw one 2d scatter per pair of features, colored by cluster ID.

    Every combination of two columns of ``X`` gets its own subplot in a
    two-column grid; each subplot is clustered and drawn by
    ``plot_data_2d``.

    Args:
    - X - feature matrix with at least two columns
    - feature_names - one name per column of X (list or array)
    - k - number of clusters passed on to plot_data_2d

    Raises:
    - ValueError - if X has fewer than two columns, so no pair exists
    """
    # Accept plain lists too: fancy indexing below needs an array.
    feature_names = np.asarray(feature_names)

    pairs = list(combinations(range(X.shape[1]), 2))
    if not pairs:
        raise ValueError('X needs at least two feature columns to plot pairs')

    # Round up so an odd number of pairs still gets a slot for the last one.
    n_rows = (len(pairs) + 1) // 2
    # squeeze=False keeps axes 2-D even when there is a single row.
    _, axes = plt.subplots(n_rows, 2, squeeze=False)
    flattened_axes = axes.ravel()

    for pair, ax in zip(pairs, flattened_axes):
        columns = list(pair)
        plot_data_2d(X[:, columns], feature_names[columns], ax, k=k)

    # Hide the spare slot left over when the number of pairs is odd.
    for spare_ax in flattened_axes[len(pairs):]:
        spare_ax.set_visible(False)

    plt.show()


def plot_data_2d(X, plot_labels, ax, k=3):
    """Cluster a two-column matrix and scatter it on ``ax``, colored by cluster.

    Args:
    - X - feature matrix; columns 0 and 1 become the x and y coordinates
    - plot_labels - axis labels; item 0 for x, item 1 for y
    - ax - axes object to draw on
    - k - number of clusters for k_means
    """
    points, cluster_ids = turn_clusters_into_labels(k_means(X, k=k))

    xs = points[:, 0]
    ys = points[:, 1]
    ax.scatter(xs, ys, c=cluster_ids)

    ax.set_xlabel(plot_labels[0])
    ax.set_ylabel(plot_labels[1])


# if __name__ == '__main__':
#     iris = datasets.load_iris()
#     X = iris.data
#     plot_k_sse(X, 2, 10)
#     plot_k_silhouette(X, 2, 10)
#     plot_all_2d(X, np.array(iris.feature_names), k=5)
#     plt.close()

In [None]:
# load w/o reset
f = load_copy_data(1)
# copy for safeguard and hedge reload
frames = f.copy()
# extract copy of 2011
y2k11 = frames.copy()
# examine (2011)
y2k11.info()
# iterating a DataFrame yields its column labels, so this lists repeated names
nuy11 = test_non_unique(y2k11)
len(nuy11)
# convert all but first 3 columns to numeric
data = y2k11.copy()
k2011 = to_numeric_but(save_these_columns=3, dataframe=data)
k2011.info()
nonuni = test_non_unique(k2011)
len(nonuni)
# now convert the first 3 columns
adjust_first_3 = k2011.copy()
_2011df_ = geography_to_zipcode_ids_to_numeric(adjust_first_3)
# drop every column that contains at least one NaN
_2011df_ = _2011df_.dropna(axis=1)  # , how='any'

# Convert DataFrame to matrix
mat = _2011df_.values
# Using sklearn
km = KMeans(n_clusters=5)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame: one row per observation, pairing the index
# of the frame that was clustered with its assigned label
results = pd.DataFrame([_2011df_.index, labels]).T