In [12]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

"""
STEP 0 >> imports; def clean_census & other functions
"""

# default cleaning method until proven otherwise
def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
                >> if != False
                    > integer
                        >> returns the first {head} rows (using .head() method) 
                            > instead of enitre dataframe
        >> reset
            > default=True
                >> resets index after taking out rows
            > if set to False
                >> will not reset index
        >> set_index
            > default=False
            > if != False
                >> will set_index of new df to set_index
    output)
        >> dataframe cleaned like 2000 Census age&sex by 5-digit Zip Code (also how 2010 for same is cleaned)
    how)
        1. reads in csv , assumes it's large
        2. makes a copy for editing 
            > and potential future use
        3. locates readable column names  and non-readable names 
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        4. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th column (column where readable names were stored)
        
    '''
    # load data
    df = pd.read_csv( csv_path , low_memory=False )

    # and copy
    _df = df.copy()

    # reset column names to current 0th row values
    _df.columns = _df.iloc[0]
    # new 2000 dataframe without row where values are from
    clean_df = _df[1:]
    
    # default
    if reset==True:
        # reset index
        clean_df = clean_df.reset_index()
        
    # set_index
    if set_index:
        clean_df = clean_df.set_index(set_index)
    
    if head:
        # return first {head} rows of dataframe
        return clean_df.head(head)
    else:
        # return dataframe
        return clean_df

'''
STEP 1 >> load data, reset; make copies
'''
def load_copy_data(i):
    '''
    loads data
    
    input)
        >> i
            > if 0
                >> .reset_index() after deleting row contining column names
            > if 1
                >> do not .reset_index()
                
    '''
    if i==0:
        # load with reset
        # 2011 
        twenty_eleven = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')
        # 2012
        # twenty_twelve = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv')
        #2013
        # twenty_thirteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv')
        # 2014
        # twenty_fourteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv')
        # 2015
        # twenty_fifteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv')
        #2016
        # twenty_sixteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv')
        #2017
        # twenty_seventeen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv')
    if i==1:
        # load without reset
        # 2011 
        twenty_eleven = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv',reset=False)
        # 2012
        # twenty_twelve = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv',reset=False)
        #2013
        # twenty_thirteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv',reset=False)
        # 2014
        # twenty_fourteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv',reset=False)
        # 2015
        # twenty_fifteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv',reset=False)
        #2016
        # twenty_sixteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv',reset=False)
        #2017
        # twenty_seventeen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv',reset=False)
    
    # copy 
    # 2011 
    _y2k11 = twenty_eleven.copy()
    # 2012
    # _y2k12 = twenty_twelve.copy()
    #2013
    # _y2k13 = twenty_thirteen.copy()
    # 2014
    # _y2k14 = twenty_fourteen.copy()
    # 2015
    # _y2k15 = twenty_fifteen.copy()
    #2016
    # _y2k16 = twenty_sixteen.copy()
    #2017
    # _y2k17 = twenty_seventeen.copy()
    
    # output list of copied frames
    return _y2k11  # [_y2k11,_y2k12,_y2k13,_y2k14,_y2k15,_y2k16,_y2k17]


def test_non_unique(column_names):
    '''
    input) 
        >> list of column names {column_names}
            > columns to check for duplicate instances
    output)
        >> indexed list of names occouring more than once 
    '''
    # store first instance
    first_occour = []
    # store 2nd+ instance(s)
    non_unique = []
    # we're going to want index
    for i,_ in enumerate(column_names):
        # not first time
        if _ not in first_occour:
            first_occour.append(_)
        # if not first, tag&bag
        else:
            non_unique.append([i,_])
    # output index w/ non-first instances
    return non_unique


def to_numeric_but(save_these_columns,dataframe):
    '''
    split into 2 df and rejoin after convert to int
    
    inputs:
        >> save_these_columns=number of columns to save
            > currently must include one end of df 
                >> might could run function multiple times to edit slices
                >> single number, not range (yet)
        >> dataframe
            > dataframe to shif to numeric (but)
    output:
        >> concatted pd.DataFrame of 
            > og columns you chose to save
            > columns converted to numeric
    '''
    # copy df for editing
    k = dataframe.copy()

    # columns to save
    save_k = k[k.columns[:save_these_columns]]
    # columns to edit
    switch_k = k[k.columns[save_these_columns:]]

    # edited columns  # coerce , ignore , raise
    swapped_k = switch_k.apply(pd.to_numeric, errors='coerce')

    # new (edited) dataframe
    new_k = pd.concat([save_k,switch_k],axis=1)

    return new_k


def geography_to_zipcode_ids_to_numeric(dataframe):
    '''
    convert 
        >> .Geography values 
            > like 'ZCTA5 00601' 
            > to int(00601)
        >> .Id values
            > like '8600000US00601' 
            > to int(860000000601)
        >> .Id2 values
            > like '00601'
            > to int(00601)
    '''
    # copy
    df = dataframe.copy()
    
    # set old Geography
    geo = df.Geography
    # set old Id
    _id = df.Id
    # set old Id2
    __id2 = df.Id2
    
    # make new 'Geography' values
    new_geos = [int(i[-5:]) for i in geo]
    # new 'Id' values
    new_id = [int(''.join(i.split('US'))) for i in _id]
    # new .Id2 instances
    new__id2 = [int(d) for d in __id2]
    
    # convert dataframe
    new_df = df.copy()
    new_df.Geography = new_geos
    new_df.Id = new_id
    new_df.Id2 = new__id2
    
    # return new df
    return new_df

In [13]:
# load w/o reset
f = load_copy_data(1)

In [14]:
# copy for safeguard and hedge reload
frames = f.copy()

In [15]:
# extract copy of 2011 
y2k11 = frames.copy()  #[0]

In [16]:
# examine (2011)
y2k11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 327 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: object(327)
memory usage: 82.6+ MB


In [25]:
nuy11 = test_non_unique(y2k11)
len(nuy11)

32

In [18]:
# convert all but first 3 columns to numeric
data = y2k11.copy()
k2011 = to_numeric_but(save_these_columns=3,dataframe=data)

In [19]:
k2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: object(407)
memory usage: 102.8+ MB


In [26]:
nonuni = test_non_unique(k2011)
len(nonuni)

112

In [34]:
nuy11[:2] , nonuni[:2]

([[87, 'Estimate; SEX AND AGE - 18 years and over'],
  [88, 'Margin of Error; SEX AND AGE - 18 years and over']],
 [[8, 'Estimate; SEX AND AGE - Male'], [9, 'Estimate; SEX AND AGE - Male']])

In [54]:
len(set(k2011.columns[:10]))

9

In [11]:
for i in k2011.columns:
    if i not in y2k11.columns:
        print(i)

In [55]:
# now convert the first 3 columns
adjust_first_3 = k2011.copy()
_2011df_ = geography_to_zipcode_ids_to_numeric(adjust_first_3)

In [56]:
_2011df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: int64(3), object(404)
memory usage: 102.8+ MB


In [60]:
_2011df_ = _2011df_.dropna(axis=1)  # , how='any'

AttributeError: 'DataFrame' object has no attribute 'dropnull'

In [58]:
# no change, likely due to 'coerce' instead of 'ignore'
_2011df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: int64(3), object(404)
memory usage: 102.8+ MB


In [63]:
_2011df_.head(100)

Unnamed: 0,Id,Id2,Geography,Estimate; SEX AND AGE - Total population,Margin of Error; SEX AND AGE - Total population,Percent; SEX AND AGE - Total population,Percent Margin of Error; SEX AND AGE - Total population,Estimate; SEX AND AGE - Male,Estimate; SEX AND AGE - Male.1,Estimate; SEX AND AGE - Male.2,...,Percent; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races including Some other race,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races including Some other race,"Estimate; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races",Estimate; HISPANIC OR LATINO AND RACE - Total housing units,Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units,Percent; HISPANIC OR LATINO AND RACE - Total housing units,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
1,860000000601,601,601,18533,310,18533,(X),8971,6495,1080,...,0.0,0.2,0,92,0.0,0.2,6503,164,(X),(X)
2,860000000602,602,602,41930,136,41930,(X),20568,15453,2395,...,0.0,0.1,228,125,0.5,0.3,16336,225,(X),(X)
3,860000000603,603,603,54475,839,54475,(X),26588,19831,3725,...,0.0,0.1,250,130,0.5,0.2,23245,334,(X),(X)
4,860000000606,606,606,6386,291,6386,(X),3085,2206,337,...,0.0,0.5,0,92,0.0,0.5,2373,111,(X),(X)
5,860000000610,610,610,29111,173,29111,(X),14162,10512,1880,...,0.0,0.1,0,92,0.0,0.1,11308,147,(X),(X)
6,860000000612,612,612,70541,1436,70541,(X),33449,24810,4914,...,0.0,0.1,40,46,0.1,0.1,29575,452,(X),(X)
7,860000000616,616,616,10617,866,10617,(X),5166,3723,804,...,0.0,0.3,3,6,0.0,0.1,4466,264,(X),(X)
8,860000000617,617,617,24458,261,24458,(X),11752,8535,1537,...,0.0,0.1,17,28,0.1,0.1,9627,192,(X),(X)
9,860000000622,622,622,5419,854,5419,(X),2392,2030,596,...,0.0,0.6,0,92,0.0,0.6,7042,351,(X),(X)
10,860000000623,623,623,45314,854,45314,(X),21904,16161,3346,...,0.0,0.1,0,92,0.0,0.1,19997,390,(X),(X)


In [98]:
# for i,j in _2011df_.sample(5,axis=0):
#     print(row)
x=0
for i in _2011df_.sample(1000,axis=0).values:
    x+=1
    y=0
    for j in i:
        y+=1
        if j == '(X)':
            print(x,y)

1 7
1 86
1 87
1 95
1 111
1 119
1 151
1 179
1 315
1 343
1 406
1 407
2 7
2 86
2 87
2 95
2 111
2 119
2 151
2 179
2 315
2 343
2 406
2 407
3 7
3 86
3 87
3 95
3 111
3 119
3 151
3 179
3 315
3 343
3 406
3 407
4 7
4 86
4 87
4 95
4 111
4 119
4 151
4 179
4 315
4 343
4 406
4 407
5 7
5 86
5 87
5 95
5 111
5 119
5 151
5 179
5 315
5 343
5 406
5 407
6 7
6 86
6 87
6 95
6 111
6 119
6 151
6 179
6 315
6 343
6 406
6 407
7 7
7 86
7 87
7 95
7 111
7 119
7 151
7 179
7 315
7 343
7 406
7 407
8 7
8 86
8 87
8 95
8 111
8 119
8 151
8 179
8 315
8 343
8 406
8 407
9 7
9 86
9 87
9 95
9 111
9 119
9 151
9 179
9 315
9 343
9 406
9 407
10 7
10 86
10 87
10 95
10 111
10 119
10 151
10 179
10 315
10 343
10 406
10 407
11 7
11 86
11 87
11 95
11 111
11 119
11 151
11 179
11 315
11 343
11 406
11 407
12 7
12 86
12 87
12 95
12 111
12 119
12 151
12 179
12 315
12 343
12 406
12 407
13 7
13 86
13 87
13 95
13 111
13 119
13 151
13 179
13 315
13 343
13 406
13 407
14 7
14 86
14 87
14 95
14 111
14 119
14 151
14 179
14 315
14 343
14 406
14 407
15

131 343
131 406
131 407
132 7
132 86
132 87
132 95
132 111
132 119
132 151
132 179
132 315
132 343
132 406
132 407
133 7
133 86
133 87
133 95
133 111
133 119
133 151
133 179
133 315
133 343
133 406
133 407
134 7
134 86
134 87
134 95
134 111
134 119
134 151
134 179
134 315
134 343
134 406
134 407
135 7
135 86
135 87
135 95
135 111
135 119
135 151
135 179
135 315
135 343
135 406
135 407
136 7
136 86
136 87
136 95
136 111
136 119
136 151
136 179
136 315
136 343
136 406
136 407
137 7
137 86
137 87
137 95
137 111
137 119
137 151
137 179
137 315
137 343
137 406
137 407
138 7
138 86
138 87
138 95
138 111
138 119
138 151
138 179
138 315
138 343
138 406
138 407
139 7
139 86
139 87
139 95
139 111
139 119
139 151
139 179
139 315
139 343
139 406
139 407
140 7
140 86
140 87
140 95
140 111
140 119
140 151
140 179
140 315
140 343
140 406
140 407
141 7
141 86
141 87
141 95
141 111
141 119
141 151
141 179
141 315
141 343
141 406
141 407
142 7
142 86
142 87
142 95
142 111
142 119
142 151
142 179
142 315

252 111
252 119
252 151
252 179
252 315
252 343
252 406
252 407
253 7
253 86
253 87
253 95
253 111
253 119
253 151
253 179
253 315
253 343
253 406
253 407
254 7
254 86
254 87
254 95
254 111
254 119
254 151
254 179
254 315
254 343
254 406
254 407
255 7
255 86
255 87
255 95
255 111
255 119
255 151
255 179
255 315
255 343
255 406
255 407
256 7
256 86
256 87
256 95
256 111
256 119
256 151
256 179
256 315
256 343
256 406
256 407
257 7
257 86
257 87
257 95
257 111
257 119
257 151
257 179
257 315
257 343
257 406
257 407
258 7
258 86
258 87
258 95
258 111
258 119
258 151
258 179
258 315
258 343
258 406
258 407
259 7
259 86
259 87
259 95
259 111
259 119
259 151
259 179
259 315
259 343
259 406
259 407
260 7
260 86
260 87
260 95
260 111
260 119
260 151
260 179
260 315
260 343
260 406
260 407
261 7
261 86
261 87
261 95
261 111
261 119
261 151
261 179
261 315
261 343
261 406
261 407
262 7
262 86
262 87
262 95
262 111
262 119
262 151
262 179
262 315
262 343
262 406
262 407
263 7
263 86
263 87
263 95

376 179
376 315
376 343
376 406
376 407
377 7
377 86
377 87
377 95
377 111
377 119
377 151
377 179
377 315
377 343
377 406
377 407
378 7
378 86
378 87
378 95
378 111
378 119
378 151
378 179
378 315
378 343
378 406
378 407
379 7
379 86
379 87
379 95
379 111
379 119
379 151
379 179
379 315
379 343
379 406
379 407
380 7
380 86
380 87
380 95
380 111
380 119
380 151
380 179
380 315
380 343
380 406
380 407
381 7
381 86
381 87
381 95
381 111
381 119
381 151
381 179
381 315
381 343
381 406
381 407
382 7
382 86
382 87
382 95
382 111
382 119
382 151
382 179
382 315
382 343
382 406
382 407
383 7
383 86
383 87
383 95
383 111
383 119
383 151
383 179
383 315
383 343
383 406
383 407
384 7
384 86
384 87
384 95
384 111
384 119
384 151
384 179
384 315
384 343
384 406
384 407
385 7
385 86
385 87
385 95
385 111
385 119
385 151
385 179
385 315
385 343
385 406
385 407
386 7
386 86
386 87
386 95
386 111
386 119
386 151
386 179
386 315
386 343
386 406
386 407
387 7
387 86
387 87
387 95
387 111
387 119
387 151

492 407
493 7
493 86
493 87
493 95
493 111
493 119
493 151
493 179
493 315
493 343
493 406
493 407
494 7
494 86
494 87
494 95
494 111
494 119
494 151
494 179
494 315
494 343
494 406
494 407
495 7
495 86
495 87
495 95
495 111
495 119
495 151
495 179
495 315
495 343
495 406
495 407
496 7
496 86
496 87
496 95
496 111
496 119
496 151
496 179
496 315
496 343
496 406
496 407
497 7
497 86
497 87
497 95
497 111
497 119
497 151
497 179
497 315
497 343
497 406
497 407
498 7
498 86
498 87
498 95
498 111
498 119
498 151
498 179
498 315
498 343
498 406
498 407
499 7
499 86
499 87
499 95
499 111
499 119
499 151
499 179
499 315
499 343
499 406
499 407
500 7
500 86
500 87
500 95
500 111
500 119
500 151
500 179
500 315
500 343
500 406
500 407
501 7
501 86
501 87
501 95
501 111
501 119
501 151
501 179
501 315
501 343
501 406
501 407
502 7
502 86
502 87
502 95
502 111
502 119
502 151
502 179
502 315
502 343
502 406
502 407
503 7
503 86
503 87
503 95
503 111
503 119
503 151
503 179
503 315
503 343
503 406

619 407
620 7
620 86
620 87
620 95
620 111
620 119
620 151
620 179
620 315
620 343
620 406
620 407
621 7
621 86
621 87
621 95
621 111
621 119
621 151
621 179
621 315
621 343
621 406
621 407
622 7
622 86
622 87
622 95
622 111
622 119
622 151
622 179
622 315
622 343
622 406
622 407
623 7
623 86
623 87
623 95
623 111
623 119
623 151
623 179
623 315
623 343
623 406
623 407
624 7
624 86
624 87
624 95
624 111
624 119
624 151
624 179
624 315
624 343
624 406
624 407
625 7
625 86
625 87
625 95
625 111
625 119
625 151
625 179
625 315
625 343
625 406
625 407
626 7
626 86
626 87
626 95
626 111
626 119
626 151
626 179
626 315
626 343
626 406
626 407
627 7
627 86
627 87
627 95
627 111
627 119
627 151
627 179
627 315
627 343
627 406
627 407
628 7
628 86
628 87
628 95
628 111
628 119
628 151
628 179
628 315
628 343
628 406
628 407
629 7
629 86
629 87
629 95
629 111
629 119
629 151
629 179
629 315
629 343
629 406
629 407
630 7
630 86
630 87
630 95
630 111
630 119
630 151
630 179
630 315
630 343
630 406

742 315
742 343
742 406
742 407
743 7
743 86
743 87
743 95
743 111
743 119
743 151
743 179
743 315
743 343
743 406
743 407
744 7
744 86
744 87
744 95
744 111
744 119
744 151
744 179
744 315
744 343
744 406
744 407
745 7
745 86
745 87
745 95
745 111
745 119
745 151
745 179
745 315
745 343
745 406
745 407
746 7
746 86
746 87
746 95
746 111
746 119
746 151
746 179
746 315
746 343
746 406
746 407
747 7
747 86
747 87
747 95
747 111
747 119
747 151
747 179
747 315
747 343
747 406
747 407
748 7
748 86
748 87
748 95
748 111
748 119
748 151
748 179
748 315
748 343
748 406
748 407
749 7
749 86
749 87
749 95
749 111
749 119
749 151
749 179
749 315
749 343
749 406
749 407
750 7
750 86
750 87
750 95
750 111
750 119
750 151
750 179
750 315
750 343
750 406
750 407
751 7
751 86
751 87
751 95
751 111
751 119
751 151
751 179
751 315
751 343
751 406
751 407
752 7
752 86
752 87
752 95
752 111
752 119
752 151
752 179
752 315
752 343
752 406
752 407
753 7
753 86
753 87
753 95
753 111
753 119
753 151
753 179

862 407
863 7
863 86
863 87
863 95
863 111
863 119
863 151
863 179
863 315
863 343
863 406
863 407
864 7
864 86
864 87
864 95
864 111
864 119
864 151
864 179
864 315
864 343
864 406
864 407
865 7
865 86
865 87
865 95
865 111
865 119
865 151
865 179
865 315
865 343
865 406
865 407
866 7
866 86
866 87
866 95
866 111
866 119
866 151
866 179
866 315
866 343
866 406
866 407
867 7
867 86
867 87
867 95
867 111
867 119
867 151
867 179
867 315
867 343
867 406
867 407
868 7
868 86
868 87
868 95
868 111
868 119
868 151
868 179
868 315
868 343
868 406
868 407
869 7
869 86
869 87
869 95
869 111
869 119
869 151
869 179
869 315
869 343
869 406
869 407
870 7
870 86
870 87
870 95
870 111
870 119
870 151
870 179
870 315
870 343
870 406
870 407
871 7
871 86
871 87
871 95
871 111
871 119
871 151
871 179
871 315
871 343
871 406
871 407
872 7
872 86
872 87
872 95
872 111
872 119
872 151
872 179
872 315
872 343
872 406
872 407
873 7
873 86
873 87
873 95
873 111
873 119
873 151
873 179
873 315
873 343
873 406

985 179
985 315
985 343
985 406
985 407
986 7
986 86
986 87
986 95
986 111
986 119
986 151
986 179
986 315
986 343
986 406
986 407
987 7
987 86
987 87
987 95
987 111
987 119
987 151
987 179
987 315
987 343
987 406
987 407
988 7
988 86
988 87
988 95
988 111
988 119
988 151
988 179
988 315
988 343
988 406
988 407
989 7
989 86
989 87
989 95
989 111
989 119
989 151
989 179
989 315
989 343
989 406
989 407
990 7
990 86
990 87
990 95
990 111
990 119
990 151
990 179
990 315
990 343
990 406
990 407
991 7
991 86
991 87
991 95
991 111
991 119
991 151
991 179
991 315
991 343
991 406
991 407
992 7
992 86
992 87
992 95
992 111
992 119
992 151
992 179
992 315
992 343
992 406
992 407
993 7
993 86
993 87
993 95
993 111
993 119
993 151
993 179
993 315
993 343
993 406
993 407
994 7
994 86
994 87
994 95
994 111
994 119
994 151
994 179
994 315
994 343
994 406
994 407
995 7
995 86
995 87
995 95
995 111
995 119
995 151
995 179
995 315
995 343
995 406
995 407
996 7
996 86
996 87
996 95
996 111
996 119
996 151

In [67]:
_2011df_.Geography.sample(1000)

8752     28128
30392    92107
13100    40316
20290    59636
31933    97108
29207    85346
3257     12424
929       3809
14707    45360
18772    55933
21802    63005
16380    49321
9668     30157
9766     30332
28106    80917
27577    79248
19672    57793
12585    38702
26782    77362
30084    90221
29976    89460
3276     12448
153       1034
5864     18830
32977    99649
17828    53529
24346    70722
15952    48393
13290    41064
2797     10988
         ...  
9661     30144
20474    60088
8921     28467
6491     21040
17189    51101
3981     14025
2713     10577
25269    73068
24765    72019
22702    65627
31941    97116
31899    97037
32118    97469
3468     12836
29711    88024
9335     29453
1377      4930
16551    49725
8551     27801
2993     11742
6787     22025
4392     15054
25929    75119
4086     14227
2124      7607
21947    63388
17038    50595
7783     25090
14456    44707
12922    39669
Name: Geography, Length: 1000, dtype: int64

In [61]:
# Convert DataFrame to matrix
mat = _2011df_.values
# Using sklearn
km = KMeans(n_clusters=5)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pandas.DataFrame([dataset.index,labels]).T

ValueError: could not convert string to float: '(X)'

In [None]:
# copy df for editing
k11 = y2k11.copy()

'''split into 2 df and rejoin after convert to int'''
# df to save
save_k11 = k11.copy()
# columns to save
save_k11 = save_k11.copy()[save_k11.columns[:3]]

# df to edit
switch_k11 = k11.copy()
# columns to edit
switch_k11 = switch_k11.copy()[switch_k11.columns[3:]]

# edited columns
swapped_k11 = switch_k11.copy().apply(pd.to_numeric, errors='ignore')

# new (edited) dataframe
new_k11 = pd.concat([save_k11,swapped_k11],axis=1)

len(new_k11.columns)

In [None]:
# # pull column names
# k11_cols = y2k11.copy().columns[4:]
# for c in range(len(k11_cols)):
#     a = pd.Series(k11[k11_cols])
# #     k11.loc[[k11_cols][c]] = pd.to_numeric(a,errors='ignore')
# # pd.Series(k11[k11_cols[0]])

In [None]:
k11_cols = y2k11.copy().columns
__k11_cols__ = y2k11.copy().columns[4:]
# len(k11_cols),len(__k11_cols__)
print(f'{__k11_cols__[:3]}\n{k11_cols[:3]}')

In [None]:
for frame in frames:
    print(frame.info(),'\n')

In [None]:
copies = frames[:2].copy()
for df in copies:
    for column in df.columns: 
        df[column] = pd.to_numeric([df[column]], errors='ignore')

- ***NOTE***:
    - path to mvp
        - whiteboard_pics/acs_5yr_11-17_path-to-mvp.jpg

In [None]:
dataset_array = frames[0].values
dataset_array

In [None]:
'''
STEP 2 >> find all columns which coexist across all dataframes at current position
'''
columns_by_frame = [frame.columns for frame in frames]
count_columns_by_frame = [len(frame) for frame in columns_by_frame]

In [None]:
out = []
# for range of df with most columns
for count in range(len(max(count_columns_by_frame))):
    # if index of every frame is same as index of frame with most columns
    if [frame for frame in columns_by_frame][count] == frames[6][count]:
        out.append(count)
out

In [None]:
# collect all
years = frames  # [y2k11,y2k12,y2k13,y2k14,y2k15,y2k16,y2k17]
for year in years:
    print(len(year.columns),'\n',year.info(),'\n\n')

In [None]:


X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])

kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.labels_

# array([1, 1, 1, 0, 0, 0], dtype=int32)
kmeans.predict([[0, 0], [12, 3]])
# array([1, 0], dtype=int32)
kmeans.cluster_centers_
# array([[10.,  2.], [ 1.,  2.]])

In [None]:


# Scikit learn plays really well with Pandas, so I suggest you use it. Here's an example:

# In [1]: 
# import pandas as pd
# import numpy as np
# from sklearn.cross_validation import train_test_split
data = np.reshape(np.random.randn(20),(10,2)) # 10 training examples
labels = np.random.randint(2, size=10) # 10 labels

# In [2]: 
X = pd.DataFrame(data)
y = pd.Series(labels)

# In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0)

# In [4]: X_test
# Out[4]:

#      0       1
# 2   -1.39   -1.86
# 8    0.48   -0.81
# 4   -0.10   -1.83

# In [5]: y_test
# Out[5]:

# 2    1
# 8    1
# 4    1

In [None]:
X_train

In [None]:
# q=pd.read_csv('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')

In [None]:
pizfsdazapizzaadsf = (2,1,0,4,32,7,2,9,5)
max(pizfsdazapizzaadsf)