In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

"""
STEP 0 >> imports; def clean
"""

# default cleaning method until proven otherwise
def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    Load a Census / ACS csv export and promote its readable label row to the header.

    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
            > if truthy (a non-zero integer)
                >> returns only the first {head} rows (using .head() method) 
                    > instead of the entire dataframe
        >> reset
            > default=True
                >> calls .reset_index() after the label row is removed
                    > the previous index (1, 2, ...) is kept as a column named 'index'
            > if falsy
                >> index is left untouched (first data row keeps label 1)
        >> set_index
            > default=False
            > if truthy
                >> passed to .set_index() on the cleaned dataframe
    output)
        >> dataframe cleaned like 2000 Census age&sex by 5-digit Zip Code (also how 2010 for same is cleaned)
    raises)
        >> IndexError
            > if the csv has no rows below its header line (no label row to promote)
    how)
        1. reads in csv , assumes it's large
            > so the frame is edited directly instead of being duplicated first
        2. locates readable column names  and non-readable names 
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        3. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th row (row where readable names were stored)
        
    '''
    # load data
    raw_df = pd.read_csv( csv_path , low_memory=False )

    # fail with a readable message instead of an opaque positional-indexer error
    if len(raw_df) == 0:
        raise IndexError(f'{csv_path}: no label row found below the header line')

    # promote the current 0th row values to column names
    raw_df.columns = raw_df.iloc[0]
    # keep everything below the row the names came from
    clean_df = raw_df.iloc[1:]

    # default: rebuild a 0-based index (old index is kept as an 'index' column)
    if reset:
        clean_df = clean_df.reset_index()

    # optional: use the requested column(s) as the index
    if set_index:
        clean_df = clean_df.set_index(set_index)

    # first {head} rows when requested, otherwise the whole dataframe
    return clean_df.head(head) if head else clean_df

'''
STEP 1 >> load data, reset; make copies
'''
def load_copy_data(i, data_dir='../../data/American_Community_Survey/aff_download', years=range(11, 18)):
    '''
    loads one cleaned ACS 5-year DP05 frame per year and returns independent copies
    
    input)
        >> i
            > if 0
                >> .reset_index() after deleting row containing column names
            > if 1
                >> do not .reset_index()
        >> data_dir
            > default = the aff_download folder used by this notebook
            > folder holding the ACS_{yy}_5YR_DP05_with_ann.csv files
        >> years
            > default = range(11, 18)  (2011 .. 2017)
            > two-digit years to load, in the order they should be returned
    output)
        >> list of dataframes, one per entry in years (default: 2011 .. 2017)
    raises)
        >> ValueError
            > if i is neither 0 nor 1
                >> raised before any file is read
    '''
    # translate the mode flag into the reset argument of clean_census_frame
    if i == 0:
        reset = True
    elif i == 1:
        reset = False
    else:
        raise ValueError(f'i must be 0 (reset index) or 1 (keep index), got {i!r}')

    frames = []
    for year in years:
        csv_path = f'{data_dir}/ACS_{year:02d}_5YR_DP05_with_ann.csv'
        # .copy() so each returned frame is independent of whatever the cleaner returned
        frames.append(clean_census_frame(csv_path, reset=reset).copy())

    # output list of copied frames
    return frames

In [2]:
# load the seven ACS frames (2011-2017) w/o reset: i=1 makes load_copy_data pass reset=False
f = load_copy_data(1)

In [3]:
# copy for safeguard and hedge reload:
# copy every DataFrame, not just the list holding them, so in-place edits made
# through `frames` can never reach the freshly loaded originals in `f`
frames = [frame.copy() for frame in f]

In [4]:
# extract an independent copy of the 2011 frame (first entry of the loaded list);
# copying the frame itself, rather than the list, is what makes it a real copy
y2k11 = frames[0].copy()

In [5]:
# examine (2011): preview the first ten rows rather than echoing the whole frame
y2k11.head(10)

Unnamed: 0,Id,Id2,Geography,Estimate; SEX AND AGE - Total population,Margin of Error; SEX AND AGE - Total population,Percent; SEX AND AGE - Total population,Percent Margin of Error; SEX AND AGE - Total population,Estimate; SEX AND AGE - Male,Margin of Error; SEX AND AGE - Male,Percent; SEX AND AGE - Male,...,Percent; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races including Some other race,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races including Some other race,"Estimate; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races",Estimate; HISPANIC OR LATINO AND RACE - Total housing units,Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units,Percent; HISPANIC OR LATINO AND RACE - Total housing units,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
1,8600000US00601,00601,ZCTA5 00601,18533,310,18533,(X),8971,208,48.4,...,0.0,0.2,0,92,0.0,0.2,6503,164,(X),(X)
2,8600000US00602,00602,ZCTA5 00602,41930,136,41930,(X),20568,84,49.1,...,0.0,0.1,228,125,0.5,0.3,16336,225,(X),(X)
3,8600000US00603,00603,ZCTA5 00603,54475,839,54475,(X),26588,459,48.8,...,0.0,0.1,250,130,0.5,0.2,23245,334,(X),(X)
4,8600000US00606,00606,ZCTA5 00606,6386,291,6386,(X),3085,186,48.3,...,0.0,0.5,0,92,0.0,0.5,2373,111,(X),(X)
5,8600000US00610,00610,ZCTA5 00610,29111,173,29111,(X),14162,86,48.6,...,0.0,0.1,0,92,0.0,0.1,11308,147,(X),(X)
6,8600000US00612,00612,ZCTA5 00612,70541,1436,70541,(X),33449,835,47.4,...,0.0,0.1,40,46,0.1,0.1,29575,452,(X),(X)
7,8600000US00616,00616,ZCTA5 00616,10617,866,10617,(X),5166,505,48.7,...,0.0,0.3,3,6,0.0,0.1,4466,264,(X),(X)
8,8600000US00617,00617,ZCTA5 00617,24458,261,24458,(X),11752,98,48.0,...,0.0,0.1,17,28,0.1,0.1,9627,192,(X),(X)
9,8600000US00622,00622,ZCTA5 00622,5419,854,5419,(X),2392,400,44.1,...,0.0,0.6,0,92,0.0,0.6,7042,351,(X),(X)
10,8600000US00623,00623,ZCTA5 00623,45314,854,45314,(X),21904,400,48.3,...,0.0,0.1,0,92,0.0,0.1,19997,390,(X),(X)


In [37]:
# copy df for editing
k11 = y2k11.copy()


def _to_numeric_or_keep(column):
    '''Return `column` converted with pd.to_numeric, or unchanged if a value cannot be parsed.'''
    # explicit replacement for the deprecated pd.to_numeric(..., errors='ignore')
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# split into 2 df and rejoin after converting to numeric

# columns to save: the first three (Id, Id2, Geography in the frame displayed above)
# stay as text; Id2 values such as 00601 would lose their leading zeros as numbers
save_k11 = k11.iloc[:, :3].copy()

# columns to edit: everything after the first three
switch_k11 = k11.iloc[:, 3:].copy()

# edited columns: numeric where the whole column parses, untouched otherwise
swapped_k11 = switch_k11.apply(_to_numeric_or_keep)

# new (edited) dataframe
new_k11 = pd.concat([save_k11, swapped_k11], axis=1)

len(new_k11.columns)

407

In [29]:
# # pull column names
# k11_cols = y2k11.copy().columns[4:]
# for c in range(len(k11_cols)):
#     a = pd.Series(k11[k11_cols])
# #     k11.loc[[k11_cols][c]] = pd.to_numeric(a,errors='ignore')
# # pd.Series(k11[k11_cols[0]])

In [None]:
# column labels of the 2011 frame: all of them, and the same labels minus the first four
k11_cols = y2k11.columns
__k11_cols__ = k11_cols[4:]
# len(k11_cols),len(__k11_cols__)
# show the first three labels of each, trimmed set first
print(__k11_cols__[:3], k11_cols[:3], sep='\n')

In [None]:
for frame in frames:
    # DataFrame.info() writes its report to stdout and returns None, so call it
    # directly; print()-ing its return value only adds a stray "None"
    frame.info()
    # blank line between reports
    print()

In [None]:
def _to_numeric_or_keep(column):
    '''Return `column` converted with pd.to_numeric, or unchanged if a value cannot be parsed.'''
    # explicit replacement for the deprecated pd.to_numeric(..., errors='ignore')
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# numeric-converted versions of the first two frames (2011, 2012).
# .apply() builds new frames, so the frames held in `frames` are left untouched,
# and each column is passed as a Series (not wrapped in a list) to pd.to_numeric.
# NOTE(review): every column is converted, including the identifier columns;
# zero-padded codes such as '00601' lose their padding as numbers - confirm intended
copies = [frame.apply(_to_numeric_or_keep) for frame in frames[:2]]

- ***NOTE***:
    - path to mvp
        - whiteboard_pics/acs_5yr_11-17_path-to-mvp.jpg

In [None]:
# cell values of the first loaded frame (2011) as a NumPy array, echoed as the cell output
dataset_array = frames[0].to_numpy()
dataset_array

In [None]:
'''
STEP 2 >> find all columns which coexist across all dataframes at current position
'''
# one column Index per yearly frame, in load order
columns_by_frame = [census_frame.columns for census_frame in frames]
# number of columns in each of those Index objects
count_columns_by_frame = list(map(len, columns_by_frame))

In [None]:
# positions (0-based) at which every frame carries the same column label.
# zip() walks the column indexes in parallel and stops at the frame with the fewest
# columns: a position that does not exist in every frame cannot coexist in all of them
out = [
    position
    for position, labels in enumerate(zip(*columns_by_frame))
    if all(label == labels[0] for label in labels)
]
out

In [None]:
# collect all
years = frames  # [y2k11,y2k12,y2k13,y2k14,y2k15,y2k16,y2k17]
for year in years:
    # column count first, then the full report
    print(len(year.columns))
    # DataFrame.info() prints its report itself and returns None, so it is called
    # on its own instead of being passed to print()
    year.info()
    # spacer between frames
    print('\n')

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# six 2-D sample points: x is 1 or 10, y runs through 2, 4, 0 for each x
X = np.array([[x, y] for x in (1, 10) for y in (2, 4, 0)])

# configure the 2-cluster estimator, then fit it in place (fit() returns the estimator)
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)

# cluster label assigned to each training point
kmeans.labels_
# array([1, 1, 1, 0, 0, 0], dtype=int32)

# cluster membership for two unseen points
unseen_points = [[0, 0], [12, 3]]
kmeans.predict(unseen_points)
# array([1, 0], dtype=int32)

# fitted centroids (last expression, so this is what the cell displays)
kmeans.cluster_centers_
# array([[10.,  2.], [ 1.,  2.]])

In [None]:


# Scikit-learn works directly with pandas objects. A minimal train/test split example:

# In [1]: 
# import pandas as pd
# import numpy as np
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# seeded generator: the synthetic data, and therefore the contents of every split,
# are the same on each Restart & Run All
rng = np.random.default_rng(0)
data = rng.standard_normal((10, 2)) # 10 training examples, 2 features each
labels = rng.integers(2, size=10) # 10 binary labels (0 or 1)

# In [2]: 
X = pd.DataFrame(data)
y = pd.Series(labels)

# In [3]:
# random_state fixes which rows land in the train and test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0)

# The outputs below were recorded from an earlier, unseeded run; the values shown
# will not match the seeded data generated above.

# In [4]: X_test
# Out[4]:

#      0       1
# 2   -1.39   -1.86
# 8    0.48   -0.81
# 4   -0.10   -1.83

# In [5]: y_test
# Out[5]:

# 2    1
# 8    1
# 4    1

In [None]:
# inspect the training-feature split returned by train_test_split
X_train

In [None]:
# q=pd.read_csv('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')

In [None]:
# scratch check: built-in max() over a tuple of ints returns its largest element
pizfsdazapizzaadsf = 2, 1, 0, 4, 32, 7, 2, 9, 5
max(pizfsdazapizzaadsf)