In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

"""
STEP 0 >> imports; def clean_census & other functions
"""

# default cleaning method until proven otherwise
def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
                >> if != False
                    > integer
                        >> returns the first {head} rows (using .head() method) 
                            > instead of enitre dataframe
        >> reset
            > default=True
                >> resets index after taking out rows
            > if set to False
                >> will not reset index
        >> set_index
            > default=False
            > if != False
                >> will set_index of new df to set_index
    output)
        >> dataframe cleaned like 2000 Census age&sex by 5-digit Zip Code (also how 2010 for same is cleaned)
    how)
        1. reads in csv , assumes it's large
        2. makes a copy for editing 
            > and potential future use
        3. locates readable column names  and non-readable names 
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        4. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th column (column where readable names were stored)
        
    '''
    # load data
    df = pd.read_csv( csv_path , low_memory=False )

    # and copy
    _df = df.copy()

    # reset column names to current 0th row values
    _df.columns = _df.iloc[0]
    # new 2000 dataframe without row where values are from
    clean_df = _df[1:]
    
    # default
    if reset==True:
        # reset index
        clean_df = clean_df.reset_index()
        
    # set_index
    if set_index:
        clean_df = clean_df.set_index(set_index)
    
    if head:
        # return first {head} rows of dataframe
        return clean_df.head(head)
    else:
        # return dataframe
        return clean_df

'''
STEP 1 >> load data, reset; make copies
'''
def load_copy_data(i):
    '''
    loads data
    
    input)
        >> i
            > if 0
                >> .reset_index() after deleting row contining column names
            > if 1
                >> do not .reset_index()
                
    '''
    if i==0:
        # load with reset
        # 2011 
        twenty_eleven = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')
        # 2012
        # twenty_twelve = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv')
        #2013
        # twenty_thirteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv')
        # 2014
        # twenty_fourteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv')
        # 2015
        # twenty_fifteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv')
        #2016
        # twenty_sixteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv')
        #2017
        # twenty_seventeen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv')
    if i==1:
        # load without reset
        # 2011 
        twenty_eleven = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv',reset=False)
        # 2012
        # twenty_twelve = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv',reset=False)
        #2013
        # twenty_thirteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv',reset=False)
        # 2014
        # twenty_fourteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv',reset=False)
        # 2015
        # twenty_fifteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv',reset=False)
        #2016
        # twenty_sixteen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv',reset=False)
        #2017
        # twenty_seventeen = clean_census_frame('../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv',reset=False)
    
    # copy 
    # 2011 
    _y2k11 = twenty_eleven.copy()
    # 2012
    # _y2k12 = twenty_twelve.copy()
    #2013
    # _y2k13 = twenty_thirteen.copy()
    # 2014
    # _y2k14 = twenty_fourteen.copy()
    # 2015
    # _y2k15 = twenty_fifteen.copy()
    #2016
    # _y2k16 = twenty_sixteen.copy()
    #2017
    # _y2k17 = twenty_seventeen.copy()
    
    # output list of copied frames
    return _y2k11  # [_y2k11,_y2k12,_y2k13,_y2k14,_y2k15,_y2k16,_y2k17]


def test_non_unique(column_names):
    '''
    input) 
        >> list of column names {column_names}
            > columns to check for duplicate instances
    output)
        >> indexed list of names occouring more than once 
    '''
    # store first instance
    first_occour = []
    # store 2nd+ instance(s)
    non_unique = []
    # we're going to want index
    for i,_ in enumerate(column_names):
        # not first time
        if _ not in first_occour:
            first_occour.append(_)
        # if not first, tag&bag
        else:
            non_unique.append([i,_])
    # output index w/ non-first instances
    return non_unique


def to_numeric_but(save_these_columns,dataframe):
    '''
    split into 2 df and rejoin after convert to int
    
    inputs:
        >> save_these_columns=number of columns to save
            > currently must include one end of df 
                >> might could run function multiple times to edit slices
                >> single number, not range (yet)
        >> dataframe
            > dataframe to shif to numeric (but)
    output:
        >> concatted pd.DataFrame of 
            > og columns you chose to save
            > columns converted to numeric
    '''
    # copy df for editing
    k = dataframe.copy()

    # columns to save
    save_k = k[k.columns[:save_these_columns]]
    # columns to edit
    switch_k = k[k.columns[save_these_columns:]]

    # edited columns  # coerce , ignore , raise
    swapped_k = switch_k.apply(pd.to_numeric, errors='coerce')

    # new (edited) dataframe
    new_k = pd.concat([save_k,swapped_k],axis=1)

    return new_k


def geography_to_zipcode_ids_to_numeric(dataframe):
    '''
    convert 
        >> .Geography values 
            > like 'ZCTA5 00601' 
            > to int(00601)
        >> .Id values
            > like '8600000US00601' 
            > to int(860000000601)
        >> .Id2 values
            > like '00601'
            > to int(00601)
    '''
    # copy
    df = dataframe.copy()
    
    # set old Geography
    geo = df.Geography
    # set old Id
    _id = df.Id
    # set old Id2
    __id2 = df.Id2
    
    # make new 'Geography' values
    new_geos = [int(i[-5:]) for i in geo]
    # new 'Id' values
    new_id = [int(''.join(i.split('US'))) for i in _id]
    # new .Id2 instances
    new__id2 = [int(d) for d in __id2]
    
    # convert dataframe
    new_df = df.copy()
    new_df.Geography = new_geos
    new_df.Id = new_id
    new_df.Id2 = new__id2
    
    # return new df
    return new_df


def kmeans_by(dataframe,n_clusters):
    '''
    inputs:
        >> dataframe
            > dataframe to be edited
        >> n_clusters
            > number of clusters for KMeans
    output:
        > pd.Dataframe of 
    '''
    # copy data 
    y2k11 = dataframe.copy()  

    # convert all but first 3 columns to numeric
    data = y2k11.copy()
    k2011 = to_numeric_but(save_these_columns=3,dataframe=data)

    # now convert the first 3 columns
    adjust_first_3 = k2011.copy()
    _2011df_ = geography_to_zipcode_ids_to_numeric(adjust_first_3)

    '''KMeans'''
    # fill NaN values
    t = _2011df_.copy().fillna(0)
    
    # Convert DataFrame to matrix
    mat = t.values
    
    # Using sklearn
    km = KMeans(n_clusters)
    # fit our values
    km.fit(mat)
    
    # Get cluster assignment labels
    labels = km.labels_
    
    # Format results as a DataFrame
    results = pd.DataFrame([t.index,labels]).T

    # display results
    return results

In [2]:
# load w/o reset
f = load_copy_data(1)

In [3]:
# copy for safeguard and hedge reload
frames = f.copy()

In [4]:
# extract copy of 2011 
y2k11 = frames.copy()  #[0]

In [5]:
# examine (2011)
y2k11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 327 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: object(327)
memory usage: 82.6+ MB


In [10]:
nuy11 = test_non_unique(y2k11)
len(nuy11)

32

In [6]:
# convert all but first 3 columns to numeric
data = y2k11.copy()
k2011 = to_numeric_but(save_these_columns=3,dataframe=data)

In [7]:
k2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: float64(235), int64(169), object(3)
memory usage: 102.8+ MB


In [8]:
nonuni = test_non_unique(k2011)
len(nonuni)

112

In [11]:
nuy11[:2] , nonuni[:2]

([[87, 'Estimate; SEX AND AGE - 18 years and over'],
  [88, 'Margin of Error; SEX AND AGE - 18 years and over']],
 [[8, 'Estimate; SEX AND AGE - Male'], [9, 'Estimate; SEX AND AGE - Male']])

In [12]:
len(set(k2011.columns[:10]))

8

In [13]:
for i in k2011.columns:
    if i not in y2k11.columns:
        print(i)

In [14]:
# now convert the first 3 columns
adjust_first_3 = k2011.copy()
_2011df_ = geography_to_zipcode_ids_to_numeric(adjust_first_3)

In [None]:
# rand_test = _2011df_.copy()
# rand_test['Margin of Error; SEX AND AGE - Total population'].apply(lambda x : 0 if x =='*****' else x)

In [15]:
_2011df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: float64(235), int64(172)
memory usage: 102.8 MB


In [None]:
# _2011df_ = _2011df_.dropna()  # , how='any'

In [16]:
# no change
_2011df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: float64(235), int64(172)
memory usage: 102.8 MB


In [None]:
# _2011df_.head(15)
# _2011df_ = _2011df_.apply(pd.to_numeric, errors='coerce')

In [None]:
# _2011df_['Margin of Error; SEX AND AGE - Total population'].apply(lambda x : 0 for x in i if x =='*****' else x)

In [17]:
x=-1
x_out=set()
star_out=set()
other_out=set()
for i in _2011df_.sample(100,axis=0).values:
    x+=1
    y=0
    for j in i:
        y+=1
        if j == '(X)':
            x_out.add((x-1,y))
        if j == '*****':
            star_out.add((x-1,y))
        if j == '**':
            star_out.add((x-1,y))

In [18]:
x_out

set()

In [19]:
star_out

set()

In [20]:
other_out

set()

In [21]:
collectj = []
for i,j in x_out: 
    collectj.append(j)

In [22]:
collecti = []
for j,i in star_out: 
    collecti.append(i)


In [23]:
count_out=[]
for n in set(collectj):
    q = collectj.count(n)
    count_out.append((n,q))

In [24]:
count_in=[]
for m in set(collecti):
    p = collectj.count(m)
    count_in.append((m,p))

In [25]:
c = pd.DataFrame(data=count_out,columns=['column','count'])
c

Unnamed: 0,column,count


In [26]:
r = pd.DataFrame(data=count_in,columns=['row','count'])
r


Unnamed: 0,row,count


In [27]:
test = _2011df_.copy()

In [None]:
cols_to_drop = [_2011df_.columns[q-1] for q,b in count_out]
stars = [_2011df_.columns[q-1] for q,b in count_out]

In [29]:
len(cols_to_drop), len(stars)

(0, 0)

In [28]:
# convert '*****' to 0
for i in stars: 
    # test.drop(stars,axis=1)
    test[i].apply(lambda x : 0 if x == '*****' else x)
# drop columns with '(X)'
# for cols in 
test = test.drop(cols_to_drop,axis=1)

In [30]:
test.info()  #['Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 1 to 33120
Columns: 407 entries, Id to Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
dtypes: float64(235), int64(172)
memory usage: 102.8 MB


In [None]:
# # ivalues = [i for i in test.values]
# # # [i for i in ivalues if i == '**']
# # for i in range(len(test.values)):
# #     if test.values[i].any() == '**':
# #         print(i,test.values[i])
# # sus=set()
# # for i in test:
# #     for x in test[i]:
# #         if x == '**':
# #             sus.add(i)

# for i in test.columns:
#     test[i].apply(lambda q: 0 if x == '**' else x)

- ***notes***:
    - that wasn't too hard
- ***actions***:
    - drop these columns

In [34]:
t = test.copy().fillna(0)
# Convert DataFrame to matrix
mat = t.values
# Using sklearn
km = KMeans(n_clusters=5)
# fit our values
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame([t.index,labels]).T

In [35]:
results

Unnamed: 0,0,1
0,1,2
1,2,2
2,3,3
3,4,4
4,5,2
5,6,3
6,7,4
7,8,2
8,9,4
9,10,3


In [None]:
# copy df for editing
k11 = y2k11.copy()

'''split into 2 df and rejoin after convert to int'''
# df to save
save_k11 = k11.copy()
# columns to save
save_k11 = save_k11.copy()[save_k11.columns[:3]]

# df to edit
switch_k11 = k11.copy()
# columns to edit
switch_k11 = switch_k11.copy()[switch_k11.columns[3:]]

# edited columns
swapped_k11 = switch_k11.copy().apply(pd.to_numeric, errors='ignore')

# new (edited) dataframe
new_k11 = pd.concat([save_k11,swapped_k11],axis=1)

len(new_k11.columns)

In [None]:
# # pull column names
# k11_cols = y2k11.copy().columns[4:]
# for c in range(len(k11_cols)):
#     a = pd.Series(k11[k11_cols])
# #     k11.loc[[k11_cols][c]] = pd.to_numeric(a,errors='ignore')
# # pd.Series(k11[k11_cols[0]])

In [None]:
k11_cols = y2k11.copy().columns
__k11_cols__ = y2k11.copy().columns[4:]
# len(k11_cols),len(__k11_cols__)
print(f'{__k11_cols__[:3]}\n{k11_cols[:3]}')

In [None]:
for frame in frames:
    print(frame.info(),'\n')

In [None]:
copies = frames[:2].copy()
for df in copies:
    for column in df.columns: 
        df[column] = pd.to_numeric([df[column]], errors='ignore')

- ***NOTE***:
    - path to mvp
        - whiteboard_pics/acs_5yr_11-17_path-to-mvp.jpg

In [None]:
dataset_array = frames[0].values
dataset_array

In [None]:
'''
STEP 2 >> find all columns which coexist across all dataframes at current position
'''
columns_by_frame = [frame.columns for frame in frames]
count_columns_by_frame = [len(frame) for frame in columns_by_frame]

In [None]:
out = []
# for range of df with most columns
for count in range(len(max(count_columns_by_frame))):
    # if index of every frame is same as index of frame with most columns
    if [frame for frame in columns_by_frame][count] == frames[6][count]:
        out.append(count)
out

In [None]:
# collect all
years = frames  # [y2k11,y2k12,y2k13,y2k14,y2k15,y2k16,y2k17]
for year in years:
    print(len(year.columns),'\n',year.info(),'\n\n')

In [None]:


X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])

kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.labels_

# array([1, 1, 1, 0, 0, 0], dtype=int32)
kmeans.predict([[0, 0], [12, 3]])
# array([1, 0], dtype=int32)
kmeans.cluster_centers_
# array([[10.,  2.], [ 1.,  2.]])

In [None]:


# Scikit learn plays really well with Pandas, so I suggest you use it. Here's an example:

# In [1]: 
# import pandas as pd
# import numpy as np
# from sklearn.cross_validation import train_test_split
data = np.reshape(np.random.randn(20),(10,2)) # 10 training examples
labels = np.random.randint(2, size=10) # 10 labels

# In [2]: 
X = pd.DataFrame(data)
y = pd.Series(labels)

# In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0)

# In [4]: X_test
# Out[4]:

#      0       1
# 2   -1.39   -1.86
# 8    0.48   -0.81
# 4   -0.10   -1.83

# In [5]: y_test
# Out[5]:

# 2    1
# 8    1
# 4    1

In [None]:
X_train

In [None]:
# q=pd.read_csv('../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')

In [None]:
pizfsdazapizzaadsf = (2,1,0,4,32,7,2,9,5)
max(pizfsdazapizzaadsf)

In [None]:
x=[3,2,1,4,6,5]
print(x)