## Internet Access in Louisville

In [94]:
import pandas as pd
import numpy as np

In [95]:
# Load the internet-access CSV extract into a DataFrame.

# No context manager (`with open(...) as f:`) is needed here: when pd.read_csv is
# given a path it opens the file and closes it again itself once parsing is done.
# A `with` block only matters if you open the file handle yourself and pass it in.

df = pd.read_csv("jeff_co_int.csv")


In [96]:
# Show every column label in the loaded frame.
df.columns.tolist()

['YEAR',
 'SAMPLE',
 'SERIAL',
 'CBSERIAL',
 'HHWT',
 'CLUSTER',
 'STATEFIP',
 'METRO',
 'MET2013',
 'PUMA',
 'STRATA',
 'GQ',
 'CINETHH',
 'CILAPTOP',
 'CISMRTPHN',
 'CITABLET',
 'CIHISPEED',
 'CIDIAL',
 'PERNUM',
 'PERWT',
 'SEX',
 'AGE',
 'RACE',
 'RACED',
 'HISPAN',
 'HISPAND',
 'EDUC',
 'EDUCD',
 'POVERTY',
 'city',
 'int_acc',
 'hspd_int',
 'computer',
 'tablet',
 'comp_tab',
 'hspd_dev']

There is no `case_when` in pandas, so the choice is either a nested if/else or creating a new column and filling in the values. I find filling in the values easier to read than a nested if/else.

NumPy offers a `case_when`-style equivalent in `np.where` (and `np.select` for several conditions), so you can use that instead.
https://note.nkmk.me/en/python-numpy-where/

In [97]:
# Build a binary high-speed-internet flag: 1 = has it, 0 = does not, NaN = not determined.
# Every assignment goes through df.loc[mask, 'hspd'] in a single indexing step. The chained
# form df['hspd'].loc[mask] = value writes to an intermediate Series, which triggers
# SettingWithCopyWarning and, with pandas Copy-on-Write, never updates df at all.

# Start by creating a new column and filling it with NA values
df['hspd'] = np.nan

# Any time high speed internet is between 10 and 17 (inclusive) in the Census codebook it means they do have internet
df.loc[df['CIHISPEED'].between(10, 17), 'hspd'] = 1

# If they don't have internet at all (CINETHH == 3) and are NA (0) for the high speed internet question,
# mark them as not having high speed internet rather than leaving an NA value
df.loc[(df['CINETHH'] == 3) & (df['CIHISPEED'] == 0), 'hspd'] = 0

# Finally if they say they don't have high speed internet (code 20) mark them as not having it
df.loc[df['CIHISPEED'] == 20, 'hspd'] = 0


Now we need to use census weights to appropriately estimate the percentage of people with high speed internet in their households. 

In [98]:
# Keep only the 2018 rows (same result as df.query('YEAR == 2018')).
df18 = df.loc[df['YEAR'] == 2018]

In [99]:
# Person-weighted (PERWT) totals for 2018, split by the high-speed flag.
per_int = df18.loc[df18['hspd'] == 1, 'PERWT'].sum()
per_no_int = df18.loc[df18['hspd'] == 0, 'PERWT'].sum()

# The denominator has no filter on hspd, so rows whose flag is missing are counted too.
per_total = df18['PERWT'].sum()

# Despite the name this is a 0-1 fraction, not a value out of 100.
per_int_percent = per_int / per_total

In [100]:
# Weighted share (a 0-1 fraction) of 2018 persons flagged hspd == 1.
per_int_percent

0.7204147671636177

In [101]:
# Sum of PERWT over all 2018 rows, i.e. the denominator used for per_int_percent.
per_total

771035

In [102]:
# Total the person weights within each (hspd, YEAR) group.
# as_index=False keeps hspd and YEAR as ordinary columns; without it the result is a
# Series with a two-level index, which is harder to work with.
# To total several columns at once, select them with a list before calling .sum(),
# e.g. [['PERWT', 'HHWT']].
year_totals = (
    df.groupby(['hspd', 'YEAR'], as_index=False)['PERWT']
    .sum()
)
year_totals

Unnamed: 0,hspd,YEAR,PERWT
0,0.0,2013,161299
1,0.0,2014,170809
2,0.0,2015,144626
3,0.0,2016,158520
4,0.0,2017,160849
5,0.0,2018,184795
6,1.0,2013,544770
7,1.0,2014,526480
8,1.0,2015,548358
9,1.0,2016,573787


Pivoting the dataframe is probably the easiest way to sum and get percentages now

In [103]:
# Confirm the grouped result is a DataFrame (not a Series) before pivoting it.
type(year_totals)

pandas.core.frame.DataFrame

In [104]:
# Reshape long -> wide: one row per YEAR, one column per hspd value, cells hold the summed PERWT.
year_wide = year_totals.pivot(
    index='YEAR',
    columns='hspd',
    values='PERWT',
)
year_wide

hspd,0.0,1.0
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,161299,544770
2014,170809,526480
2015,144626,548358
2016,158520,573787
2017,160849,577340
2018,184795,555465


In [105]:
# Column labels are the float hspd codes (0.0 / 1.0) carried over from the pivot.
year_wide.columns

Float64Index([0.0, 1.0], dtype='float64', name='hspd')

In [106]:
# The row index is the survey YEAR.
year_wide.index

Int64Index([2013, 2014, 2015, 2016, 2017, 2018], dtype='int64', name='YEAR')

In [107]:
# The columns Index carries its own name; this is where the stray 'hspd' label in the displayed header comes from.
year_wide.columns.name


'hspd'

In [108]:
# Tidy the pivoted header:
#   * rename_axis(None, axis=1) removes the leftover 'hspd' name from the columns Index
#     (setting it to '' would keep an empty-string name instead of clearing it);
#   * the float codes 0.0 / 1.0 become descriptive column names.
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
year_wide = (
    year_wide
    .rename_axis(None, axis=1)
    .rename(columns={0.0: 'hspd_no', 1.0: 'hspd_yes'})
)

In [109]:
# Move YEAR out of the index into a regular column.
# The guard makes the cell idempotent: re-running an unconditional reset_index would
# push the default integer index into a stray 'index' column.
if year_wide.index.name == 'YEAR':
    year_wide = year_wide.reset_index()

In [110]:

# Share of weighted persons with high-speed internet per year: yes / (yes + no).
# reset_index(drop=True) leaves the frame on a plain 0..n-1 integer index.
year_wide = (
    year_wide
    .assign(pct=lambda t: t['hspd_yes'] / (t['hspd_yes'] + t['hspd_no']))
    .reset_index(drop=True)
)
year_wide

Unnamed: 0,YEAR,hspd_no,hspd_yes,pct
0,2013,161299,544770,0.771553
1,2014,170809,526480,0.755038
2,2015,144626,548358,0.7913
3,2016,158520,573787,0.783533
4,2017,160849,577340,0.782103
5,2018,184795,555465,0.750365


In [111]:
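# How do I get rid of the "hspd" label that seems to be neither a column nor an index? pandas insists that the index is YEAR, and I can't touch "hspd", which is meaningless.
# (Answer: "hspd" is the name of the columns Index, i.e. year_wide.columns.name; year_wide.rename_axis(None, axis=1) removes it.)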