## Internet Access in Louisville

In [57]:
import pandas as pd
import numpy as np

In [58]:
df = pd.read_csv("jeff_co_int.csv")

In [59]:
list(df)

['YEAR',
 'SAMPLE',
 'SERIAL',
 'CBSERIAL',
 'HHWT',
 'CLUSTER',
 'STATEFIP',
 'METRO',
 'MET2013',
 'PUMA',
 'STRATA',
 'GQ',
 'CINETHH',
 'CILAPTOP',
 'CISMRTPHN',
 'CITABLET',
 'CIHISPEED',
 'CIDIAL',
 'PERNUM',
 'PERWT',
 'SEX',
 'AGE',
 'RACE',
 'RACED',
 'HISPAN',
 'HISPAND',
 'EDUC',
 'EDUCD',
 'POVERTY',
 'city',
 'int_acc',
 'hspd_int',
 'computer',
 'tablet',
 'comp_tab',
 'hspd_dev']

There is no case_when in pandas. So it's either a nested else if, or just create a new column and fill in the values. I find filling in the values easier to read than a nested else if

In [60]:
# Start by creating a new column and filling it with NA values
df['hspd'] = np.nan

# Any time high speed internet is between 10 and 17 in the Census codebook it means they do have internet
df['hspd'].loc[((df['CIHISPEED'] >= 10) & (df['CIHISPEED'] <= 17))] = 1

# If they don't have internet at all and are NA for the high speed internet question then we want them marked as not having high speed internet, not as an NA value
df['hspd'].loc[((df['CINETHH'] == 3) & (df['CIHISPEED'] == 0))] = 0

# Finally if they say they don't have high speed internet we want them marked as not having high speed internet
df['hspd'].loc[df['CIHISPEED'] == 20] = 0


Now we need to use census weights to appropriately estimate the percentage of people with high speed internet in their households. 

In [61]:
# df18 = df.query('YEAR == 2018')
df18 = df[df.YEAR == 2018]

In [62]:
per_int = df18['PERWT'].loc[df18['hspd'] == 1].sum()
per_no_int = df18['PERWT'].loc[df18['hspd'] == 0].sum()
per_total = df18['PERWT'].sum()
per_int_percent = per_int/per_total

In [63]:
per_int_percent

0.7204147671636177

In [64]:
per_total

771035

In [69]:
year_totals = df.groupby(by = ['hspd', 'YEAR'])['PERWT'].sum()
year_totals

hspd  YEAR
0.0   2013    161299
      2014    170809
      2015    144626
      2016    158520
      2017    160849
      2018    184795
1.0   2013    544770
      2014    526480
      2015    548358
      2016    573787
      2017    577340
      2018    555465
Name: PERWT, dtype: int64

Pivoting the dataframe is probably the easiest way to sum and get percentages now

In [66]:
year_percent = year_totals.groupby(by = 'YEAR').sum()

hspd  YEAR
0.0   2013    16.444014
      2014    17.413533
      2015    14.744245
      2016    16.160702
      2017    16.398137
      2018    18.839370
1.0   2013    16.378149
      2014    15.828273
      2015    16.486020
      2016    17.250526
      2017    17.357345
      2018    16.699687
Name: PERWT, dtype: float64

In [67]:
df['hspd'].describe()


count    46480.000000
mean         0.808498
std          0.393487
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: hspd, dtype: float64

In [68]:
df['hspd_int'].describe()

count    46480.000000
mean         0.808498
std          0.393487
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: hspd_int, dtype: float64