<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/main/notebooks/Analysis%20CH%20Sampling%20Bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis CH Sampling Bias

**Instructions:**

1. Execute the first code cell.
2. There will be a link to follow in order to authorize the google account for drive. Go to that link.
3. A code to authorize the google account will be generated. Copy the code generated.
4. Go back to the cell where the process of mounting the drive is running. Paste the generated code from step 3 to the text box in the cell and press enter.

In [1]:
# Mount drive with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pathlib import Path

# Set path to data folder
path = Path('drive/MyDrive/big-data-project/data/clean-data')
city = 'ch'

**Spark SQL Documentation:** 
https://spark.apache.org/docs/2.2.0/sql-programming-guide.html

In [3]:
# Install required dependancies
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 67kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 40.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=f7b03546d291ae22c04178567211948035f259521e60b1ad8a8311c43fb938b6
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1
The 

In [4]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [5]:
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [7]:
# # Demographic Data Chicago
# !ls drive/MyDrive/big-data-project/data/clean-data/ch/ch.csv

# !ls drive/MyDrive/big-data-project/data/clean-data/ch/social/2020/



In [8]:
# Read data into dataframe
df_soc = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'social/2020/*.csv'))
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_time|median_percentage_time_

In [9]:
# View schema
df_soc.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- date_range_end: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- distance_traveled_from_home: string (nullable = true)
 |-- bucketed_distance_traveled: string (nullable = true)
 |-- median_dwell_at_bucketed_distance_traveled: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)
 |-- median_home_dwell_time: string (nullable = true)
 |-- bucketed_home_dwell_time: string (nullable = true)
 |-- at_home_by_each_hour: string (nullable = true)
 |-- part_time_work_behavior_devices: string (nullable = true)
 |-- full_time_work_behavior_devices: string (nullable = true)
 |-- destination_cbgs: string (nullable = true)
 |-- delivery_behavior_devices: string (nullable = true)
 |-- median_non_home_dwell_time: string (nullable = true)
 |-- candidate_device_count: string (nullable = true)
 |-- bucketed_away_from_home_ti

In [10]:
# # Take small sample of data to experiment with
# sm = df.limit(100)
# sm.show()

In [11]:
df_soc.createOrReplaceTempView('T')
df_soc = spark.sql('SELECT cbg, date_range_start, device_count, completely_home_device_count FROM T')
df_soc.show()

+------------+--------------------+------------+----------------------------+
|         cbg|    date_range_start|device_count|completely_home_device_count|
+------------+--------------------+------------+----------------------------+
|170312304002|2020-02-07T00:00:...|          76|                          22|
|170313005002|2020-02-07T00:00:...|          72|                          27|
|170314208001|2020-02-07T00:00:...|         127|                          33|
|170317608034|2020-02-07T00:00:...|         114|                          37|
|170318033003|2020-02-07T00:00:...|          91|                          29|
|170318146005|2020-02-07T00:00:...|          51|                          11|
|170318182002|2020-02-07T00:00:...|          51|                          19|
|170318240053|2020-02-07T00:00:...|         123|                          21|
|170318348001|2020-02-07T00:00:...|          32|                          12|
|170438411042|2020-02-07T00:00:...|          57|                

In [12]:
# View schema
df_soc.printSchema()

root
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)



In [13]:
# Add completely home percentage column
df_soc = spark.sql('SELECT *, CAST(completely_home_device_count AS float)/CAST(device_count AS float)*100.0 AS completely_home_percentage FROM T')
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+--------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_t

In [14]:
df_soc.createOrReplaceTempView('t')
# get mean percentage for each cbg
df_soc = spark.sql(
'''
SELECT cbg, AVG(device_count) AS mean_device_count
FROM t
GROUP BY cbg
'''
)
df_soc.createOrReplaceTempView('t')
df_soc.show()

+------------+------------------+
|         cbg| mean_device_count|
+------------+------------------+
|170318279025| 57.51639344262295|
|170314202001| 77.61748633879782|
|170314401021| 48.34426229508197|
|170310103003|52.549180327868854|
|170310608001| 45.95628415300546|
|171978803141|201.23497267759564|
|170314302003| 44.33879781420765|
|170438462012|  64.6584699453552|
|170978614041|  73.0846994535519|
|170318042012|190.69398907103826|
|180890302002| 33.32513661202186|
|170312504001|21.885245901639344|
|170313012003| 51.72677595628415|
|170318256001| 59.87978142076503|
|170313012002|20.224043715846996|
|170318433001| 64.30327868852459|
|170978610123| 55.38251366120219|
|170315203004|45.076502732240435|
|171118708071| 120.5327868852459|
|170318049014|36.177595628415304|
+------------+------------------+
only showing top 20 rows



In [15]:
# Read poverty data
# Read data into dataframe
df_pov = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'ch.csv'))
df_pov.createOrReplaceTempView('d')
df_pov.show()

+------------+---------+------------------+-------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|     perc_whiteonly|    perc_blackonly|    perc_asianonly|
+------------+---------+------------------+-------------------+------------------+------------------+
|170312909001|      935|  55.8288770053476|                0.0|             100.0|               0.0|
|170312909002|     1129|45.261293179805136|                0.0| 99.02568644818423|               0.0|
|170312909003|      765|  39.7078353253652|                0.0|             100.0|               0.0|
|170312909004|     1075|30.325581395348838|                0.0| 98.79069767441861|1.2093023255813953|
|170318387002|     1016|31.003937007874015| 13.484251968503939| 86.51574803149606|               0.0|
|170313102001|      835|16.766467065868262|  78.20359281437126| 2.155688622754491| 5.868263473053893|
|170313102002|      686| 16.61807580174927| 50.583090379008745|  8.16326530612245|

In [16]:
# join mobility and poverty
result = spark.sql('SELECT d.*, t.mean_device_count FROM t INNER JOIN d ON t.cbg = d.cbg')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|    perc_blackonly|    perc_asianonly| mean_device_count|
+------------+---------+------------------+------------------+------------------+------------------+------------------+
|170318279025|     1127|20.496894409937887| 58.47382431233363| 39.39662821650399|               0.0| 57.51639344262295|
|170314202001|     1720| 32.93571901921803|21.627906976744185| 69.82558139534883| 4.534883720930233| 77.61748633879782|
|170314401021|     1165| 35.79399141630901|12.188841201716738| 87.81115879828326|               0.0| 48.34426229508197|
|170310103003|     1291| 40.63745019920319| 64.52362509682416|30.286599535243997|3.4082106893880715|52.549180327868854|
|170310608001|     1770| 5.367231638418079| 79.03954802259886| 5.649717514124294|11.016949152542372| 45.95628415300546|
|171978803141|     1780|2.86516853932584

In [17]:
# Count number of rows
spark.sql('SELECT COUNT(cbg) FROM t').show()

+----------+
|count(cbg)|
+----------+
|      6664|
+----------+



In [18]:
result = spark.sql('SELECT *, mean_device_count / pop_total * 100 AS sample_size_perc FROM t')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|    perc_blackonly|    perc_asianonly| mean_device_count|  sample_size_perc|
+------------+---------+------------------+------------------+------------------+------------------+------------------+------------------+
|170318279025|     1127|20.496894409937887| 58.47382431233363| 39.39662821650399|               0.0| 57.51639344262295|  5.10349542525492|
|170314202001|     1720| 32.93571901921803|21.627906976744185| 69.82558139534883| 4.534883720930233| 77.61748633879782| 4.512644554581269|
|170314401021|     1165| 35.79399141630901|12.188841201716738| 87.81115879828326|               0.0| 48.34426229508197| 4.149722085414761|
|170310103003|     1291| 40.63745019920319| 64.52362509682416|30.286599535243997|3.4082106893880715|52.549180327868854|4.0704245025459995|
|170310608001|     1770| 5.

In [19]:
# sort by poverty
result = spark.sql('SELECT * FROM t ORDER BY CAST(sample_size_perc AS float) DESC')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|    perc_blackonly|    perc_asianonly| mean_device_count|  sample_size_perc|
+------------+---------+------------------+------------------+------------------+------------------+------------------+------------------+
|170938901021|      476|               0.0|             100.0|               0.0|               0.0| 499.1120218579235|104.85546677687468|
|170318262011|      601|17.970049916805326|23.793677204658902|  76.2063227953411|               0.0|             363.5| 60.48252911813644|
|180890430021|     1074| 9.217877094972067| 92.83054003724395|               0.0|               0.0| 557.9918032786885|51.954544066916995|
|170898525002|      821| 9.866017052375152| 84.53105968331303|4.6285018270401945|               0.0| 387.0081967213115| 47.13863541063477|
|170318410001|      830| 20

In [20]:
# group by poverty range


In [21]:
result_df = result.toPandas()
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,perc_blackonly,perc_asianonly,mean_device_count,sample_size_perc
0,170938901021,476,0.0,100.0,0.0,0.0,499.112022,104.855467
1,170318262011,601,17.970049916805326,23.793677204658902,76.2063227953411,0.0,363.500000,60.482529
2,180890430021,1074,9.217877094972067,92.83054003724395,0.0,0.0,557.991803,51.954544
3,170898525002,821,9.866017052375152,84.53105968331303,4.6285018270401945,0.0,387.008197,47.138635
4,170318410001,830,20.96385542168675,22.53012048192771,42.168674698795186,32.89156626506024,377.448087,45.475673
...,...,...,...,...,...,...,...,...
6659,170310619024,0,0.0,0.0,0.0,0.0,8.218341,
6660,550599900000,0,0.0,0.0,0.0,0.0,10.722892,
6661,170310609002,0,0.0,0.0,0.0,0.0,10.678363,
6662,170978630061,0,0.0,0.0,0.0,0.0,15.235294,


In [22]:
result_df = result_df[result_df['pop_total'] != '0']
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,perc_blackonly,perc_asianonly,mean_device_count,sample_size_perc
0,170938901021,476,0.0,100.0,0.0,0.0,499.112022,104.855467
1,170318262011,601,17.970049916805326,23.793677204658902,76.2063227953411,0.0,363.500000,60.482529
2,180890430021,1074,9.217877094972067,92.83054003724395,0.0,0.0,557.991803,51.954544
3,170898525002,821,9.866017052375152,84.53105968331303,4.6285018270401945,0.0,387.008197,47.138635
4,170318410001,830,20.96385542168675,22.53012048192771,42.168674698795186,32.89156626506024,377.448087,45.475673
...,...,...,...,...,...,...,...,...
6647,170313201001,2935,3.953147877013177,77.27427597955707,5.349233390119251,14.991482112436117,34.549180,1.177144
6648,170313011002,1943,48.98911353032659,41.1219763252702,4.5290787442099845,0.0,22.147541,1.139863
6649,170310811002,1295,33.513513513513516,48.18532818532819,30.73359073359073,6.872586872586872,13.117978,1.012971
6650,170318435001,9364,22.041984732824428,28.748398120461342,66.34985049124306,0.373771892353695,49.811475,0.531947


In [43]:
# plot bar graph
alt.Chart(result_df).mark_point(opacity=0.3).encode(
    alt.X('poverty_percentage:Q'),
    alt.Y('sample_size_perc:Q',scale=alt.Scale(domain=(0,50))),
    tooltip=['cbg','poverty_percentage','sample_size_perc']
).properties(width=300, height=300).interactive()

In [44]:
temp = result_df.groupby(pd.cut(result_df['poverty_percentage'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)
temp
temp['color'] = ['#45a0d1' for x in temp['sample_size_perc']]
temp

Unnamed: 0,poverty_percentage,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",89.711444,5.804827,0.265395,#45a0d1
1,"(20, 40]",59.653167,4.827411,-0.712022,#45a0d1
2,"(40, 60]",50.878414,4.677717,-0.861715,#45a0d1
3,"(60, 80]",43.384592,4.174526,-1.364906,#45a0d1
4,"(80, 100]",45.16272,5.062847,-0.476585,#45a0d1


In [45]:
# 
alt.Chart(temp).mark_bar(size=40).encode(
    alt.X('poverty_percentage:N'),
    alt.Y('sample_size_perc'),
    # color = alt.Color('color',scale=None),
    tooltip=[alt.Tooltip('poverty_percentage'),
             alt.Tooltip('sample_size_perc')]
).properties(width=300, height=300).interactive()

In [26]:
result_df['poverty_percentage'].astype(float).dtype

dtype('float64')

In [27]:
mean_sample_size = result_df['sample_size_perc'].mean()
mean_sample_size

5.539432635282737

In [28]:
result_df['sample_size_perc_deviation'] = result_df['sample_size_perc'] - mean_sample_size

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,perc_blackonly,perc_asianonly,mean_device_count,sample_size_perc,sample_size_perc_deviation
0,170938901021,476,0.0,100.0,0.0,0.0,499.112022,104.855467,99.316034
1,170318262011,601,17.970049916805326,23.793677204658902,76.2063227953411,0.0,363.500000,60.482529,54.943096
2,180890430021,1074,9.217877094972067,92.83054003724395,0.0,0.0,557.991803,51.954544,46.415111
3,170898525002,821,9.866017052375152,84.53105968331303,4.6285018270401945,0.0,387.008197,47.138635,41.599203
4,170318410001,830,20.96385542168675,22.53012048192771,42.168674698795186,32.89156626506024,377.448087,45.475673,39.936241
...,...,...,...,...,...,...,...,...,...
6647,170313201001,2935,3.953147877013177,77.27427597955707,5.349233390119251,14.991482112436117,34.549180,1.177144,-4.362289
6648,170313011002,1943,48.98911353032659,41.1219763252702,4.5290787442099845,0.0,22.147541,1.139863,-4.399569
6649,170310811002,1295,33.513513513513516,48.18532818532819,30.73359073359073,6.872586872586872,13.117978,1.012971,-4.526461
6650,170318435001,9364,22.041984732824428,28.748398120461342,66.34985049124306,0.373771892353695,49.811475,0.531947,-5.007486


In [30]:
temp = result_df.groupby(pd.cut(result_df['poverty_percentage'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)
temp
temp['color'] = ['#4e79a7' if x > 0 else '#e15759'  for x in temp['sample_size_perc_deviation']]
temp

Unnamed: 0,poverty_percentage,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",89.711444,5.804827,0.265395,#4e79a7
1,"(20, 40]",59.653167,4.827411,-0.712022,#e15759
2,"(40, 60]",50.878414,4.677717,-0.861715,#e15759
3,"(60, 80]",43.384592,4.174526,-1.364906,#e15759
4,"(80, 100]",45.16272,5.062847,-0.476585,#e15759


In [31]:
# 
title = '{} {}'.format(city,year)

chart = alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('poverty_percentage:N'),
    alt.Y('sample_size_perc_deviation', scale=alt.Scale(domain=(-1.4,1.4))),
    color = alt.Color('color', scale=None),
    tooltip=[alt.Tooltip('poverty_percentage'),
             alt.Tooltip('sample_size_perc_deviation')]
).properties(width=300, height=300).interactive()

line = alt.Chart(pd.DataFrame({'percentage_completely_home_deviation':[0]})).mark_rule(color='#757575',strokeDash=[5,3], size=2).encode(
    y='percentage_completely_home_deviation:Q',
    # color=alt.value("#757575"),
    size=alt.value(2),
)

( chart + line).interactive()

In [32]:
["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]

['#4e79a7',
 '#f28e2c',
 '#e15759',
 '#76b7b2',
 '#59a14f',
 '#edc949',
 '#af7aa1',
 '#ff9da7',
 '#9c755f',
 '#bab0ab']

In [33]:
# plot bar graph
alt.Chart(result_df).mark_point(opacity=0.3).encode(
    alt.X('perc_whiteonly:Q'),
    alt.Y('sample_size_perc:Q',scale=alt.Scale(domain=(0,10))),
    tooltip=['cbg','perc_whiteonly','sample_size_perc']
).properties(width=300, height=300).interactive()

In [34]:
temp = result_df.groupby(pd.cut(result_df['perc_whiteonly'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['perc_whiteonly'] = temp['perc_whiteonly'].astype(str)
temp
temp['color'] = ['#4e79a7' if x > 0 else '#e15759'  for x in temp['sample_size_perc_deviation']]
temp

Unnamed: 0,perc_whiteonly,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",53.642361,4.8479,-0.691532,#e15759
1,"(20, 40]",63.765879,4.827277,-0.712156,#e15759
2,"(40, 60]",68.183734,4.654918,-0.884515,#e15759
3,"(60, 80]",84.48225,5.365753,-0.17368,#e15759
4,"(80, 100]",94.455924,6.255364,0.715932,#4e79a7


In [35]:
# 
title = '{} {}'.format(city,year)

chart = alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('perc_whiteonly:N'),
    alt.Y('sample_size_perc_deviation', scale=alt.Scale(domain=(-1.4,1.4))),
    color = alt.Color('color', scale=None),
    tooltip=[alt.Tooltip('perc_whiteonly'),
             alt.Tooltip('sample_size_perc_deviation')]
).properties(width=300, height=300).interactive()

line = alt.Chart(pd.DataFrame({'percentage_completely_home_deviation':[0]})).mark_rule(color='#757575',strokeDash=[5,3], size=2).encode(
    y='percentage_completely_home_deviation:Q',
    # color=alt.value("#757575"),
    size=alt.value(2),
)

( chart + line).interactive()

In [42]:
# 
alt.Chart(temp).mark_bar(size=40).encode(
    alt.X('perc_whiteonly:N'),
    alt.Y('sample_size_perc'),
    # color = alt.Color('color',scale=None),
    tooltip=[alt.Tooltip('perc_whiteonly'),
             alt.Tooltip('sample_size_perc')]
).properties(width=300, height=300).interactive()