<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/main/notebooks/Analysis%20NY%20Sampling%20Bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis NY Sampling Bias

**Instructions:**

1. Execute the first code cell.
2. There will be a link to follow in order to authorize the google account for drive. Go to that link.
3. A code to authorize the google account will be generated. Copy the code generated.
4. Go back to the cell where the process of mounting the drive is running. Paste the generated code from step 3 to the text box in the cell and press enter.

In [1]:
# Mount drive with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pathlib import Path

# Set path to data folder
path = Path('drive/MyDrive/big-data-project/data/clean-data')
city = 'ny'

**Spark SQL Documentation:** 
https://spark.apache.org/docs/2.2.0/sql-programming-guide.html

In [3]:
# Install required dependancies
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 73kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 41.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=8f52d84c838167634a584188cb28d245f07f66200f978c46c2db799e8bdac5aa
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1
The 

In [4]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [5]:
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [7]:
# # Demographic Data Chicago
# !ls drive/MyDrive/big-data-project/data/clean-data/ch/ch.csv

# !ls drive/MyDrive/big-data-project/data/clean-data/ch/social/2020/



In [8]:
# Read data into dataframe
df_soc = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'social/2020/*.csv'))
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_time|median_percentage_time_

In [9]:
# View schema
df_soc.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- date_range_end: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- distance_traveled_from_home: string (nullable = true)
 |-- bucketed_distance_traveled: string (nullable = true)
 |-- median_dwell_at_bucketed_distance_traveled: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)
 |-- median_home_dwell_time: string (nullable = true)
 |-- bucketed_home_dwell_time: string (nullable = true)
 |-- at_home_by_each_hour: string (nullable = true)
 |-- part_time_work_behavior_devices: string (nullable = true)
 |-- full_time_work_behavior_devices: string (nullable = true)
 |-- destination_cbgs: string (nullable = true)
 |-- delivery_behavior_devices: string (nullable = true)
 |-- median_non_home_dwell_time: string (nullable = true)
 |-- candidate_device_count: string (nullable = true)
 |-- bucketed_away_from_home_ti

In [10]:
# # Take small sample of data to experiment with
# sm = df.limit(100)
# sm.show()

In [11]:
df_soc.createOrReplaceTempView('T')
df_soc = spark.sql('SELECT cbg, date_range_start, device_count, completely_home_device_count FROM T')
df_soc.show()

+------------+--------------------+------------+----------------------------+
|         cbg|    date_range_start|device_count|completely_home_device_count|
+------------+--------------------+------------+----------------------------+
|360470064002|2020-02-03T00:00:...|          64|                          18|
|360810384001|2020-02-03T00:00:...|          84|                          21|
|360850170103|2020-02-03T00:00:...|         250|                          68|
|360050213021|2020-02-03T00:00:...|          92|                          26|
|360050253004|2020-02-03T00:00:...|          52|                          13|
|360470385003|2020-02-03T00:00:...|          37|                           9|
|360470406002|2020-02-03T00:00:...|          46|                          12|
|360470428003|2020-02-03T00:00:...|          84|                          20|
|360471130002|2020-02-03T00:00:...|          84|                          20|
|360811029001|2020-02-03T00:00:...|         256|                

In [12]:
# View schema
df_soc.printSchema()

root
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)



In [13]:
# Add completely home percentage column
df_soc = spark.sql('SELECT *, CAST(completely_home_device_count AS float)/CAST(device_count AS float)*100.0 AS completely_home_percentage FROM T')
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+--------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_t

In [14]:
df_soc.createOrReplaceTempView('t')
# get mean percentage for each cbg
df_soc = spark.sql(
'''
SELECT cbg, AVG(device_count) AS mean_device_count
FROM t
GROUP BY cbg
'''
)
df_soc.createOrReplaceTempView('t')
df_soc.show()

+------------+------------------+
|         cbg| mean_device_count|
+------------+------------------+
|360610070005|30.530054644808743|
|360050181024|52.166666666666664|
|360810152002| 64.59836065573771|
|360470800001|35.040983606557376|
|360610063005| 10.14968152866242|
|360810278002| 60.04644808743169|
|360470003011|13.891566265060241|
|360050076001|17.205479452054796|
|360810034002| 60.21857923497268|
|360050090001|  9.67032967032967|
|360470828001|44.702185792349724|
|360850169012|101.28142076502732|
|360050157003| 70.66939890710383|
|360810757022|57.513661202185794|
|360810838003| 54.76502732240437|
|360050413007| 41.33879781420765|
|360050393006| 45.15846994535519|
|360810040012| 37.37158469945355|
|360610048004|28.229508196721312|
|360471152001| 28.10928961748634|
+------------+------------------+
only showing top 20 rows



In [15]:
# Read poverty data
# Read data into dataframe
df_pov = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'ny.csv'))
df_pov.createOrReplaceTempView('d')
df_pov.show()

+------------+---------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|
+------------+---------+------------------+------------------+
|360050001000|        0|               0.0|               0.0|
|360050001001|     7503|               0.0|12.435025989604158|
|360050002000|        0|               0.0|               0.0|
|360050002001|     2114|16.130558183538316|35.856196783349105|
|360050002002|     2168|18.911439114391143| 48.06273062730627|
|360050002003|      969| 8.152734778121776|49.742002063983485|
|360050004000|        0|               0.0|               0.0|
|360050004001|      646|32.972136222910216| 44.27244582043344|
|360050004002|     1400|25.571428571428573| 75.35714285714286|
|360050004003|     3393| 1.569905213270142| 33.65753020925435|
|360050004004|      541|15.196998123827393| 20.33271719038817|
|360050016001|     3427|  25.1531952144733|38.400933761307265|
|360050016002|      662| 62.99093655589124| 73.56495468

In [16]:
# join mobility and poverty
result = spark.sql('SELECT d.*, t.mean_device_count FROM t INNER JOIN d ON t.cbg = d.cbg')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly| mean_device_count|
+------------+---------+------------------+------------------+------------------+
|360610070005|     1771|17.554125219426563| 66.00790513833992|30.530054644808743|
|360050181024|     1457|14.070006863417984| 6.314344543582704|52.166666666666664|
|360810152002|     1386| 16.81096681096681|26.767676767676768| 64.59836065573771|
|360470800001|      812|27.832512315270936| 38.42364532019704|35.040983606557376|
|360610063005|      884| 4.046242774566474| 85.85972850678732| 10.14968152866242|
|360810278002|     1796|32.962138084632514| 4.844097995545657| 60.04644808743169|
|360470003011|      661| 9.937888198757763| 77.60968229954615|13.891566265060241|
|360050076001|      631|27.403846153846157| 9.667194928684628|17.205479452054796|
|360810034002|     1604|           14.5625|28.179551122194514| 60.21857923497268|
|360050090001|  

In [17]:
# Count number of rows
spark.sql('SELECT COUNT(cbg) FROM t').show()

+----------+
|count(cbg)|
+----------+
|      6369|
+----------+



In [18]:
result = spark.sql('SELECT *, mean_device_count / pop_total * 100 AS sample_size_perc FROM t')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly| mean_device_count|  sample_size_perc|
+------------+---------+------------------+------------------+------------------+------------------+
|360610070005|     1771|17.554125219426563| 66.00790513833992|30.530054644808743|1.7238878963754232|
|360050181024|     1457|14.070006863417984| 6.314344543582704|52.166666666666664| 3.580416380690917|
|360810152002|     1386| 16.81096681096681|26.767676767676768| 64.59836065573771| 4.660776382087858|
|360470800001|      812|27.832512315270936| 38.42364532019704|35.040983606557376| 4.315392069773076|
|360610063005|      884| 4.046242774566474| 85.85972850678732| 10.14968152866242|1.1481540190794592|
|360810278002|     1796|32.962138084632514| 4.844097995545657| 60.04644808743169|3.3433434347122324|
|360470003011|      661| 9.937888198757763| 77.60968229954615|13.891566265060241| 2.1015985

In [19]:
# sort by poverty
result = spark.sql('SELECT * FROM t ORDER BY CAST(sample_size_perc AS float) DESC')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly| mean_device_count|  sample_size_perc|
+------------+---------+------------------+------------------+------------------+------------------+
|360610143001|        5|              75.0|              40.0| 550.4562841530054|11009.125683060109|
|360470852001|        8|               0.0|               0.0| 158.9863387978142|1987.3292349726776|
|360810219001|        4|               0.0|             100.0|27.778688524590162|  694.467213114754|
|360610217031|        8|               0.0|             100.0| 41.42896174863388| 517.8620218579235|
|360050093001|       24|               0.0|             100.0| 54.86065573770492| 228.5860655737705|
|360610212002|       46|               0.0| 56.52173913043478|103.48087431693989| 224.9584224281302|
|360050435001|       62|12.903225806451612|               0.0|120.80054644808743| 194.83959

In [20]:
# group by poverty range


In [57]:
result_df = result.toPandas()
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_device_count,sample_size_perc
0,360610143001,5,75.0,40.0,550.456284,11009.125683
1,360470852001,8,0.0,0.0,158.986339,1987.329235
2,360810219001,4,0.0,100.0,27.778689,694.467213
3,360610217031,8,0.0,100.0,41.428962,517.862022
4,360050093001,24,0.0,100.0,54.860656,228.586066
...,...,...,...,...,...,...
6364,360810383011,0,0.0,0.0,7.176471,
6365,360810916021,0,0.0,0.0,8.080000,
6366,360610008000,0,0.0,0.0,20.951673,
6367,360610086010,0,0.0,0.0,268.445355,


In [58]:
result_df = result_df[result_df['pop_total'] != '0']
result_df['sample_size_perc'] = result_df['sample_size_perc'].clip(0,100)
result_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_device_count,sample_size_perc
0,360610143001,5,75.0,40.0,550.456284,100.000000
1,360470852001,8,0.0,0.0,158.986339,100.000000
2,360810219001,4,0.0,100.0,27.778689,100.000000
3,360610217031,8,0.0,100.0,41.428962,100.000000
4,360050093001,24,0.0,100.0,54.860656,100.000000
...,...,...,...,...,...,...
6219,360470533004,1856,47.4676724137931,98.22198275862068,8.000000,0.431034
6220,360470531002,2702,67.24648408586232,94.15247964470763,11.269122,0.417066
6221,360470509002,2911,57.88388869804191,97.59532806595672,7.829787,0.268972
6222,360470531001,5128,46.08034321372855,98.84945397815913,10.984064,0.214198


In [59]:
# plot bar graph
alt.Chart(result_df).mark_point(opacity=0.3).encode(
    alt.X('poverty_percentage:Q'),
    alt.Y('sample_size_perc:Q',scale=alt.Scale(domain=(0,50))),
    tooltip=['cbg','poverty_percentage','sample_size_perc']
).properties(width=300, height=300).interactive()

In [60]:
temp = result_df.groupby(pd.cut(result_df['poverty_percentage'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)
# temp['sample_size_perc'] = temp['sample_size_perc'].clip(0,100)
temp['color'] = ['#45a0d1' for x in temp['sample_size_perc']]
temp

Unnamed: 0,poverty_percentage,mean_device_count,sample_size_perc,color
0,"(0, 20]",54.19438,4.270487,#45a0d1
1,"(20, 40]",54.907013,4.252324,#45a0d1
2,"(40, 60]",54.536252,4.184212,#45a0d1
3,"(60, 80]",47.36821,4.458847,#45a0d1
4,"(80, 100]",47.221166,3.498978,#45a0d1


In [62]:
# 
alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('poverty_percentage:N'),
    alt.Y('sample_size_perc'),
    # color = alt.Color('color',scale=None),
    tooltip=[alt.Tooltip('poverty_percentage'),
             alt.Tooltip('sample_size_perc')]
).properties(width=300, height=300).interactive()

In [63]:
result_df['poverty_percentage'].astype(float).dtype

dtype('float64')

In [64]:
mean_sample_size = result_df['sample_size_perc'].mean()
mean_sample_size

4.484548226095217

In [65]:
result_df['sample_size_perc_deviation'] = result_df['sample_size_perc'] - mean_sample_size

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [66]:
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_device_count,sample_size_perc,sample_size_perc_deviation
0,360610143001,5,75.0,40.0,550.456284,100.000000,95.515452
1,360470852001,8,0.0,0.0,158.986339,100.000000,95.515452
2,360810219001,4,0.0,100.0,27.778689,100.000000,95.515452
3,360610217031,8,0.0,100.0,41.428962,100.000000,95.515452
4,360050093001,24,0.0,100.0,54.860656,100.000000,95.515452
...,...,...,...,...,...,...,...
6219,360470533004,1856,47.4676724137931,98.22198275862068,8.000000,0.431034,-4.053514
6220,360470531002,2702,67.24648408586232,94.15247964470763,11.269122,0.417066,-4.067482
6221,360470509002,2911,57.88388869804191,97.59532806595672,7.829787,0.268972,-4.215576
6222,360470531001,5128,46.08034321372855,98.84945397815913,10.984064,0.214198,-4.270350


In [67]:
temp = result_df.groupby(pd.cut(result_df['poverty_percentage'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)
# # Clamp values to 100%
# temp['sample_size_perc'] = temp['sample_size_perc'].clip(0,100)
temp['color'] = ['#4e79a7' if x > 0 else '#e15759'  for x in temp['sample_size_perc_deviation']]
temp

Unnamed: 0,poverty_percentage,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",54.19438,4.270487,-0.214062,#e15759
1,"(20, 40]",54.907013,4.252324,-0.232224,#e15759
2,"(40, 60]",54.536252,4.184212,-0.300337,#e15759
3,"(60, 80]",47.36821,4.458847,-0.025702,#e15759
4,"(80, 100]",47.221166,3.498978,-0.98557,#e15759


In [68]:
# 
title = '{} {}'.format(city,year)

chart = alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('poverty_percentage:N'),
    alt.Y('sample_size_perc_deviation', scale=alt.Scale(domain=(-100,100))),
    color = alt.Color('color', scale=None),
    tooltip=[alt.Tooltip('poverty_percentage'),
             alt.Tooltip('sample_size_perc_deviation')]
).properties(width=300, height=300).interactive()

line = alt.Chart(pd.DataFrame({'percentage_completely_home_deviation':[0]})).mark_rule(color='#757575',strokeDash=[5,3], size=2).encode(
    y='percentage_completely_home_deviation:Q',
    # color=alt.value("#757575"),
    size=alt.value(2),
)

( chart + line).interactive()

In [45]:
["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]

['#4e79a7',
 '#f28e2c',
 '#e15759',
 '#76b7b2',
 '#59a14f',
 '#edc949',
 '#af7aa1',
 '#ff9da7',
 '#9c755f',
 '#bab0ab']

In [46]:
# plot bar graph
alt.Chart(result_df).mark_point(opacity=0.3).encode(
    alt.X('perc_whiteonly:Q'),
    alt.Y('sample_size_perc:Q',scale=alt.Scale(domain=(0,10))),
    tooltip=['cbg','perc_whiteonly','sample_size_perc']
).properties(width=300, height=300).interactive()

In [47]:
temp = result_df.groupby(pd.cut(result_df['perc_whiteonly'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['perc_whiteonly'] = temp['perc_whiteonly'].astype(str)
temp
temp['color'] = ['#4e79a7' if x > 0 else '#e15759'  for x in temp['sample_size_perc_deviation']]
temp

Unnamed: 0,perc_whiteonly,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",56.353616,4.275696,-2.529206,#e15759
1,"(20, 40]",56.395515,14.369547,7.564645,#4e79a7
2,"(40, 60]",54.102507,4.40521,-2.399691,#e15759
3,"(60, 80]",51.145579,4.283898,-2.521004,#e15759
4,"(80, 100]",50.874681,6.01849,-0.786411,#e15759


In [48]:
# 
title = '{} {}'.format(city,year)

chart = alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('perc_whiteonly:N'),
    alt.Y('sample_size_perc_deviation', scale=alt.Scale(domain=(-8,8))),
    color = alt.Color('color', scale=None),
    tooltip=[alt.Tooltip('perc_whiteonly'),
             alt.Tooltip('sample_size_perc_deviation')]
).properties(width=300, height=300).interactive()

line = alt.Chart(pd.DataFrame({'percentage_completely_home_deviation':[0]})).mark_rule(color='#757575',strokeDash=[5,3], size=2).encode(
    y='percentage_completely_home_deviation:Q',
    # color=alt.value("#757575"),
    size=alt.value(2),
)

( chart + line).interactive()

In [49]:
# 
alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('perc_whiteonly:N'),
    alt.Y('sample_size_perc'),
    # color = alt.Color('color',scale=None),
    tooltip=[alt.Tooltip('perc_whiteonly'),
             alt.Tooltip('sample_size_perc')]
).properties(width=300, height=300).interactive()