<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/main/notebooks/Analysis%20LA%20Sampling%20Bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis LA Sampling Bias

**Instructions:**

1. Execute the first code cell.
2. There will be a link to follow in order to authorize the google account for drive. Go to that link.
3. A code to authorize the google account will be generated. Copy the code generated.
4. Go back to the cell where the process of mounting the drive is running. Paste the generated code from step 3 to the text box in the cell and press enter.

In [1]:
# Mount drive with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pathlib import Path

# Set path to data folder
path = Path('drive/MyDrive/big-data-project/data/clean-data')
city = 'la'

**Spark SQL Documentation:** 
https://spark.apache.org/docs/2.2.0/sql-programming-guide.html

In [3]:
# Install required dependancies
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 69kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 42.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=abeab1c241c0064b5e85e416db0bdce46b979752b42eda85654c775984b5f817
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1
The 

In [4]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [5]:
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [7]:
# # Demographic Data Chicago
# !ls drive/MyDrive/big-data-project/data/clean-data/ch/ch.csv

# !ls drive/MyDrive/big-data-project/data/clean-data/ch/social/2020/



In [8]:
# Read data into dataframe
df_soc = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'social/2020/*.csv'))
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_time|median_percentage_time_

In [9]:
# View schema
df_soc.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- date_range_end: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- distance_traveled_from_home: string (nullable = true)
 |-- bucketed_distance_traveled: string (nullable = true)
 |-- median_dwell_at_bucketed_distance_traveled: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)
 |-- median_home_dwell_time: string (nullable = true)
 |-- bucketed_home_dwell_time: string (nullable = true)
 |-- at_home_by_each_hour: string (nullable = true)
 |-- part_time_work_behavior_devices: string (nullable = true)
 |-- full_time_work_behavior_devices: string (nullable = true)
 |-- destination_cbgs: string (nullable = true)
 |-- delivery_behavior_devices: string (nullable = true)
 |-- median_non_home_dwell_time: string (nullable = true)
 |-- candidate_device_count: string (nullable = true)
 |-- bucketed_away_from_home_ti

In [10]:
# # Take small sample of data to experiment with
# sm = df.limit(100)
# sm.show()

In [11]:
df_soc.createOrReplaceTempView('T')
df_soc = spark.sql('SELECT cbg, date_range_start, device_count, completely_home_device_count FROM T')
df_soc.show()

+------------+--------------------+------------+----------------------------+
|         cbg|    date_range_start|device_count|completely_home_device_count|
+------------+--------------------+------------+----------------------------+
|060371831013|2020-02-07T00:00:...|          45|                           6|
|060374034022|2020-02-07T00:00:...|         155|                          24|
|060375326041|2020-02-07T00:00:...|          31|                           4|
|060375544031|2020-02-07T00:00:...|         130|                          27|
|060375718003|2020-02-07T00:00:...|          35|                          10|
|060590018024|2020-02-07T00:00:...|          38|                          13|
|060590320463|2020-02-07T00:00:...|         184|                          24|
|060590626281|2020-02-07T00:00:...|         108|                          32|
|060371374011|2020-02-07T00:00:...|          72|                          30|
|060372398012|2020-02-07T00:00:...|          41|                

In [12]:
# View schema
df_soc.printSchema()

root
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)



In [13]:
# Add completely home percentage column
df_soc = spark.sql('SELECT *, CAST(completely_home_device_count AS float)/CAST(device_count AS float)*100.0 AS completely_home_percentage FROM T')
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+--------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_t

In [14]:
df_soc.createOrReplaceTempView('t')
# get mean percentage for each cbg
df_soc = spark.sql(
'''
SELECT cbg, AVG(device_count) AS mean_device_count
FROM t
GROUP BY cbg
'''
)
df_soc.createOrReplaceTempView('t')
df_soc.show()

+------------+------------------+
|         cbg| mean_device_count|
+------------+------------------+
|060371042031| 75.31147540983606|
|060374034051|62.959016393442624|
|060590758091| 60.44535519125683|
|060372123031|  87.5464480874317|
|060590744051|49.297814207650276|
|060371958032|16.174863387978142|
|060372145021|43.756830601092894|
|060590525061| 51.59289617486339|
|060376021032| 60.23224043715847|
|060372122022|28.404371584699454|
|060375743006| 45.86612021857923|
|060372948101| 34.67486338797814|
|060372198001| 23.81967213114754|
|060371244001|31.904371584699454|
|060375720012|32.032786885245905|
|060375701002| 68.69398907103825|
|060375401014| 64.40163934426229|
|060375540023| 31.15846994535519|
|060373107021|  71.2431693989071|
|060371064071| 57.45628415300546|
+------------+------------------+
only showing top 20 rows



In [15]:
# Read poverty data
# Read data into dataframe
df_pov = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'la.csv'))
df_pov.createOrReplaceTempView('d')
df_pov.show()

+------------+---------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|
+------------+---------+------------------+------------------+
|060373104003|     1286|17.884914463452567| 69.51788491446345|
|060590865023|     2545|22.789783889980352| 85.34381139489194|
|060376510024|     1285|               0.0|42.490272373540854|
|060376511011|     2708| 7.745398773006134| 50.70162481536189|
|060590994023|      672|12.969283276450511| 38.54166666666667|
|060590995094|      342| 17.83625730994152| 77.48538011695906|
|060376511012|     2483|2.6178010471204187|50.302053966975436|
|060376512011|      859|2.8846153846153846| 82.42142025611176|
|060376512012|     1179| 1.441899915182358|  74.1306191687871|
|060376512014|      989|2.4266936299292214| 70.57633973710819|
|060376512211|     3373|  9.95684340320592| 40.61666172546695|
|060590995092|      938| 5.756929637526652|  84.9680170575693|
|060590995095|      576|14.583333333333334|           7

In [16]:
# join mobility and poverty
result = spark.sql('SELECT d.*, t.mean_device_count FROM t INNER JOIN d ON t.cbg = d.cbg')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly| mean_device_count|
+------------+---------+------------------+------------------+------------------+
|060371042031|     2577|  24.3892828999212| 57.00426852929763| 75.31147540983606|
|060374034051|     1696| 1.474056603773585| 23.99764150943396|62.959016393442624|
|060590758091|     1025| 7.512195121951219| 66.63414634146342| 60.44535519125683|
|060372123031|     3420| 37.28963684676705| 19.12280701754386|  87.5464480874317|
|060590744051|     1752| 29.54944411936805| 73.63013698630137|49.297814207650276|
|060371958032|     1196| 28.26086956521739| 70.81939799331104|16.174863387978142|
|060372145021|     2135|27.775175644028106| 54.05152224824356|43.756830601092894|
|060590525061|      881| 8.286038592508513| 79.00113507377979| 51.59289617486339|
|060376021032|     2049|24.383983572895275| 41.19082479258175| 60.23224043715847|
|060372122022|  

In [17]:
# Count number of rows
spark.sql('SELECT COUNT(cbg) FROM t').show()

+----------+
|count(cbg)|
+----------+
|      8243|
+----------+



In [18]:
result = spark.sql('SELECT *, mean_device_count / pop_total * 100 AS sample_size_perc FROM t')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly| mean_device_count|  sample_size_perc|
+------------+---------+------------------+------------------+------------------+------------------+
|060371042031|     2577|  24.3892828999212| 57.00426852929763| 75.31147540983606| 2.922447629407686|
|060374034051|     1696| 1.474056603773585| 23.99764150943396|62.959016393442624|3.7122061552737398|
|060590758091|     1025| 7.512195121951219| 66.63414634146342| 60.44535519125683| 5.897107823537252|
|060372123031|     3420| 37.28963684676705| 19.12280701754386|  87.5464480874317| 2.559837663375196|
|060590744051|     1752| 29.54944411936805| 73.63013698630137|49.297814207650276|2.8138021808019564|
|060371958032|     1196| 28.26086956521739| 70.81939799331104|16.174863387978142|1.3524133267540253|
|060372145021|     2135|27.775175644028106| 54.05152224824356|43.756830601092894|2.04950026

In [19]:
# sort by poverty
result = spark.sql('SELECT * FROM t ORDER BY CAST(sample_size_perc AS float) DESC')
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly| mean_device_count|  sample_size_perc|
+------------+---------+------------------+------------------+------------------+------------------+
|060599800001|       25|               0.0|              72.0|196.44535519125682| 785.7814207650273|
|060379800091|        5|               0.0|              20.0| 36.13934426229508| 722.7868852459017|
|060379800231|        9|               0.0|               0.0|20.823321554770317|231.37023949744795|
|060375324001|       88|46.590909090909086|15.909090909090908|139.47267759562843| 158.4916790859414|
|060379108111|      121|  7.43801652892562|             100.0|160.56830601092895|132.70107934787515|
|060373108002|      675| 9.925925925925926|57.333333333333336|  523.688524590164| 77.58348512446874|
|060371975003|      434| 3.686635944700461|58.294930875576036|333.55464480874315| 76.855908

In [20]:
# group by poverty range


In [21]:
result_df = result.toPandas()
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_device_count,sample_size_perc
0,060599800001,25,0.0,72.0,196.445355,785.781421
1,060379800091,5,0.0,20.0,36.139344,722.786885
2,060379800231,9,0.0,0.0,20.823322,231.370239
3,060375324001,88,46.590909090909086,15.909090909090908,139.472678,158.491679
4,060379108111,121,7.43801652892562,100.0,160.568306,132.701079
...,...,...,...,...,...,...
8238,060374002074,0,0.0,0.0,49.379085,
8239,060590219241,0,0.0,0.0,9.937500,
8240,060375409024,0,0.0,0.0,18.185792,
8241,060375734021,0,0.0,0.0,15.620290,


In [22]:
result_df = result_df[result_df['pop_total'] != '0']
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_device_count,sample_size_perc
0,060599800001,25,0.0,72.0,196.445355,785.781421
1,060379800091,5,0.0,20.0,36.139344,722.786885
2,060379800231,9,0.0,0.0,20.823322,231.370239
3,060375324001,88,46.590909090909086,15.909090909090908,139.472678,158.491679
4,060379108111,121,7.43801652892562,100.0,160.568306,132.701079
...,...,...,...,...,...,...
8209,060371973003,1543,20.414776409591703,59.04082955281918,13.288462,0.861209
8210,060590761022,3449,0.0,83.35749492606553,23.046448,0.668207
8211,060379202001,5799,0.0,38.64459389549923,33.879630,0.584232
8212,060372060201,7941,12.981744421906694,45.145447676615035,37.696721,0.474710


In [23]:
# plot bar graph
alt.Chart(result_df).mark_point(opacity=0.3).encode(
    alt.X('poverty_percentage:Q'),
    alt.Y('sample_size_perc:Q',scale=alt.Scale(domain=(0,50))),
    tooltip=['cbg','poverty_percentage','sample_size_perc']
).properties(width=300, height=300).interactive()

In [37]:
temp = result_df.groupby(pd.cut(result_df['poverty_percentage'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)
temp
temp['color'] = ['#45a0d1' for x in temp['sample_size_perc']]
temp

Unnamed: 0,poverty_percentage,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",65.24719,4.30744,0.006799,#45a0d1
1,"(20, 40]",57.112868,3.378405,-0.922236,#45a0d1
2,"(40, 60]",49.234403,3.873327,-0.427315,#45a0d1
3,"(60, 80]",44.830111,3.064268,-1.236374,#45a0d1
4,"(80, 100]",57.793431,2.868673,-1.431969,#45a0d1


In [38]:
# 
alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('poverty_percentage:N'),
    alt.Y('sample_size_perc'),
    # color = alt.Color('color',scale=None),
    tooltip=[alt.Tooltip('poverty_percentage'),
             alt.Tooltip('sample_size_perc')]
).properties(width=300, height=300).interactive()

In [26]:
result_df['poverty_percentage'].astype(float).dtype

dtype('float64')

In [27]:
mean_sample_size = result_df['sample_size_perc'].mean()
mean_sample_size

4.300641481150165

In [28]:
result_df['sample_size_perc_deviation'] = result_df['sample_size_perc'] - mean_sample_size

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_device_count,sample_size_perc,sample_size_perc_deviation
0,060599800001,25,0.0,72.0,196.445355,785.781421,781.480779
1,060379800091,5,0.0,20.0,36.139344,722.786885,718.486244
2,060379800231,9,0.0,0.0,20.823322,231.370239,227.069598
3,060375324001,88,46.590909090909086,15.909090909090908,139.472678,158.491679,154.191038
4,060379108111,121,7.43801652892562,100.0,160.568306,132.701079,128.400438
...,...,...,...,...,...,...,...
8209,060371973003,1543,20.414776409591703,59.04082955281918,13.288462,0.861209,-3.439432
8210,060590761022,3449,0.0,83.35749492606553,23.046448,0.668207,-3.632435
8211,060379202001,5799,0.0,38.64459389549923,33.879630,0.584232,-3.716409
8212,060372060201,7941,12.981744421906694,45.145447676615035,37.696721,0.474710,-3.825931


In [30]:
temp = result_df.groupby(pd.cut(result_df['poverty_percentage'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)
temp
temp['color'] = ['#4e79a7' if x > 0 else '#e15759'  for x in temp['sample_size_perc_deviation']]
temp

Unnamed: 0,poverty_percentage,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",65.24719,4.30744,0.006799,#4e79a7
1,"(20, 40]",57.112868,3.378405,-0.922236,#e15759
2,"(40, 60]",49.234403,3.873327,-0.427315,#e15759
3,"(60, 80]",44.830111,3.064268,-1.236374,#e15759
4,"(80, 100]",57.793431,2.868673,-1.431969,#e15759


In [31]:
# 
title = '{} {}'.format(city,year)

chart = alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('poverty_percentage:N'),
    alt.Y('sample_size_perc_deviation', scale=alt.Scale(domain=(-1.4,1.4))),
    color = alt.Color('color', scale=None),
    tooltip=[alt.Tooltip('poverty_percentage'),
             alt.Tooltip('sample_size_perc_deviation')]
).properties(width=300, height=300).interactive()

line = alt.Chart(pd.DataFrame({'percentage_completely_home_deviation':[0]})).mark_rule(color='#757575',strokeDash=[5,3], size=2).encode(
    y='percentage_completely_home_deviation:Q',
    # color=alt.value("#757575"),
    size=alt.value(2),
)

( chart + line).interactive()

In [32]:
["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]

['#4e79a7',
 '#f28e2c',
 '#e15759',
 '#76b7b2',
 '#59a14f',
 '#edc949',
 '#af7aa1',
 '#ff9da7',
 '#9c755f',
 '#bab0ab']

In [33]:
# plot bar graph
alt.Chart(result_df).mark_point(opacity=0.3).encode(
    alt.X('perc_whiteonly:Q'),
    alt.Y('sample_size_perc:Q',scale=alt.Scale(domain=(0,10))),
    tooltip=['cbg','perc_whiteonly','sample_size_perc']
).properties(width=300, height=300).interactive()

In [34]:
temp = result_df.groupby(pd.cut(result_df['perc_whiteonly'].astype(float), np.arange(0,120, 20))).mean().reset_index()
temp['perc_whiteonly'] = temp['perc_whiteonly'].astype(str)
temp
temp['color'] = ['#4e79a7' if x > 0 else '#e15759'  for x in temp['sample_size_perc_deviation']]
temp

Unnamed: 0,perc_whiteonly,mean_device_count,sample_size_perc,sample_size_perc_deviation,color
0,"(0, 20]",53.286861,5.032224,0.731582,#4e79a7
1,"(20, 40]",60.269258,3.660509,-0.640133,#e15759
2,"(40, 60]",63.728044,3.962357,-0.338285,#e15759
3,"(60, 80]",64.9634,4.492985,0.192343,#4e79a7
4,"(80, 100]",58.68946,4.718513,0.417871,#4e79a7


In [35]:
# 
title = '{} {}'.format(city,year)

chart = alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('perc_whiteonly:N'),
    alt.Y('sample_size_perc_deviation', scale=alt.Scale(domain=(-1.4,1.4))),
    color = alt.Color('color', scale=None),
    tooltip=[alt.Tooltip('perc_whiteonly'),
             alt.Tooltip('sample_size_perc_deviation')]
).properties(width=300, height=300).interactive()

line = alt.Chart(pd.DataFrame({'percentage_completely_home_deviation':[0]})).mark_rule(color='#757575',strokeDash=[5,3], size=2).encode(
    y='percentage_completely_home_deviation:Q',
    # color=alt.value("#757575"),
    size=alt.value(2),
)

( chart + line).interactive()

In [36]:
# 
alt.Chart(temp).mark_bar(size=50).encode(
    alt.X('perc_whiteonly:N'),
    alt.Y('sample_size_perc'),
    # color = alt.Color('color',scale=None),
    tooltip=[alt.Tooltip('perc_whiteonly'),
             alt.Tooltip('sample_size_perc')]
).properties(width=300, height=300).interactive()