<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/main/notebooks/Analysis%20CH%20poverty%20and%20mobility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis CH poverty and mobility

**Instructions:**

1. Execute the first code cell.
2. A link will appear asking you to authorize your Google account for Drive access. Open that link.
3. An authorization code will be generated. Copy it.
4. Return to the cell where the Drive mount is running, paste the code from step 3 into the text box, and press Enter.

In [None]:
# Mount Google Drive so the project data is readable from this Colab runtime.
# The first run is interactive: it asks for the authorization described in the instructions above.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path

# Root of the cleaned data on the mounted Drive. The path is absolute (anchored at the
# '/content/drive' mount point used by drive.mount above) so that later reads do not
# depend on the notebook's current working directory.
path = Path('/content/drive/MyDrive/big-data-project/data/clean-data')
# Sub-folder of `path` whose files this notebook analyses.
city = 'ch'

**Spark SQL Documentation:** 
https://spark.apache.org/docs/2.2.0/sql-programming-guide.html

In [None]:
# Install required dependancies
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u282-b08-0ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [None]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [None]:
import altair as alt
# Lift Altair's cap on the number of rows a chart's data may contain; the joined
# table plotted later has 6,664 rows.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [None]:
# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Reuse the running SparkContext if there is one, otherwise start it with `conf`.
# Calling the SparkContext constructor directly raises ValueError when a context
# already exists, which makes this cell fail whenever it is re-run on a live kernel.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Session on top of that context; entry point for the spark.read / spark.sql calls below.
spark = SparkSession.builder.getOrCreate()

ValueError: ignored

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Demographic Data Chicago
!ls drive/MyDrive/big-data-project/data/clean-data/ch/ch.csv

!ls drive/MyDrive/big-data-project/data/clean-data/ch/social/2020/



drive/MyDrive/big-data-project/data/clean-data/ch/ch.csv
2020-01-01-social-distancing.csv  2020-07-02-social-distancing.csv
2020-01-02-social-distancing.csv  2020-07-03-social-distancing.csv
2020-01-03-social-distancing.csv  2020-07-04-social-distancing.csv
2020-01-04-social-distancing.csv  2020-07-05-social-distancing.csv
2020-01-05-social-distancing.csv  2020-07-06-social-distancing.csv
2020-01-06-social-distancing.csv  2020-07-07-social-distancing.csv
2020-01-07-social-distancing.csv  2020-07-08-social-distancing.csv
2020-01-08-social-distancing.csv  2020-07-09-social-distancing.csv
2020-01-09-social-distancing.csv  2020-07-10-social-distancing.csv
2020-01-10-social-distancing.csv  2020-07-11-social-distancing.csv
2020-01-11-social-distancing.csv  2020-07-12-social-distancing.csv
2020-01-12-social-distancing.csv  2020-07-13-social-distancing.csv
2020-01-13-social-distancing.csv  2020-07-14-social-distancing.csv
2020-01-14-social-distancing.csv  2020-07-15-social-distancing.csv
2020-

In [None]:
# Load every social-distancing CSV under social/2020 into a single Spark DataFrame
social_csv_glob = str(path/city/'social/2020/*.csv')
df_soc = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('quote', "\"")
    .option('escape', "\"")
    .load(social_csv_glob)
)
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_time|median_percentage_time_

In [None]:
# View schema. Every column is typed as string because the CSV read above does not
# request schema inference, so numeric fields must be cast before doing arithmetic.
df_soc.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- date_range_end: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- distance_traveled_from_home: string (nullable = true)
 |-- bucketed_distance_traveled: string (nullable = true)
 |-- median_dwell_at_bucketed_distance_traveled: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)
 |-- median_home_dwell_time: string (nullable = true)
 |-- bucketed_home_dwell_time: string (nullable = true)
 |-- at_home_by_each_hour: string (nullable = true)
 |-- part_time_work_behavior_devices: string (nullable = true)
 |-- full_time_work_behavior_devices: string (nullable = true)
 |-- destination_cbgs: string (nullable = true)
 |-- delivery_behavior_devices: string (nullable = true)
 |-- median_non_home_dwell_time: string (nullable = true)
 |-- candidate_device_count: string (nullable = true)
 |-- bucketed_away_from_home_ti

In [None]:
# # Optional: take a small sample of the data to experiment with
# sm = df_soc.limit(100)
# sm.show()

In [None]:
# Register the full social-distancing table as temp view 'T' *before* `df_soc` is
# reassigned: the view keeps pointing at the full table, and a later cell queries it.
df_soc.createOrReplaceTempView('T')
# Keep only the columns needed to compute the completely-home share (shown here for inspection).
df_soc = spark.sql('SELECT cbg, date_range_start, device_count, completely_home_device_count FROM T')
df_soc.show()

+------------+--------------------+------------+----------------------------+
|         cbg|    date_range_start|device_count|completely_home_device_count|
+------------+--------------------+------------+----------------------------+
|170312304002|2020-02-07T00:00:...|          76|                          22|
|170313005002|2020-02-07T00:00:...|          72|                          27|
|170314208001|2020-02-07T00:00:...|         127|                          33|
|170317608034|2020-02-07T00:00:...|         114|                          37|
|170318033003|2020-02-07T00:00:...|          91|                          29|
|170318146005|2020-02-07T00:00:...|          51|                          11|
|170318182002|2020-02-07T00:00:...|          51|                          19|
|170318240053|2020-02-07T00:00:...|         123|                          21|
|170318348001|2020-02-07T00:00:...|          32|                          12|
|170438411042|2020-02-07T00:00:...|          57|                

In [None]:
# View schema of the narrowed frame (the two count columns are still strings at this point)
df_soc.printSchema()

root
 |-- cbg: string (nullable = true)
 |-- date_range_start: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)



In [None]:
# Add completely home percentage column:
#   completely_home_device_count / device_count * 100, computed for each input row.
# The counts are stored as strings, hence the explicit casts.
# Note that this reads from view 'T' (the full table), not from the four-column
# `df_soc` built above, so the result carries every original column plus the new one.
df_soc = spark.sql('SELECT *, CAST(completely_home_device_count AS float)/CAST(device_count AS float)*100.0 AS completely_home_percentage FROM T')
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+--------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_t

In [None]:
# Register the per-row percentages as view 't' so they can be aggregated with SQL.
# NOTE(review): the earlier view was registered as 'T'; with Spark's default
# case-insensitive name resolution 't' presumably replaces it — confirm.
df_soc.createOrReplaceTempView('t')
# get mean percentage for each cbg
df_soc = spark.sql(
'''
SELECT cbg, AVG(completely_home_percentage) AS mean_completely_home_percentage
FROM t
GROUP BY cbg
'''
)
# Re-point 't' at the aggregated result (one row per cbg); the join cell below reads it.
# Because both `df_soc` and 't' are overwritten, re-running this cell on its own
# fails: the aggregated frame no longer has `completely_home_percentage`.
df_soc.createOrReplaceTempView('t')
df_soc.show()

+------------+-------------------------------+
|         cbg|mean_completely_home_percentage|
+------------+-------------------------------+
|170318279025|              26.40084484101243|
|170314202001|              38.48855537853975|
|170314401021|             36.938511683200765|
|170310103003|             43.257215411667474|
|170310608001|              39.43156922739539|
|171978803141|              32.22826728661204|
|170314302003|             36.837787468268566|
|170438462012|             31.762883529365208|
|170978614041|             29.481587396404358|
|170318042012|              35.17617417180933|
|180890302002|             20.795247108576273|
|170312504001|             29.679092940693835|
|170313012003|               40.3213296693814|
|170318256001|             31.607567287734227|
|170313012002|             29.701345255401677|
|170318433001|              31.80115393385439|
|170978610123|             26.796102135722183|
|170315203004|             30.797249772640868|
|171118708071

In [None]:
# Load the poverty table from ch.csv and expose it to Spark SQL as view 'd'
poverty_csv = str(path/city/'ch.csv')
df_pov = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('quote', "\"")
    .option('escape', "\"")
    .load(poverty_csv)
)
df_pov.createOrReplaceTempView('d')
df_pov.show()

+------------+---------+------------------+-------------------+
|         cbg|pop_total|poverty_percentage|     perc_whiteonly|
+------------+---------+------------------+-------------------+
|170312909001|      935|  55.8288770053476|                0.0|
|170312909002|     1129|45.261293179805136|                0.0|
|170312909003|      765|  39.7078353253652|                0.0|
|170312909004|     1075|30.325581395348838|                0.0|
|170318387002|     1016|31.003937007874015| 13.484251968503939|
|170313102001|      835|16.766467065868262|  78.20359281437126|
|170313102002|      686| 16.61807580174927| 50.583090379008745|
|170314005002|      807|40.644361833952914|                0.0|
|170314804006|      710| 1.971830985915493|  1.267605633802817|
|170314804007|      650|               0.0|                0.0|
|170314805001|     1141|  9.55302366345311|  4.206836108676599|
|170312608002|     1100| 55.45454545454545| 2.8181818181818183|
|170312609001|      836| 44.856459330143

In [None]:
# join mobility and poverty
# Inner join on cbg: 't' is the per-cbg mean completely-home percentage and 'd' is the
# poverty table, so only cbg values present in both survive.
result = spark.sql('SELECT d.*, t.mean_completely_home_percentage FROM t INNER JOIN d ON t.cbg = d.cbg')
# 't' is re-registered to hold the joined table; the cells below query it under that name.
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+-------------------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|mean_completely_home_percentage|
+------------+---------+------------------+------------------+-------------------------------+
|170318279025|     1127|20.496894409937887| 58.47382431233363|              26.40084484101243|
|170314202001|     1720| 32.93571901921803|21.627906976744185|              38.48855537853975|
|170314401021|     1165| 35.79399141630901|12.188841201716738|             36.938511683200765|
|170310103003|     1291| 40.63745019920319| 64.52362509682416|             43.257215411667474|
|170310608001|     1770| 5.367231638418079| 79.03954802259886|              39.43156922739539|
|171978803141|     1780|2.8651685393258424| 81.23595505617978|              32.22826728661204|
|170314302003|      730| 16.43835616438356|               0.0|             36.837787468268566|
|170438462012|     1375| 7.054545454545455| 87.127

In [None]:
# Count number of rows in the joined table ('t' now holds the join result)
spark.sql('SELECT COUNT(cbg) FROM t').show()

+----------+
|count(cbg)|
+----------+
|      6664|
+----------+



In [None]:
# sort by poverty, highest first
# poverty_percentage is a string column, so it is cast to float to get numeric
# rather than lexicographic ordering.
result = spark.sql('SELECT * FROM t ORDER BY CAST(poverty_percentage AS float) DESC')
# 't' now refers to the sorted table.
result.createOrReplaceTempView('t')
result.show()

+------------+---------+------------------+------------------+-------------------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|mean_completely_home_percentage|
+------------+---------+------------------+------------------+-------------------------------+
|170315401014|      305| 92.78688524590164|               0.0|              25.83929758774942|
|170898503011|     1739| 89.41920644048304| 17.30879815986199|             31.542277696074354|
|180890412003|      441| 84.35374149659864| 80.04535147392289|              24.15400649817893|
|170370010011|     1558| 82.72108843537414|19.319640564826702|              36.29779396430023|
|180890206001|     1079| 82.39110287303059|17.701575532900833|              38.30759545442372|
|170315401013|     1303| 82.34842670759785|               0.0|             32.968453228334205|
|170310804002|      550| 82.18181818181817| 2.727272727272727|              35.62968810513307|
|170910116001|      754| 81.03448275862068|29.7082

In [None]:
# group by poverty range


In [None]:
# Collect the joined, sorted Spark table to the driver as a pandas DataFrame for plotting.
# Columns that came from the CSV files are still strings here; only the computed
# mean_completely_home_percentage is numeric.
result_df = result.toPandas()
result_df

Unnamed: 0,cbg,pop_total,poverty_percentage,perc_whiteonly,mean_completely_home_percentage
0,170315401014,305,92.78688524590164,0.0,25.839298
1,170898503011,1739,89.41920644048304,17.30879815986199,31.542278
2,180890412003,441,84.35374149659864,80.04535147392289,24.154006
3,170370010011,1558,82.72108843537414,19.319640564826702,36.297794
4,180890206001,1079,82.39110287303059,17.701575532900833,38.307595
...,...,...,...,...,...
6659,170318005003,980,0.0,95.10204081632652,27.704363
6660,170319900000,0,0.0,0.0,22.866347
6661,170318013003,1138,0.0,93.05799648506151,24.875528
6662,170438426021,1395,0.0,92.25806451612904,28.222471


In [None]:
# Scatter plot (not a bar graph): one point per cbg, poverty rate against the mean
# completely-home percentage.
# poverty_percentage is held as text in result_df, so it is converted to float here
# to give the quantitative (:Q) axis and the tooltip real numbers instead of relying
# on implicit string-to-number coercion in the renderer.
scatter_df = result_df.assign(
    poverty_percentage=result_df['poverty_percentage'].astype(float)
)
alt.Chart(scatter_df).mark_point().encode(
    alt.X('poverty_percentage:Q', title='Poverty (%)'),
    alt.Y('mean_completely_home_percentage:Q', title='Mean completely-home devices (%)'),
    tooltip=['cbg','poverty_percentage','mean_completely_home_percentage']
).properties(width=400, height=400).interactive()

In [None]:
# Average the completely-home percentage within 10-point poverty ranges.
# poverty_percentage is stored as text, so convert it before binning.
poverty_pct = result_df['poverty_percentage'].astype(float)

# include_lowest=True closes the first bin on the left. Without it the bins are
# (0, 10], (10, 20], ... and every row with exactly 0% poverty falls outside all
# of them and is silently dropped from the averages.
poverty_bins = pd.cut(poverty_pct, np.arange(0, 110, 10), include_lowest=True)

# Aggregate only the numeric column of interest: the remaining columns are strings,
# and averaging them raises TypeError on pandas >= 2.0.
# observed=False keeps empty bins in the result, matching the historical default.
temp = (
    result_df
    .groupby(poverty_bins, observed=False)[['mean_completely_home_percentage']]
    .mean()
    .reset_index()
)

In [None]:
# Turn the interval bins into plain text labels; the chart below encodes them as a nominal (:N) field.
temp['poverty_percentage'] = temp['poverty_percentage'].astype(str)

In [None]:
# Give every bin the same marker colour; the scalar is broadcast to all rows.
# NOTE(review): the saved output shows '#ff4833' for the (90, 100] row, which this
# cell does not produce — confirm whether a highlight step was dropped.
temp['color'] = '#45a0d1'
temp

Unnamed: 0,poverty_percentage,mean_completely_home_percentage,color
0,"(0, 10]",32.335266,#45a0d1
1,"(10, 20]",33.162297,#45a0d1
2,"(20, 30]",33.815477,#45a0d1
3,"(30, 40]",34.012766,#45a0d1
4,"(40, 50]",34.205546,#45a0d1
5,"(50, 60]",33.222668,#45a0d1
6,"(60, 70]",33.753639,#45a0d1
7,"(70, 80]",32.937377,#45a0d1
8,"(80, 90]",32.322277,#45a0d1
9,"(90, 100]",25.839298,#ff4833


In [None]:
# Point chart of the per-bin averages: one marker per poverty range,
# coloured by the literal hex value stored in the `color` column.
(
    alt.Chart(temp)
    .mark_point()
    .encode(
        x=alt.X('poverty_percentage:N'),
        y=alt.Y('mean_completely_home_percentage'),
        color=alt.Color('color', scale=None),
        tooltip=[
            alt.Tooltip('poverty_percentage'),
            alt.Tooltip('mean_completely_home_percentage'),
        ],
    )
    .properties(width=400, height=400)
    .interactive()
)

In [None]:
# Check that poverty_percentage converts cleanly to a numeric dtype
result_df['poverty_percentage'].astype(float).dtype

dtype('float64')