<a href="https://colab.research.google.com/github/chouhandiksha/bigdataproject/blob/colab/notebooks/analyse_LA_mobility_poverty_time_series_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Spark SQL Documentation:** 
https://spark.apache.org/docs/2.2.0/sql-programming-guide.html

In [205]:
# add time information at the end of every cell
!pip install ipython-autotime
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 5.87 s (started: 2021-04-20 03:45:25 +00:00)


In [206]:
# Install required dependancies
!pip install pyspark
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u282-b08-0ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.
time: 6.28 s (started: 2021-04-20 03:45:31 +00:00)


In [207]:
# Import modules
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

time: 11.4 ms (started: 2021-04-20 03:45:37 +00:00)


In [211]:
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

time: 5.1 ms (started: 2021-04-20 03:45:54 +00:00)


In [212]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

time: 335 ms (started: 2021-04-20 03:45:54 +00:00)


In [213]:
#sc.stop()

time: 894 µs (started: 2021-04-20 03:45:54 +00:00)


In [214]:
# Mount drive with data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 6.92 ms (started: 2021-04-20 03:45:54 +00:00)


In [215]:
# Set path to data folder
path = Path('drive/MyDrive/big-data-project/data/clean-data')
city = 'la'

time: 3.22 ms (started: 2021-04-20 03:45:54 +00:00)


In [216]:
# Read data into dataframe
df_soc = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'social/2020/*.csv'))
df_soc.show()

+---+------------+--------------------+--------------------+------------+---------------------------+--------------------------+------------------------------------------+----------------------------+----------------------+------------------------+--------------------+-------------------------------+-------------------------------+--------------------+-------------------------+--------------------------+----------------------+----------------------------+---------------------------+-----------------------------+
|_c0|         cbg|    date_range_start|      date_range_end|device_count|distance_traveled_from_home|bucketed_distance_traveled|median_dwell_at_bucketed_distance_traveled|completely_home_device_count|median_home_dwell_time|bucketed_home_dwell_time|at_home_by_each_hour|part_time_work_behavior_devices|full_time_work_behavior_devices|    destination_cbgs|delivery_behavior_devices|median_non_home_dwell_time|candidate_device_count|bucketed_away_from_home_time|median_percentage_time_

In [217]:
df_soc.createOrReplaceTempView('clean_la')
df_soc = spark.sql('SELECT cbg, date_range_start as date, device_count, completely_home_device_count, part_time_work_behavior_devices, full_time_work_behavior_devices FROM clean_la WHERE device_count > 5')
df_soc.show()

+------------+--------------------+------------+----------------------------+-------------------------------+-------------------------------+
|         cbg|                date|device_count|completely_home_device_count|part_time_work_behavior_devices|full_time_work_behavior_devices|
+------------+--------------------+------------+----------------------------+-------------------------------+-------------------------------+
|060371831013|2020-02-07T00:00:...|          45|                           6|                              1|                              1|
|060374034022|2020-02-07T00:00:...|         155|                          24|                             12|                              4|
|060375326041|2020-02-07T00:00:...|          31|                           4|                              6|                              2|
|060375544031|2020-02-07T00:00:...|         130|                          27|                             16|                             10|
|06037

In [218]:
df_soc = df_soc.withColumn("date",
    df_soc['date'].substr(0, 10))

time: 21.5 ms (started: 2021-04-20 03:46:04 +00:00)


In [219]:
df_soc.show()

+------------+----------+------------+----------------------------+-------------------------------+-------------------------------+
|         cbg|      date|device_count|completely_home_device_count|part_time_work_behavior_devices|full_time_work_behavior_devices|
+------------+----------+------------+----------------------------+-------------------------------+-------------------------------+
|060371831013|2020-02-07|          45|                           6|                              1|                              1|
|060374034022|2020-02-07|         155|                          24|                             12|                              4|
|060375326041|2020-02-07|          31|                           4|                              6|                              2|
|060375544031|2020-02-07|         130|                          27|                             16|                             10|
|060375718003|2020-02-07|          35|                          10|         

In [220]:
# Add completely home percentage column
df_soc = df_soc.withColumn('completely_home_percentage', (df_soc['completely_home_device_count']/df_soc['device_count']) * 100)
df_soc = df_soc.withColumn('part_time_work_percentage', (df_soc['part_time_work_behavior_devices']/df_soc['device_count']) * 100)
df_soc = df_soc.withColumn('full_time_work_percentage', (df_soc['full_time_work_behavior_devices']/df_soc['device_count']) * 100)

#df_soc.show()

time: 58 ms (started: 2021-04-20 03:46:04 +00:00)


In [221]:
df_soc.printSchema()

root
 |-- cbg: string (nullable = true)
 |-- date: string (nullable = true)
 |-- device_count: string (nullable = true)
 |-- completely_home_device_count: string (nullable = true)
 |-- part_time_work_behavior_devices: string (nullable = true)
 |-- full_time_work_behavior_devices: string (nullable = true)
 |-- completely_home_percentage: double (nullable = true)
 |-- part_time_work_percentage: double (nullable = true)
 |-- full_time_work_percentage: double (nullable = true)

time: 3.06 ms (started: 2021-04-20 03:46:04 +00:00)


In [222]:
#Create temp view
df_soc.createOrReplaceTempView('mobility')

time: 20.6 ms (started: 2021-04-20 03:46:04 +00:00)




---


demographic data






---



In [223]:
# Read poverty data
# Read data into RDD
df_demographic = spark.read.format('csv').option('header','true').option('quote',"\"").option('escape',"\"").load(str(path/city/'la.csv'))
df_demographic.createOrReplaceTempView('demographic')
df_demographic.show()

+------------+---------+------------------+------------------+
|         cbg|pop_total|poverty_percentage|    perc_whiteonly|
+------------+---------+------------------+------------------+
|060373104003|     1286|17.884914463452567| 69.51788491446345|
|060590865023|     2545|22.789783889980352| 85.34381139489194|
|060376510024|     1285|               0.0|42.490272373540854|
|060376511011|     2708| 7.745398773006134| 50.70162481536189|
|060590994023|      672|12.969283276450511| 38.54166666666667|
|060590995094|      342| 17.83625730994152| 77.48538011695906|
|060376511012|     2483|2.6178010471204187|50.302053966975436|
|060376512011|      859|2.8846153846153846| 82.42142025611176|
|060376512012|     1179| 1.441899915182358|  74.1306191687871|
|060376512014|      989|2.4266936299292214| 70.57633973710819|
|060376512211|     3373|  9.95684340320592| 40.61666172546695|
|060590995092|      938| 5.756929637526652|  84.9680170575693|
|060590995095|      576|14.583333333333334|           7

In [224]:
df_mob_demo = spark.sql('SELECT mobility.*, demographic.poverty_percentage from mobility INNER JOIN demographic ON mobility.cbg = demographic.cbg')
df_mob_demo.createOrReplaceTempView('demographic_mobility')
df_mob_demo.show()


+------------+----------+------------+----------------------------+-------------------------------+-------------------------------+--------------------------+-------------------------+-------------------------+------------------+
|         cbg|      date|device_count|completely_home_device_count|part_time_work_behavior_devices|full_time_work_behavior_devices|completely_home_percentage|part_time_work_percentage|full_time_work_percentage|poverty_percentage|
+------------+----------+------------+----------------------------+-------------------------------+-------------------------------+--------------------------+-------------------------+-------------------------+------------------+
|060371831013|2020-02-07|          45|                           6|                              1|                              1|        13.333333333333334|       2.2222222222222223|       2.2222222222222223|15.450643776824036|
|060374034022|2020-02-07|         155|                          24|             

In [225]:
# DIVIDING THE PEOPLE TOP 20 PERCENT AND BOTTOM 20 PERCENT

df_mob_demo_high_class =  spark.sql('SELECT * FROM demographic_mobility WHERE poverty_percentage < 20')
grouped_df_mob_demo_high_class = df_mob_demo_high_class.groupBy("date").mean("completely_home_percentage").withColumnRenamed('avg(completely_home_percentage)','completely_home_percentage')
grouped_df_mob_demo_high_class.createOrReplaceTempView('mob_demo_high_class')


df_mob_demo_low_class =  spark.sql('SELECT * FROM demographic_mobility WHERE poverty_percentage > 80')
grouped_df_mob_demo_low_class = df_mob_demo_low_class.groupBy("date").mean("completely_home_percentage").withColumnRenamed('avg(completely_home_percentage)','completely_home_percentage')
grouped_df_mob_demo_low_class.createOrReplaceTempView('mob_demo_low_class')

grouped_df_mob_demo_combined = spark.sql("""

SELECT h.date,h.completely_home_percentage as high_comp_home_perc,
l.completely_home_percentage as low_comp_home_perc  
FROM mob_demo_high_class h INNER JOIN mob_demo_low_class l ON h.date = l.date

""")


time: 140 ms (started: 2021-04-20 03:46:05 +00:00)


---
Visualization 

---

In [247]:
df = grouped_df_mob_demo_combined.toPandas()
df

Unnamed: 0,date,high_comp_home_perc,low_comp_home_perc
0,2020-02-26,17.297551,17.028372
1,2020-04-13,49.179507,27.580760
2,2020-06-24,33.002194,36.514949
3,2020-06-08,36.723946,40.887284
4,2020-09-12,33.388659,29.334402
...,...,...,...
361,2020-04-05,51.744280,27.824742
362,2020-05-01,43.087875,43.937332
363,2020-10-25,35.361961,32.028754
364,2020-12-28,41.489642,24.049980


time: 1.76 s (started: 2021-04-20 03:57:01 +00:00)


In [248]:
df_str = df

time: 1.39 ms (started: 2021-04-20 03:57:04 +00:00)


In [249]:
base = alt.Chart(df_str.reset_index()).encode(x='date'
)

alt.layer(
    base.mark_line(color='green').encode(y='high_comp_home_perc'),
    base.mark_line(color='red').encode(y='low_comp_home_perc')

)

time: 44.8 ms (started: 2021-04-20 03:57:11 +00:00)


Filtering based on important dates

Thursday, March 19, 2020: 
Statewide Stay-at-Home Order Issued











In [250]:
#Conversion of datetime format
df['date'] = pd.to_datetime(df['date'],utc= True)
df['date'] = [datetime.datetime.date(d) for d in df['date']] 

time: 5.44 ms (started: 2021-04-20 03:57:15 +00:00)


In [252]:
df1 = df[(df['date']>datetime.date(2020,3,17)) & (df['date']<datetime.date(2020,4,11))]  
df1['date'] = df1['date'].astype(str)

time: 6.74 ms (started: 2021-04-20 03:58:02 +00:00)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [254]:
base = alt.Chart(df1.reset_index()).encode(x='date'
).properties(width = 1000)
alt.layer(
    base.mark_line(color='green').encode(y='high_comp_home_perc'),
    base.mark_line(color='red').encode(y='low_comp_home_perc')
).interactive()

time: 43.2 ms (started: 2021-04-20 03:58:35 +00:00)


Aug. 7, 2020:
LA County Exceeds 200,000 Coronavirus Cases

Aug. 12, 2020:
LA County Crosses 'Tragic Milestone' Of 5,000 Coronavirus Deaths 


In [231]:
df2 = df[(df['date']>datetime.date(2020,8,5)) & (df['date']<datetime.date(2020,8,19))]  
df2['date'] = df2['date'].astype(str)

time: 13 ms (started: 2021-04-20 03:49:19 +00:00)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [232]:
base = alt.Chart(df2.reset_index()).encode(x='date'
).properties(width = 1000)
alt.layer(
    base.mark_line(color='green').encode(y='high_comp_home_perc'),
    base.mark_line(color='red').encode(y='low_comp_home_perc')
).interactive()

time: 79.5 ms (started: 2021-04-20 03:49:19 +00:00)


**Oct. 14, 2020: LA County Sees Rise In Workplace Outbreaks As Infection Rate Creeps Upward**

--- 

In [233]:
df3 = df[(df['date']>datetime.date(2020,10,12)) & (df['date']<datetime.date(2020,10,26))]  
df3['date'] = df3['date'].astype(str)

time: 13.3 ms (started: 2021-04-20 03:49:19 +00:00)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [234]:
base = alt.Chart(df3.reset_index()).encode(x='date'
).properties(width = 1000)
alt.layer(
    base.mark_line(color='green').encode(y='high_comp_home_perc'),
    base.mark_line(color='red').encode(y='low_comp_home_perc')
).interactive()

time: 59.6 ms (started: 2021-04-20 03:49:19 +00:00)


Dec. 16, 2020:
LA County COVID-19 Deaths Hit New Record

---

In [257]:
df4 = df[(df['date']>datetime.date(2020,12,14)) & (df['date']<datetime.date(2020,12,28))]  
df4['date'] = df4['date'].astype(str)

time: 4.78 ms (started: 2021-04-20 04:11:23 +00:00)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [258]:
base = alt.Chart(df4.reset_index()).encode(x='date'
).properties(width = 1000)
alt.layer(
    base.mark_line(color='green').encode(y='high_comp_home_perc'),
    base.mark_line(color='red').encode(y='low_comp_home_perc')
).interactive()

time: 40.7 ms (started: 2021-04-20 04:11:25 +00:00)
