In [1]:
!/usr/local/Cellar/jupyterlab/3.0.14/libexec/bin/pip3 install pyspark



In [2]:
!which python3

/usr/local/bin/python3


In [1]:
from pyspark import SparkConf, SparkContext, SparkFiles
from pyspark.sql import *


config_prefix = "ampba.batch15.bdm2.group_assignment"

spark = SparkSession.builder \
.master("local") \
.appName("BDM2-GroupAssignment") \
.config(config_prefix+".raw_covid_datasource", "https://api.covid19india.org/csv/latest/raw_data{file_index}.csv") \
.config(config_prefix+".raw_covid_datasource_file_upto", 27) \
.config(config_prefix+".covid_datasource", "https://api.covid19india.org/csv/latest/districts.csv") \
.config(config_prefix+".stock_datasource", "https://docs.google.com/spreadsheets/d/1sNXNbIrOSU6jdJwFHOY6FGGM8PsJrNdkWf_70WOPtjc/export?format=csv") \
.getOrCreate()


In [2]:
spark

In [3]:
# spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource_file_upto')

#Reading each RAW file - and saving in RAW_DATA_DF
raw_data_stores=[]
def fetch_and_load_raw_data(f_now,file_index):
    print("Fetching data from",f_now,"...")
    spark.sparkContext.addFile(f_now)
    print("File loaded, now adding in Dataframe...")    
    return spark.read.option("header", "true").csv(SparkFiles.get("raw_data{file_index}.csv".format(file_index=file_index)))

print("Fetching first raw data file")

f_now = spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource').format(file_index=1)
# print(f_now)
raw_data_stores.append(fetch_and_load_raw_data(f_now,1))

for file_index in range(2, int(spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource_file_upto'))):
    f_now = spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource').format(file_index=file_index)  
    raw_data_stores.append(fetch_and_load_raw_data(f_now,file_index))

Fetching first raw data file
Fetching data from https://api.covid19india.org/csv/latest/raw_data1.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data2.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data3.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data4.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data5.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data6.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data7.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data8.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covi

In [4]:
# raw_data_stores[0].printSchema()

cols_of_interest=['Date Announced','Detected State','Detected District','Gender','Age Bracket','Detected City','Nationality','Current Status','Status Change Date','Num Cases']

merged_data =raw_data_stores[0].select(cols_of_interest)
for i in range(1,len(raw_data_stores)):
    print("Now working on raw data",i)
    df = raw_data_stores[i].select(cols_of_interest)
    print("Obtained rows",df.count())
    merged_data=merged_data.union(df)

Now working on raw data 1
Obtained rows 10819
Now working on raw data 2
Obtained rows 10020
Now working on raw data 3
Obtained rows 18231
Now working on raw data 4
Obtained rows 20488
Now working on raw data 5
Obtained rows 23423
Now working on raw data 6
Obtained rows 22770
Now working on raw data 7
Obtained rows 22808
Now working on raw data 8
Obtained rows 26897
Now working on raw data 9
Obtained rows 23112
Now working on raw data 10
Obtained rows 29045
Now working on raw data 11
Obtained rows 22334
Now working on raw data 12
Obtained rows 24252
Now working on raw data 13
Obtained rows 27583
Now working on raw data 14
Obtained rows 27346
Now working on raw data 15
Obtained rows 26625
Now working on raw data 16
Obtained rows 27286
Now working on raw data 17
Obtained rows 24636
Now working on raw data 18
Obtained rows 25384
Now working on raw data 19
Obtained rows 26310
Now working on raw data 20
Obtained rows 25496
Now working on raw data 21
Obtained rows 25791
Now working on raw dat

In [5]:
from pyspark.sql.functions import col
from pyspark.sql.functions import to_date
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
merged_data.printSchema()


root
 |-- Date Announced: string (nullable = true)
 |-- Detected State: string (nullable = true)
 |-- Detected District: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age Bracket: string (nullable = true)
 |-- Detected City: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Current Status: string (nullable = true)
 |-- Status Change Date: string (nullable = true)
 |-- Num Cases: string (nullable = true)



In [6]:
merged_data_1=merged_data.select([to_date("Date Announced","dd/MM/yyyy").alias('Date'),
                                  col('Detected State').alias('State'),
                                  col('Detected District').alias('District'),
                                  'Gender',
                                  col('Age Bracket').alias('Age').cast(IntegerType()),
                                  col('Detected City').alias('City'),
                                  'Nationality',
                                  col('Current Status').alias('Status'),
                                  to_date('Status Change Date',"dd/MM/yyyy").alias('Status_Date'),
                                  col('Num Cases').alias('cases').cast(IntegerType())])


In [7]:
merged_data_1.show()

+----------+---------+----------+------+----+--------------------+-----------+------------+-----------+-----+
|      Date|    State|  District|Gender| Age|                City|Nationality|      Status|Status_Date|cases|
+----------+---------+----------+------+----+--------------------+-----------+------------+-----------+-----+
|2020-01-30|   Kerala|  Thrissur|     F|  20|            Thrissur|      India|   Recovered| 2020-02-14|    1|
|2020-02-02|   Kerala| Alappuzha|  null|null|           Alappuzha|      India|   Recovered| 2020-02-14|    1|
|2020-02-03|   Kerala| Kasaragod|  null|null|           Kasaragod|      India|   Recovered| 2020-02-14|    1|
|2020-03-02|    Delhi|East Delhi|     M|  45|East Delhi (Mayur...|      India|   Recovered| 2020-03-15|    1|
|2020-03-02|Telangana| Hyderabad|     M|  24|           Hyderabad|      India|   Recovered| 2020-03-02|    1|
|2020-03-03|Rajasthan|  Italians|     M|  69|              Jaipur|      Italy|   Recovered| 2020-03-03|    1|
|2020-03-0

In [17]:
merged_data_1.printSchema()

root
 |-- Date: date (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status_Date: date (nullable = true)
 |-- cases: integer (nullable = true)



In [31]:
# Shall we handle data merge after Pivot or before pivoting??
import pyspark.sql.functions as fns
pivoted_data=merged_data_1.withColumn('Status',fns.lower(col('Status'))).groupBy(["Date","State","District"]).pivot("Status").agg({"Cases":'sum'})

#merged_data_1.coalesce(24)

In [32]:

"""

root
 |-- Effective_date: string (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Deceased: integer (nullable = true)
 |-- Tested: integer (nullable = true)

"""
pivoted_data.printSchema()



root
 |-- Date: date (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- null: long (nullable = true)
 |-- deceased: long (nullable = true)
 |-- hospitalized: long (nullable = true)
 |-- migrated: long (nullable = true)
 |-- migrated_other: long (nullable = true)
 |-- recovered: long (nullable = true)



In [33]:
pivoted_data.show()

+----------+-----------------+-------------------+----+--------+------------+--------+--------------+---------+
|      Date|            State|           District|null|deceased|hospitalized|migrated|migrated_other|recovered|
+----------+-----------------+-------------------+----+--------+------------+--------+--------------+---------+
|2020-05-01|Jammu and Kashmir|          Baramulla|null|    null|           1|    null|          null|        5|
|2020-06-25|    Uttar Pradesh|             Hardoi|null|    null|        null|    null|          null|        5|
|2020-08-01|   Andhra Pradesh|            Kurnool|null|       6|        1234|    null|          null|     1217|
|2020-10-17|    Uttar Pradesh|          Kaushambi|null|    null|          16|    null|          null|       12|
|2020-10-25|      Uttarakhand|           Haridwar|null|    null|          30|    null|             1|       26|
|2020-11-15|   Madhya Pradesh|        Hoshangabad|null|       1|          17|    null|          null|   

In [53]:
districts_df = pivoted_data.select(col('Date').alias('Effective_date'),
                                   col('State'),
                                   col('District'),
                                   col('hospitalized').alias('Confirmed'),
                                   col('recovered').alias('Recovered'),
                                   col('deceased').alias('Deceased'))

# we are removing those lines where none of the data is numeric - this cleans the entire data set as well
# we can use filter or where either
districts_df = districts_df.filter(col('Confirmed').isNotNull() | col('Recovered').isNotNull() | col('Deceased').isNotNull())

In [54]:
districts_df.printSchema()

root
 |-- Effective_date: date (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Confirmed: long (nullable = true)
 |-- Recovered: long (nullable = true)
 |-- Deceased: long (nullable = true)



In [55]:
districts_df.summary().show()

+-------+--------------------+-------------+------------------+----------------+------------------+
|summary|               State|     District|         Confirmed|       Recovered|          Deceased|
+-------+--------------------+-------------+------------------+----------------+------------------+
|  count|              221253|       218366|            207149|          172788|             56320|
|   mean|                null|         null|100.19934443323405|98.3832962937241|       4.013671875|
| stddev|                null|         null| 468.4390237370644|446.040692572256|11.878261039782648|
|    min|Andaman and Nicob...| Kamrup Rural|            -12822|          -11132|              -275|
|    25%|                null|         null|                 5|               5|                 1|
|    50%|                null|         null|                17|              17|                 2|
|    75%|                null|         null|                61|              61|                 3|


In [58]:
districts_df.select(["State","Confirmed","Recovered","Deceased"]).where(col("State").isNotNull()).groupBy("State").agg({'Confirmed':'sum','Recovered':'sum','Deceased':'sum'}).show(40,False)

+----------------------------------------+--------------+-------------+--------------+
|State                                   |sum(Recovered)|sum(Deceased)|sum(Confirmed)|
+----------------------------------------+--------------+-------------+--------------+
|Nagaland                                |12357         |115          |14717         |
|Karnataka                               |1209892       |16530        |1690862       |
|Odisha                                  |416368        |2157         |489640        |
|Kerala                                  |1362079       |5565         |1743872       |
|Dadra and Nagar Haveli and Daman and Diu|6541          |4            |8345          |
|Ladakh                                  |13019         |151          |14560         |
|State Unassigned                        |null          |null         |0             |
|Tamil Nadu                              |1108436       |14589        |1249285       |
|Chhattisgarh                            |6

In [40]:
districts_df.write.partitionBy("State") \
    .mode("overwrite") \
    .format("parquet") \
    .saveAsTable("merged_and_clean_raw")

KeyboardInterrupt: 

In [35]:
def duplicate_col_remover(df):
    unique_cols = []
    releated_cols = []    
    for i in range(len(df.columns)):
        if df.columns[i] not in unique_cols:
            unique_cols.append(df.columns[i])
        else:
            releated_cols.append(i)
    col_set=[]
    for i in range(len(df.columns)):
        col_set.append(str(i))

    df = df.toDF(*col_set)
    for dupcol in releated_cols:
        df = df.drop(str(dupcol))
    return df.toDF(*unique_cols)

d1 = spark.createDataFrame([{"a":12121,"b":24342,"c":3534},{"a":121,"b":242,"c":35},{"a":1121,"b":2432,"c":353},{"a":121,"b":242,"c":34}])
d2 =  spark.createDataFrame([{"d":1111,"b":24342,"c":3534},{"d":555,"b":242,"c":35},{"d":1343,"b":2432,"c":353},{"d":434,"b":43,"c":34}])

joint_housing_table  = d1.join(d2, (d1.b == d2.b) & (d1.c == d2.c),"inner")

duplicate_col_remover(joint_housing_table).show()

# joint_housing_table[0]

+-----+-----+----+----+
|    a|    b|   c|   d|
+-----+-----+----+----+
|  121|  242|  35| 555|
| 1121| 2432| 353|1343|
|12121|24342|3534|1111|
+-----+-----+----+----+



In [36]:
merged_data.printSchema()

root
 |-- Date Announced: string (nullable = true)
 |-- Detected State: string (nullable = true)
 |-- Detected District: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age Bracket: string (nullable = true)
 |-- Detected City: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Current Status: string (nullable = true)
 |-- Status Change Date: string (nullable = true)



In [40]:
merged_data.select(['Detected State','Detected District']).groupBy('Detected State').agg({'Detected District':'count'}).show()

+--------------------+------------------------+
|      Detected State|count(Detected District)|
+--------------------+------------------------+
|               63855|                       0|
|            Nagaland|                    2063|
|           Karnataka|                  123855|
|              Odisha|                   20261|
|              Kerala|                   17070|
|              Ladakh|                    1202|
|Dadra and Nagar H...|                    1417|
|    State Unassigned|                      63|
|               63863|                       0|
|          Tamil Nadu|                   52355|
|               63851|                       0|
|               63817|                       0|
|        Chhattisgarh|                   19042|
|      Andhra Pradesh|                   12952|
|         Lakshadweep|                     202|
|      Madhya Pradesh|                   35712|
|              Punjab|                   16988|
|        Using RT-PCR|                  

In [24]:
f_now='https://api.covid19india.org/csv/latest/case_time_series.csv'
spark.sparkContext.addFile(f_now)
print("File loaded, now adding in Dataframe...")    
case_ts_data = spark.read.option("header", "true").csv(SparkFiles.get("case_time_series.csv"))

File loaded, now adding in Dataframe...


In [30]:
case_ts_data.show()

+----------------+----------+---------------+---------------+---------------+---------------+--------------+--------------+
|            Date|  Date_YMD|Daily Confirmed|Total Confirmed|Daily Recovered|Total Recovered|Daily Deceased|Total Deceased|
+----------------+----------+---------------+---------------+---------------+---------------+--------------+--------------+
| 30 January 2020|2020-01-30|              1|              1|              0|              0|             0|             0|
| 31 January 2020|2020-01-31|              0|              1|              0|              0|             0|             0|
| 1 February 2020|2020-02-01|              0|              1|              0|              0|             0|             0|
| 2 February 2020|2020-02-02|              1|              2|              0|              0|             0|             0|
| 3 February 2020|2020-02-03|              1|              3|              0|              0|             0|             0|
| 4 Febr

In [12]:
districts_data= (spark.read   
                .option("sep", ",").option("header",True)
                .csv('./Data/districts.csv'))


# districts_data = sc.textFile('./Data/districts.csv')
# # districts_data = sc.parallelize(districts_data)

In [16]:
districts_data.show(5,False)


+----------+---------------------------+-------------+---------+---------+--------+-----+------+
|Date      |State                      |District     |Confirmed|Recovered|Deceased|Other|Tested|
+----------+---------------------------+-------------+---------+---------+--------+-----+------+
|2020-04-26|Andaman and Nicobar Islands|Unknown      |33       |11       |0       |0    |2679  |
|2020-04-26|Andhra Pradesh             |Anantapur    |53       |14       |4       |0    |null  |
|2020-04-26|Andhra Pradesh             |Chittoor     |73       |13       |0       |0    |null  |
|2020-04-26|Andhra Pradesh             |East Godavari|39       |12       |0       |0    |null  |
|2020-04-26|Andhra Pradesh             |Guntur       |214      |29       |8       |0    |null  |
+----------+---------------------------+-------------+---------+---------+--------+-----+------+
only showing top 5 rows



In [17]:
districts_data.printSchema()
# districts_data = districts_data.map(lambda l:l.split(','))

root
 |-- Date: string (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Confirmed: string (nullable = true)
 |-- Recovered: string (nullable = true)
 |-- Deceased: string (nullable = true)
 |-- Other: string (nullable = true)
 |-- Tested: string (nullable = true)



In [21]:
districts_data.describe().show()

+-------+----------+--------------------+----------+------------------+------------------+------------------+-----------------+------------------+
|summary|      Date|               State|  District|         Confirmed|         Recovered|          Deceased|            Other|            Tested|
+-------+----------+--------------------+----------+------------------+------------------+------------------+-----------------+------------------+
|  count|    229783|              229783|    229783|            229783|            229783|            229783|           229783|            167134|
|   mean|      null|                null|      null|10142.690882267183| 9305.670776341156|149.81697079418407|3.199187929481293|159733.46884535762|
| stddev|      null|                null|      null|33782.449718313896|31427.514838583946| 629.6567303101169|41.49328159948047| 559857.5480608118|
|    min|2020-04-26|Andaman and Nicob...|Agar Malwa|                -1|                -1|                -1|         

In [5]:
districts_data.unique('District')

NameError: name 'districts_data' is not defined

In [17]:
import requests
import io

In [18]:
stock_csv_req=requests.get(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource'))
# stock_csv_req.text


In [21]:
# print(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource'))
#Load df_stock
# spark.sparkContext.addFile(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource'))

df_stock = spark.read.option("header", "true").format("csv").csv(io.StringIO(stock_csv_req.text))

df_stock.show()

Py4JJavaError: An error occurred while calling o256.load.
: java.lang.ClassCastException: class java.util.ArrayList cannot be cast to class java.lang.String (java.util.ArrayList and java.lang.String are in module java.base of loader 'bootstrap')
	at org.apache.spark.sql.internal.SessionState.$anonfun$newHadoopConfWithOptions$1(SessionState.scala:102)
	at org.apache.spark.sql.internal.SessionState.$anonfun$newHadoopConfWithOptions$1$adapted(SessionState.scala:102)
	at scala.collection.immutable.Map$Map2.foreach(Map.scala:159)
	at org.apache.spark.sql.internal.SessionState.newHadoopConfWithOptions(SessionState.scala:102)
	at org.apache.spark.sql.execution.datasources.DataSource.newHadoopConfiguration(DataSource.scala:114)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:375)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:326)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:308)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:308)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:64)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)


In [4]:
!ls /private/var/folders/3y/b7xzrwss1wg2drn7pkcst00m0000gn/T/spark-b44425e1-fa03-4e51-bf95-ff5fe3616da4/userFiles-1f24d8e8-41d9-47b0-adcd-0040e75b8faa/

ls: /private/var/folders/3y/b7xzrwss1wg2drn7pkcst00m0000gn/T/spark-b44425e1-fa03-4e51-bf95-ff5fe3616da4/userFiles-1f24d8e8-41d9-47b0-adcd-0040e75b8faa/: No such file or directory
