In [1]:
!/usr/local/Cellar/jupyterlab/3.0.14/libexec/bin/pip3 install pyspark

You should consider upgrading via the '/usr/local/Cellar/jupyterlab/3.0.14/libexec/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [2]:
!which python3

/usr/local/bin/python3


In [3]:
from pyspark import SparkConf, SparkContext, SparkFiles
from pyspark.sql import *


config_prefix = "ampba.batch15.bdm2.group_assignment"

spark = SparkSession.builder \
.master("local") \
.appName("BDM2-GroupAssignment") \
.config(config_prefix+".raw_covid_datasource", "https://api.covid19india.org/csv/latest/raw_data{file_index}.csv") \
.config(config_prefix+".raw_covid_datasource_file_upto", 27) \
.config(config_prefix+".covid_datasource", "https://api.covid19india.org/csv/latest/districts.csv") \
.config(config_prefix+".stock_datasource", "https://docs.google.com/spreadsheets/d/1sNXNbIrOSU6jdJwFHOY6FGGM8PsJrNdkWf_70WOPtjc/export?format=csv") \
.getOrCreate()


In [4]:
spark

In [5]:
# spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource_file_upto')

#Reading each RAW file - and saving in RAW_DATA_DF
raw_data_stores=[]
def fetch_and_load_raw_data(f_now,file_index):
    print("Fetching data from",f_now,"...")
    spark.sparkContext.addFile(f_now)
    print("File loaded, now adding in Dataframe...")    
    return spark.read.option("header", "true").csv(SparkFiles.get("raw_data{file_index}.csv".format(file_index=file_index)))

print("Fetching first raw data file")

f_now = spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource').format(file_index=1)
# print(f_now)
raw_data_stores.append(fetch_and_load_raw_data(f_now,1))

for file_index in range(2, int(spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource_file_upto'))):
    f_now = spark.sparkContext.getConf().get(config_prefix+'.raw_covid_datasource').format(file_index=file_index)  
    raw_data_stores.append(fetch_and_load_raw_data(f_now,file_index))

Fetching first raw data file
Fetching data from https://api.covid19india.org/csv/latest/raw_data1.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data2.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data3.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data4.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data5.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data6.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data7.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covid19india.org/csv/latest/raw_data8.csv ...
File loaded, now adding in Dataframe...
Fetching data from https://api.covi

In [6]:
# raw_data_stores[0].printSchema()

cols_of_interest=['Date Announced','Detected State','Detected District','Gender','Age Bracket','Detected City','Nationality','Current Status','Status Change Date','Num Cases']

merged_data =raw_data_stores[0].select(cols_of_interest)
for i in range(1,len(raw_data_stores)):
    print("Now working on raw data",i)
    df = raw_data_stores[i].select(cols_of_interest)
    print("Obtained rows",df.count())
    merged_data=merged_data.union(df)

Now working on raw data 1
Obtained rows 10819
Now working on raw data 2
Obtained rows 10020
Now working on raw data 3
Obtained rows 18231
Now working on raw data 4
Obtained rows 20488
Now working on raw data 5
Obtained rows 23423
Now working on raw data 6
Obtained rows 22770
Now working on raw data 7
Obtained rows 22808
Now working on raw data 8
Obtained rows 26897
Now working on raw data 9
Obtained rows 23112
Now working on raw data 10
Obtained rows 29045
Now working on raw data 11
Obtained rows 22334
Now working on raw data 12
Obtained rows 24252
Now working on raw data 13
Obtained rows 27583
Now working on raw data 14
Obtained rows 27346
Now working on raw data 15
Obtained rows 26625
Now working on raw data 16
Obtained rows 27286
Now working on raw data 17
Obtained rows 24636
Now working on raw data 18
Obtained rows 25384
Now working on raw data 19
Obtained rows 26310
Now working on raw data 20
Obtained rows 25496
Now working on raw data 21
Obtained rows 25791
Now working on raw dat

In [7]:
from pyspark.sql.functions import col
from pyspark.sql.functions import to_date
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
merged_data.printSchema()


root
 |-- Date Announced: string (nullable = true)
 |-- Detected State: string (nullable = true)
 |-- Detected District: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age Bracket: string (nullable = true)
 |-- Detected City: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Current Status: string (nullable = true)
 |-- Status Change Date: string (nullable = true)
 |-- Num Cases: string (nullable = true)



In [8]:
merged_data_1=merged_data.select([to_date("Date Announced","dd/MM/yyyy").alias('Date'),
                                  col('Detected State').alias('State'),
                                  col('Detected District').alias('District'),
                                  'Gender',
                                  col('Age Bracket').alias('Age').cast(IntegerType()),
                                  col('Detected City').alias('City'),
                                  'Nationality',
                                  col('Current Status').alias('Status'),
                                  to_date('Status Change Date',"dd/MM/yyyy").alias('Status_Date'),
                                  col('Num Cases').alias('cases').cast(IntegerType())])


In [9]:
merged_data_1.show()

+----------+---------+----------+------+----+--------------------+-----------+------------+-----------+-----+
|      Date|    State|  District|Gender| Age|                City|Nationality|      Status|Status_Date|cases|
+----------+---------+----------+------+----+--------------------+-----------+------------+-----------+-----+
|2020-01-30|   Kerala|  Thrissur|     F|  20|            Thrissur|      India|   Recovered| 2020-02-14|    1|
|2020-02-02|   Kerala| Alappuzha|  null|null|           Alappuzha|      India|   Recovered| 2020-02-14|    1|
|2020-02-03|   Kerala| Kasaragod|  null|null|           Kasaragod|      India|   Recovered| 2020-02-14|    1|
|2020-03-02|    Delhi|East Delhi|     M|  45|East Delhi (Mayur...|      India|   Recovered| 2020-03-15|    1|
|2020-03-02|Telangana| Hyderabad|     M|  24|           Hyderabad|      India|   Recovered| 2020-03-02|    1|
|2020-03-03|Rajasthan|  Italians|     M|  69|              Jaipur|      Italy|   Recovered| 2020-03-03|    1|
|2020-03-0

In [10]:
merged_data_1.printSchema()

root
 |-- Date: date (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status_Date: date (nullable = true)
 |-- cases: integer (nullable = true)



In [11]:
# Shall we handle data merge after Pivot or before pivoting??
import pyspark.sql.functions as fns
pivoted_data=merged_data_1.withColumn('Status',fns.lower(col('Status'))).groupBy(["Date","State","District"]).pivot("Status").agg({"Cases":'sum'})

#merged_data_1.coalesce(24)

In [12]:

"""

root
 |-- Effective_date: string (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Deceased: integer (nullable = true)
 |-- Tested: integer (nullable = true)

"""
pivoted_data.printSchema()



root
 |-- Date: date (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- null: long (nullable = true)
 |-- deceased: long (nullable = true)
 |-- hospitalized: long (nullable = true)
 |-- migrated: long (nullable = true)
 |-- migrated_other: long (nullable = true)
 |-- recovered: long (nullable = true)



In [13]:
pivoted_data.show()

+----------+-----------------+-------------------+----+--------+------------+--------+--------------+---------+
|      Date|            State|           District|null|deceased|hospitalized|migrated|migrated_other|recovered|
+----------+-----------------+-------------------+----+--------+------------+--------+--------------+---------+
|2020-05-01|Jammu and Kashmir|          Baramulla|null|    null|           1|    null|          null|        5|
|2020-06-25|    Uttar Pradesh|             Hardoi|null|    null|        null|    null|          null|        5|
|2020-08-01|   Andhra Pradesh|            Kurnool|null|       6|        1234|    null|          null|     1217|
|2020-10-17|    Uttar Pradesh|          Kaushambi|null|    null|          16|    null|          null|       12|
|2020-10-25|      Uttarakhand|           Haridwar|null|    null|          30|    null|             1|       26|
|2020-11-15|   Madhya Pradesh|        Hoshangabad|null|       1|          17|    null|          null|   

In [14]:
districts_df = pivoted_data.select(col('Date').alias('Effective_date'),
                                   col('State'),
                                   col('District'),
                                   col('hospitalized').alias('Confirmed'),
                                   col('recovered').alias('Recovered'),
                                   col('deceased').alias('Deceased'))

# we are removing those lines where none of the data is numeric - this cleans the entire data set as well
# we can use filter or where either
districts_df = districts_df.filter(col('Confirmed').isNotNull() | col('Recovered').isNotNull() | col('Deceased').isNotNull())

In [15]:
districts_df.printSchema()

root
 |-- Effective_date: date (nullable = true)
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Confirmed: long (nullable = true)
 |-- Recovered: long (nullable = true)
 |-- Deceased: long (nullable = true)



In [16]:
districts_df.summary().show()

+-------+--------------------+-------------+-----------------+------------------+-----------------+
|summary|               State|     District|        Confirmed|         Recovered|         Deceased|
+-------+--------------------+-------------+-----------------+------------------+-----------------+
|  count|              221103|       218220|           207002|            172683|            56275|
|   mean|                null|         null|99.79734012231766| 98.05392540087907|4.004193691692581|
| stddev|                null|         null|465.4096633602333|442.97970033075364|11.80311483244228|
|    min|Andaman and Nicob...| Kamrup Rural|           -12822|            -11132|             -275|
|    25%|                null|         null|                5|                 5|                1|
|    50%|                null|         null|               16|                17|                2|
|    75%|                null|         null|               61|                61|                3|


In [17]:
districts_df.select(["State","Confirmed","Recovered","Deceased"]).where(col("State").isNotNull()).groupBy("State").agg({'Confirmed':'sum','Recovered':'sum','Deceased':'sum'}).show(40,False)

+----------------------------------------+--------------+-------------+--------------+
|State                                   |sum(Recovered)|sum(Deceased)|sum(Confirmed)|
+----------------------------------------+--------------+-------------+--------------+
|Nagaland                                |12357         |115          |14717         |
|Karnataka                               |1209892       |16530        |1690862       |
|Odisha                                  |410192        |2141         |479751        |
|Kerala                                  |1338973       |5507         |1701919       |
|Dadra and Nagar Haveli and Daman and Diu|6541          |4            |8345          |
|Ladakh                                  |13019         |151          |14560         |
|State Unassigned                        |null          |null         |0             |
|Tamil Nadu                              |1108436       |14589        |1249285       |
|Chhattisgarh                            |6

In [18]:
districts_df.write.partitionBy("State") \
    .mode("overwrite") \
    .format("parquet") \
    .saveAsTable("merged_and_clean_raw")

AnalysisException: Can not create the managed table('`merged_and_clean_raw`'). The associated location('file:/Users/dks/AMPBA/Term2/BDM2/Group%20Project/spark-warehouse/merged_and_clean_raw') already exists.

In [None]:
def duplicate_col_remover(df):
    unique_cols = []
    releated_cols = []    
    for i in range(len(df.columns)):
        if df.columns[i] not in unique_cols:
            unique_cols.append(df.columns[i])
        else:
            releated_cols.append(i)
    col_set=[]
    for i in range(len(df.columns)):
        col_set.append(str(i))

    df = df.toDF(*col_set)
    for dupcol in releated_cols:
        df = df.drop(str(dupcol))
    return df.toDF(*unique_cols)

d1 = spark.createDataFrame([{"a":12121,"b":24342,"c":3534},{"a":121,"b":242,"c":35},{"a":1121,"b":2432,"c":353},{"a":121,"b":242,"c":34}])
d2 =  spark.createDataFrame([{"d":1111,"b":24342,"c":3534},{"d":555,"b":242,"c":35},{"d":1343,"b":2432,"c":353},{"d":434,"b":43,"c":34}])

joint_housing_table  = d1.join(d2, (d1.b == d2.b) & (d1.c == d2.c),"inner")

duplicate_col_remover(joint_housing_table).show()

# joint_housing_table[0]

In [None]:
merged_data.printSchema()

In [None]:
merged_data.select(['Detected State','Detected District']).groupBy('Detected State').agg({'Detected District':'count'}).show()

In [None]:
f_now='https://api.covid19india.org/csv/latest/case_time_series.csv'
spark.sparkContext.addFile(f_now)
print("File loaded, now adding in Dataframe...")    
case_ts_data = spark.read.option("header", "true").csv(SparkFiles.get("case_time_series.csv"))

In [None]:
case_ts_data.show()

In [None]:
districts_data= (spark.read   
                .option("sep", ",").option("header",True)
                .csv('./Data/districts.csv'))


# districts_data = sc.textFile('./Data/districts.csv')
# # districts_data = sc.parallelize(districts_data)

In [None]:
districts_data.show(5,False)


In [None]:
districts_data.printSchema()
# districts_data = districts_data.map(lambda l:l.split(','))

In [None]:
districts_data.describe().show()

In [None]:
districts_data.unique('District')

In [None]:
stock_csv_req=requests.get(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource'))
# stock_csv_req.text
import pandas as pd

# df_stock=spark.createDataFrame(pd.read_csv(io.StringIO(stock_csv_req.text)))
df_stock=spark.createDataFrame(pd.read_csv(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource')))

df_stock.summary().show()


In [None]:
# print(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource'))
#Load df_stock
# spark.sparkContext.addFile(spark.sparkContext.getConf().get(config_prefix+'.stock_datasource'))

df_stock = spark.read.option("header", "true").format("csv").csv()

df_stock.show()

In [None]:
!ls /private/var/folders/3y/b7xzrwss1wg2drn7pkcst00m0000gn/T/spark-b44425e1-fa03-4e51-bf95-ff5fe3616da4/userFiles-1f24d8e8-41d9-47b0-adcd-0040e75b8faa/