In [1]:
import os
import findspark

SPARK_HOME = os.getenv("SPARK_HOME")
findspark.init(SPARK_HOME)

import pyspark
# from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as Fsum

import datetime

import glob
import numpy as np
import pandas as pd
from datetime import datetime 
%matplotlib inline
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import requests

import numpy as np
import re


url = "https://www.worldometers.info/coronavirus/"

source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')


In [2]:
table = soup.find("table", attrs={'id':'main_table_countries_today'})
table_head = table.thead.find_all('tr')
table_data = table.tbody.find_all('tr')

headings = []

for th in table_head[0].find_all("th"):
    a = th.text.replace('\n', ' ').strip()
    headings.append(a)

    print(a)



#
Country,Other
TotalCases
NewCases
TotalDeaths
NewDeaths
TotalRecovered
ActiveCases
Serious,Critical
Tot Cases/1M pop
Deaths/1M pop
TotalTests
Tests/ 1M pop
Population
Continent


In [3]:
# now that we understand the data, we can structure the data

data = []
for tr in table_data:
    t_row = {}
    # Each table row is stored in the form of
    # t_row = {'Country/Other': '', 'TotalCases': '', 'NewCases': ''...}

    # find all td's in tr and zip it with headings

    for td, th in zip(tr.find_all("td"), headings): 
        t_row[th] = td.text.replace('\n', '').strip()
    data.append(t_row)

In [4]:
spark = SparkSession \
    .builder \
    .appName("Data wrangling with Spark SQL") \
    .getOrCreate()

In [5]:
# user_log = spark.read.format('json').load(filepath)
df = spark.createDataFrame(data)

df.createOrReplaceTempView("df")

In [6]:
spark.sql("DESCRIBE df").show()

+----------------+---------+-------+
|        col_name|data_type|comment|
+----------------+---------+-------+
|               #|   string|   null|
|     ActiveCases|   string|   null|
|       Continent|   string|   null|
|   Country,Other|   string|   null|
|   Deaths/1M pop|   string|   null|
|        NewCases|   string|   null|
|       NewDeaths|   string|   null|
|      Population|   string|   null|
|Serious,Critical|   string|   null|
|   Tests/ 1M pop|   string|   null|
|      TotalCases|   string|   null|
|     TotalDeaths|   string|   null|
|  TotalRecovered|   string|   null|
|      TotalTests|   string|   null|
|Tot Cases/1M pop|   string|   null|
+----------------+---------+-------+



In [7]:
df.toPandas().head(15)

Unnamed: 0,#,ActiveCases,Continent,"Country,Other",Deaths/1M pop,NewCases,NewDeaths,Population,"Serious,Critical",Tests/ 1M pop,TotalCases,TotalDeaths,TotalRecovered,TotalTests,Tot Cases/1M pop
0,,1189148.0,North America,North America,,2919.0,436.0,,18977,,1765720,108253,468319.0,,
1,,852862.0,Europe,Europe,,9689.0,158.0,,10926,,1840356,165956,821538.0,,
2,,300140.0,South America,South America,,438.0,10.0,,10583,,520526,26635,193751.0,,
3,,340050.0,Asia,Asia,,4549.0,50.0,,5007,,868007,25903,502054.0,,
4,,55062.0,Africa,Africa,,518.0,10.0,,287,,97508,3020,39426.0,,
5,,544.0,Australia/Oceania,Oceania,,2.0,,,10,,8688,121,8023.0,,
6,,55.0,,,,,,,4,,721,15,651.0,,
7,,2737861.0,All,World,42.3,18115.0,664.0,,45794,,5101526,329903,2033762.0,,654.0
8,1.0,1127286.0,North America,USA,287.0,316.0,5.0,330785250.0,17815,42849.0,1593039,94941,370812.0,14173807.0,4816.0
9,2.0,221774.0,Europe,Russia,21.0,8849.0,127.0,145927633.0,2300,53451.0,317554,3099,92681.0,7800000.0,2176.0


## Dropping unwanted columns
* When I created the dataframe, the columns sorted alphabetically, though I don't why
* I will use the heading variable to get the list of columns to display and order it as it should
* If not sorted, i could have used `df.select([column for column in` df.columns `if column not in drop_list])`

In [8]:
x = [0, 9, 10, 12]
drop_list = [headings[i] for i, v in enumerate(headings) if i in x]

try:
    df = df.select([column for column in headings if column not in drop_list])

    df =  df.withColumnRenamed(headings[1], "Country")
    df =  df.withColumnRenamed(headings[8], "Critical")

except Exception as e:
    print(e)


In [9]:
df.toPandas().head(5)

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Critical,TotalTests,Population,Continent
0,North America,1765720,2919,108253,436,468319,1189148,18977,,,North America
1,Europe,1840356,9689,165956,158,821538,852862,10926,,,Europe
2,South America,520526,438,26635,10,193751,300140,10583,,,South America
3,Asia,868007,4549,25903,50,502054,340050,5007,,,Asia
4,Africa,97508,518,3020,10,39426,55062,287,,,Africa


## Cleaning Data
* I used regex to remove all symbols except dot
* I used for loop to pass the column names
* I also changed the column datatypes


In [10]:
for column in df.columns:
    # Remove symbols for the dataframe
    df = df.withColumn(f'{column}', regexp_replace(f'{column}', r"[^\.a-zA-Z0-9_]", ''))

    if column == df.columns[0] or column == df.columns[-1]:
        continue

    # Convert datatype to Double
    df = df.withColumn(f"{column}", df[f"{column}"].cast(DoubleType()))

df = df.na.fill(0)    
df.createOrReplaceTempView("df")

In [11]:
spark.sql("DESCRIBE df").show()

+--------------+---------+-------+
|      col_name|data_type|comment|
+--------------+---------+-------+
|       Country|   string|   null|
|    TotalCases|   double|   null|
|      NewCases|   double|   null|
|   TotalDeaths|   double|   null|
|     NewDeaths|   double|   null|
|TotalRecovered|   double|   null|
|   ActiveCases|   double|   null|
|      Critical|   double|   null|
|    TotalTests|   double|   null|
|    Population|   double|   null|
|     Continent|   string|   null|
+--------------+---------+-------+



## I created country and continent dataframe for the initial dataFrame

In [28]:
continent = spark.sql(f"""
        SELECT *
        FROM df
        WHERE country IN (SELECT country FROM df LIMIT 6)
            """)

continent.createOrReplaceTempView("continent")



country = spark.sql(f"""
        SELECT *
        FROM df
        WHERE country NOT IN (SELECT country FROM df LIMIT 8)
        ORDER BY TotalCases DESC
            """)

country.createOrReplaceTempView("country")

In [16]:
path = "Corona_data_cleaned.csv"
country.write.save(path, header=True, format="csv")

In [17]:
new = spark.read.csv(path, header=True)

In [19]:
country.toPandas().head(10)

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Critical,TotalTests,Population,Continent
0,USA,1593039.0,316.0,94941.0,5.0,370812.0,1127286.0,17815.0,14173807.0,330785250.0,NorthAmerica
1,Russia,317554.0,8849.0,3099.0,127.0,92681.0,221774.0,2300.0,7800000.0,145927633.0,Europe
2,Brazil,293357.0,0.0,18894.0,0.0,116683.0,157780.0,8318.0,735224.0,212389176.0,SouthAmerica
3,Spain,279524.0,0.0,27888.0,0.0,196958.0,54678.0,1152.0,3037840.0,46752802.0,Europe
4,UK,248293.0,0.0,35704.0,0.0,0.0,0.0,1559.0,2962227.0,67846185.0,Europe
5,Italy,227364.0,0.0,32330.0,0.0,132282.0,62752.0,676.0,3171719.0,60471440.0,Europe
6,France,181575.0,0.0,28132.0,0.0,63354.0,90089.0,1794.0,1384633.0,65257613.0,Europe
7,Germany,178531.0,0.0,8270.0,0.0,158000.0,12261.0,1045.0,3595059.0,83754315.0,Europe
8,Turkey,152587.0,0.0,4222.0,0.0,113987.0,34378.0,877.0,1696355.0,84235031.0,Asia
9,Iran,126949.0,0.0,7183.0,0.0,98808.0,20958.0,2735.0,731213.0,83868516.0,Asia


In [22]:
lower = udf(lambda x: x.lower())


In [29]:
spark.sql("""
        SELECT lower(Country) AS Country, *
        FROM continent
""").toPandas()

Unnamed: 0,Country,Country.1,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Critical,TotalTests,Population,Continent
0,northamerica,NorthAmerica,1765720.0,2919.0,108253.0,436.0,468319.0,1189148.0,18977.0,0.0,0.0,NorthAmerica
1,europe,Europe,1840356.0,9689.0,165956.0,158.0,821538.0,852862.0,10926.0,0.0,0.0,Europe
2,southamerica,SouthAmerica,520526.0,438.0,26635.0,10.0,193751.0,300140.0,10583.0,0.0,0.0,SouthAmerica
3,asia,Asia,868007.0,4549.0,25903.0,50.0,502054.0,340050.0,5007.0,0.0,0.0,Asia
4,africa,Africa,97508.0,518.0,3020.0,10.0,39426.0,55062.0,287.0,0.0,0.0,Africa
5,oceania,Oceania,8688.0,2.0,121.0,0.0,8023.0,544.0,10.0,0.0,0.0,AustraliaOceania


In [26]:
continent = continent.withColumn("Country", lower(continent["Country"]))

In [27]:
continent.toPandas().head(10)

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Critical,TotalTests,Population,Continent,hour
0,northamerica,1765720.0,2919.0,108253.0,436.0,468319.0,1189148.0,18977.0,0.0,0.0,NorthAmerica,northamerica
1,europe,1840356.0,9689.0,165956.0,158.0,821538.0,852862.0,10926.0,0.0,0.0,Europe,europe
2,southamerica,520526.0,438.0,26635.0,10.0,193751.0,300140.0,10583.0,0.0,0.0,SouthAmerica,southamerica
3,asia,868007.0,4549.0,25903.0,50.0,502054.0,340050.0,5007.0,0.0,0.0,Asia,asia
4,africa,97508.0,518.0,3020.0,10.0,39426.0,55062.0,287.0,0.0,0.0,Africa,africa
5,oceania,8688.0,2.0,121.0,0.0,8023.0,544.0,10.0,0.0,0.0,AustraliaOceania,oceania
