## Librerías

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession 
import logging
import os
import pyspark.sql.functions as F
import datetime, time
from pyspark.sql.functions import regexp_replace, to_date
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType, DateType   
import pandas as pd

In [2]:
# Local Spark session for this notebook, using every available core.
spark = (
    SparkSession.builder.appName('TestEK')
    .config('spark.master', 'local[*]')
    # The shuffle-partition setting lives under the `spark.sql.` namespace.
    # The previous key ('spark.shuffle.sql.partitions') is not a Spark setting,
    # so the value 1 was never applied.
    .config('spark.sql.shuffle.partitions', 1)
    .getOrCreate()
)
# SparkSession's constructor takes a SparkContext, not another SparkSession.
sqlContext = SparkSession(spark.sparkContext)

In [3]:
## spark.stop()

#### initialize logger

In [4]:
# Send INFO-and-above records to CodePandas_EK.log, opened in write mode ('w').
# Each record looks like "[HH:MM:SS] - LEVEL - message".
logging.basicConfig(
    filename='CodePandas_EK.log',
    filemode='w',
    level=logging.INFO,
    format='[%(asctime)s] - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
)

## Cargar los datasets utilizando pandas y mantenerlos en formato parquet

#### Class definitions

In [5]:
class MyFile:
    """Load one CSV dataset with pandas and keep a parquet copy of it up to date.

    Typical sequence: ``readNews()`` -> ``updateSchema()`` -> ``initialLoad()``
    (no parquet copy yet) or ``incrementalLoad()`` (parquet copy exists).
    The index of ``dfNews`` is the row key used by the incremental load.
    """

    # Directory that holds the parquet copies, one file per dataset.
    PARQUET_DIR = "parquet/pandas/"

    def __init__(self, path, name):
        """Remember where the CSV lives and log the start of the load.

        Args:
            path: Directory that contains the source CSV file.
            name: Dataset name, i.e. the CSV file name without ``.csv``.
        """
        self.path = path
        self.name = name
        # Placeholder; readNews() replaces it with a DataFrame.
        self.dfNews = []
        logging.info("*********************************************************")
        # NOTE(review): the message says "Spark" although the load is done with pandas.
        logging.info(f"Iniciando carga de {self.name} con Spark.")

    def _parquetPath(self):
        """Return the path of this dataset's parquet copy."""
        return self.PARQUET_DIR + self.name + ".parquet"

    def readNews(self):
        """Read ``<path>/<name>.csv`` into ``self.dfNews`` and log its row count."""
        # Use the directory passed to the constructor; it used to be ignored in
        # favour of a hard-coded "archive/" prefix.
        csvPath = os.path.join(self.path, self.name + ".csv")
        self.dfNews = pd.read_csv(csvPath, sep=',', header='infer')
        logging.info(f"El archivo nuevo tiene {len(self.dfNews)} registros.")

    def updateSchema(self):
        """Hook for subclasses; the base implementation leaves ``dfNews`` unchanged."""
        pass

    def initialLoad(self):
        """Write ``dfNews`` as the first parquet copy of the dataset."""
        # A fresh checkout may not have the target directory yet.
        os.makedirs(self.PARQUET_DIR, exist_ok=True)
        self.dfNews.to_parquet(self._parquetPath())
        logging.info(f"Carga incial completada. Se cargaron {len(self.dfNews)} registros.")

    @staticmethod
    def _upsert(existing, news):
        """Merge ``news`` into ``existing`` using the index as the row key.

        Args:
            existing: DataFrame currently stored in the parquet copy.
            news: DataFrame freshly read from the CSV, indexed the same way.

        Returns:
            ``(merged, insertCount, updateCount)`` where ``merged`` holds the
            rows of ``existing`` whose key is absent from ``news`` followed by
            every row of ``news``; ``insertCount`` is the number of ``news``
            rows whose key is not in ``existing``; ``updateCount`` is the number
            of ``news`` rows whose key exists but whose values differ.
        """
        isInsert = ~news.index.isin(existing.index)
        insertCount = int(isInsert.sum())

        # Compare key AND values: moving the index into columns makes it part of
        # the merge keys, so a row only counts as unchanged when the same key
        # carries the same values (a value-only merge can match another row).
        overlap = news[~isInsert]
        if overlap.empty:
            updateCount = 0
        else:
            probe = overlap.reset_index().merge(
                existing.reset_index(), how='left', indicator=True
            )
            updateCount = int((probe['_merge'] == 'left_only').sum())

        # Replace whole rows instead of DataFrame.update(), which skips NaN and
        # would therefore keep stale values for fields cleared in the source.
        untouched = existing[~existing.index.isin(news.index)]
        parts = [part for part in (untouched, news) if not part.empty]
        merged = pd.concat(parts) if parts else news
        return merged, insertCount, updateCount

    def incrementalLoad(self):
        """Upsert ``dfNews`` into the existing parquet copy and log the counts."""
        target = self._parquetPath()
        existing = pd.read_parquet(target)
        logging.info(f"El archivo destino tiene {len(existing)} registros.")

        merged, insertCount, updateCount = self._upsert(existing, self.dfNews)

        merged.to_parquet(target)
        logging.info(f"Se actualizaron {updateCount} registros y se insertaron {insertCount} nuevos.")

###### CountryWiseLatest

In [6]:
class CountryWiseLatest(MyFile):
    """Loader used for the ``country_wise_latest`` dataset.

    The constructor is inherited from ``MyFile`` and takes ``(path, name)``.
    """

    def updateSchema(self):
        """Rename the columns below, then index the rows by Country and WHORegion."""
        newNames = {
            "Country/Region": "Country",
            "New cases": "NewCases",
            "New deaths": "NewDeaths",
            "New recovered": "NewRecovered",
            "Deaths / 100 Cases": "Deaths100Cases",
            "Recovered / 100 Cases": "Recovered100Cases",
            "Deaths / 100 Recovered": "Deaths100Recovered",
            "Confirmed last week": "ConfirmedLastWeek",
            "1 week change": "weekChange",
            "1 week % increase": "weekIncrease",
            "WHO Region": "WHORegion",
        }
        renamed = self.dfNews.rename(columns=newNames)
        self.dfNews = renamed.set_index(["Country", "WHORegion"])

###### FullGrouped

In [7]:
class FullGrouped(MyFile):
    """Loader used for the ``full_grouped`` dataset.

    The constructor is inherited from ``MyFile`` and takes ``(path, name)``.
    """

    def updateSchema(self):
        """Parse ``Date`` (YYYY-MM-DD), rename columns, index by Date/Country/WHORegion."""
        # The parsed dates are written back into the frame before the rename.
        parsedDates = pd.to_datetime(self.dfNews["Date"], format="%Y-%m-%d")
        self.dfNews["Date"] = parsedDates

        newNames = {
            "Country/Region": "Country",
            "New cases": "NewCases",
            "New deaths": "NewDeaths",
            "New recovered": "NewRecovered",
            "WHO Region": "WHORegion",
        }
        renamed = self.dfNews.rename(columns=newNames)
        self.dfNews = renamed.set_index(["Date", "Country", "WHORegion"])

###### Covid19CleanComplete

In [8]:
class Covid19CleanComplete(MyFile):
    """Loader used for the ``covid_19_clean_complete`` dataset.

    The constructor is inherited from ``MyFile`` and takes ``(path, name)``.
    """

    def updateSchema(self):
        """Parse ``Date`` (YYYY-MM-DD), rename columns, index by Date/WHORegion/Country/State."""
        # The parsed dates are written back into the frame before the rename.
        parsedDates = pd.to_datetime(self.dfNews["Date"], format="%Y-%m-%d")
        self.dfNews["Date"] = parsedDates

        newNames = {
            "Province/State": "State",
            "Country/Region": "Country",
            "WHO Region": "WHORegion",
        }
        renamed = self.dfNews.rename(columns=newNames)
        self.dfNews = renamed.set_index(["Date", "WHORegion", "Country", "State"])

###### WorldometerData

In [9]:
class WorldometerData(MyFile):
    """Loader used for the ``worldometer_data`` dataset.

    The constructor is inherited from ``MyFile`` and takes ``(path, name)``.
    """

    def updateSchema(self):
        """Rename the columns below, then index the rows by WHORegion/Continent/Country."""
        newNames = {
            "Country/Region": "Country",
            "Serious,Critical": "Serious",
            "Tot Cases/1M pop": "TotCases1MPop",
            "Deaths/1M pop": "Deaths1MPop",
            "Tests/1M pop": "Tests1MPop",
            "WHO Region": "WHORegion",
        }
        renamed = self.dfNews.rename(columns=newNames)
        self.dfNews = renamed.set_index(["WHORegion", "Continent", "Country"])

###### DayWise

In [10]:
class DayWise(MyFile):
    """Loader used for the ``day_wise`` dataset.

    The constructor is inherited from ``MyFile`` and takes ``(path, name)``.
    """

    def updateSchema(self):
        """Parse ``Date`` (YYYY-MM-DD), rename columns, index the rows by Date."""
        # The parsed dates are written back into the frame before the rename.
        parsedDates = pd.to_datetime(self.dfNews["Date"], format="%Y-%m-%d")
        self.dfNews["Date"] = parsedDates

        newNames = {
            "New cases": "NewCases",
            "New deaths": "NewDeaths",
            "New recovered": "NewRecovered",
            "Deaths / 100 Cases": "Deaths100Cases",
            "Recovered / 100 Cases": "Recovered100Cases",
            "Deaths / 100 Recovered": "Deaths100Recovered",
            "No. of countries": "CountriesNr",
        }
        renamed = self.dfNews.rename(columns=newNames)
        self.dfNews = renamed.set_index("Date")

###### UsaCountyWise

In [11]:
class UsaCountyWise(MyFile):
    """Loader used for the ``usa_county_wise`` dataset.

    The constructor is inherited from ``MyFile`` and takes ``(path, name)``.
    """

    def updateSchema(self):
        """Parse ``Date`` (MM/DD/YY), rename columns, index by Date/State/Country/Admin2."""
        # This dataset writes dates as month/day/two-digit-year.
        parsedDates = pd.to_datetime(self.dfNews["Date"], format="%m/%d/%y")
        self.dfNews["Date"] = parsedDates

        newNames = {
            "Province_State": "State",
            "Country_Region": "Country",
            "Long_": "Long",
            "Combined_Key": "CombinedKey",
        }
        renamed = self.dfNews.rename(columns=newNames)
        self.dfNews = renamed.set_index(["Date", "State", "Country", "Admin2"])

#### Main

In [12]:
# Directory with the source CSV files.
path = "./archive/"
# Dataset names are the CSV file names without their extension.
# splitext() removes only the final ".csv"; slicing at the first "." truncated
# any name that contains another dot. Sorted so the processing order is the
# same on every run (os.listdir returns entries in arbitrary order).
files = sorted(os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith('.csv'))

In [13]:
# Loader class for each known dataset name.
loaders = {
    "country_wise_latest": CountryWiseLatest,
    "full_grouped": FullGrouped,
    "covid_19_clean_complete": Covid19CleanComplete,
    "day_wise": DayWise,
    "usa_county_wise": UsaCountyWise,
    "worldometer_data": WorldometerData,
}

# Make sure the target directory exists so the existence check below cannot fail.
parquetDir = "./parquet/pandas"
os.makedirs(parquetDir, exist_ok=True)

for x in files:
    loaderClass = loaders.get(x)
    if loaderClass is None:
        # With the if/elif chain an unknown CSV reused the loader left over from
        # the previous iteration (or raised NameError on the first one).
        logging.warning(f"No hay un cargador definido para {x}; se omite el archivo.")
        continue

    file = loaderClass(path, x)
    file.readNews()
    file.updateSchema()
    # First run for a dataset writes it whole; later runs upsert into the copy.
    if os.path.exists(os.path.join(parquetDir, x + ".parquet")):
        file.incrementalLoad()
    else:
        file.initialLoad()
    print(x)

country_wise_latest
covid_19_clean_complete
day_wise
full_grouped
usa_county_wise
worldometer_data


In [22]:
# Read back the parquet copy of full_grouped to inspect what the load wrote.
df =pd.read_parquet("parquet/pandas/full_grouped.parquet")
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Confirmed,Deaths,Recovered,Active,NewCases,NewDeaths,NewRecovered
Date,Country,WHORegion,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-22,Afghanistan,Eastern Mediterranean,0,0,0,0,0,0,0
2020-01-22,Albania,Europe,0,0,0,0,0,0,0
2020-01-22,Algeria,Africa,0,0,0,0,0,0,0
2020-01-22,Andorra,Europe,0,0,0,0,0,0,0
2020-01-22,Angola,Africa,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2020-07-27,West Bank and Gaza,Eastern Mediterranean,10621,78,3752,6791,152,2,0
2020-07-27,Western Sahara,Africa,10,1,8,1,0,0,0
2020-07-27,Yemen,Eastern Mediterranean,1691,483,833,375,10,4,36
2020-07-27,Zambia,Africa,4552,140,2815,1597,71,1,465
