## Data Analysis of the covid_cases_upd.csv dataset

#### Libraries

In [1]:
# Libraries

import pyspark.pandas as ps
from pyspark.sql import SparkSession



#### Exploring the dataset

In [3]:
# Creating a spark session
spark = SparkSession.builder.appName('covid_cases_upd').getOrCreate()

# Reading the dataset with spark.
# The frame is scanned by several independent actions below (count, distinct,
# show, groupBy), so it is cached to avoid re-reading and re-parsing the CSV
# for every one of them.
dfs = spark.read.csv('./datasets/covid_cases_upd.csv', header=True, inferSchema=True).cache()

# Printing the dataset size
print(f'Dataset size: \nRows: {dfs.count()} and Columns: {len(dfs.columns)}')

# Printing column names (one per row, untruncated)
print('\nColumn names:\n')
spark.createDataFrame(dfs.columns, 'string').show(len(dfs.columns), truncate=False)

# Printing the schema
dfs.printSchema()

# Printing the first 5 rows
dfs.show(5)

# Continents: distinct non-null values of the `continent` column.
# Sorted so the printed list is stable across runs (distinct() alone does not
# guarantee any order), and collected through the DataFrame API rather than
# RDD lambdas.
print('\nContinents:')
uni_continent = [
    row['continent']
    for row in dfs.select('continent').distinct().sort('continent').collect()
    if row['continent'] is not None
]
print(uni_continent)
print(f'Number of Continents: {len(uni_continent)}')

# Countries: distinct non-null `location` values, alphabetically sorted,
# excluding any location whose name is also a continent name.
# NOTE(review): only continent-named locations are excluded here; if the
# dataset contains other aggregate rows they are still listed as countries —
# confirm against the data.
print('\nCountries:')
continent_names = set(uni_continent)  # set for O(1) membership tests
uni_country = [
    row['location']
    for row in dfs.select('location').distinct().sort('location', ascending=True).collect()
    if row['location'] is not None and row['location'] not in continent_names
]
print(uni_country)
print(f'Number of Countries: {len(uni_country)}\n')

# Rows per location, defined once and reused by both reports below.
# NOTE(review): this groups over every `location` value, including the
# continent-named ones that were excluded from `uni_country` above.
location_counts = dfs.groupBy('location').count()

# Printing the number of data points for each country
print('Number of data points for each country:')
location_counts.show()

# Printing the average number of data points for all countries
print('Avarage number of data points for all countries:')
location_counts.agg({'count': 'avg'}).show()

                                                                                

Dataset size: 
Rows: 349999 and Columns: 67

Column names:

+------------------------------------------+
|value                                     |
+------------------------------------------+
|iso_code                                  |
|continent                                 |
|location                                  |
|date                                      |
|total_cases                               |
|new_cases                                 |
|new_cases_smoothed                        |
|total_deaths                              |
|new_deaths                                |
|new_deaths_smoothed                       |
|total_cases_per_million                   |
|new_cases_per_million                     |
|new_cases_smoothed_per_million            |
|total_deaths_per_million                  |
|new_deaths_per_million                    |
|new_deaths_smoothed_per_million           |
|reproduction_rate                         |
|icu_patients                           