# Problem for COVID-19 Data Analysis Project using Python

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Dataset link:  
URL = https://raw.githubusercontent.com/SR1608/Datasets/main/covid-data.csv

1. Import the dataset using pandas from the above-mentioned URL.

In [3]:
# Remote CSV holding the raw COVID-19 records; pandas fetches it over HTTPS.
url = (
    "https://raw.githubusercontent.com/SR1608/Datasets/main/covid-data.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

2. High Level Data Understanding

In [None]:
# a. Report the dataset dimensions as a (rows, columns) tuple
print("Number of rows and columns:", (len(df.index), len(df.columns)))

In [None]:
# b. List the dtype pandas inferred for every column
column_dtypes = df.dtypes
print("\nData types of columns:\n", column_dtypes)

In [None]:
# c. Info & describe of data in dataframe
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must be called on its own line; wrapping it in print() would show the report
# before the header and then print a stray "None".
print("\nDataframe Info:")
df.info()

# describe() returns a DataFrame of summary statistics, which print() can render.
print("\nDataframe Describe:\n", df.describe())

3. Low Level Data Understanding:

In [None]:
# a. Count the distinct (non-null) entries of the 'location' column
unique_locations = df['location'].nunique()
print("\nCount of unique values in 'location' column:", unique_locations)

In [None]:
# b. Tally the rows per continent, then pick the label with the largest tally
continent_counts = df['continent'].value_counts()
max_continent = continent_counts.idxmax()
print("\nContinent with maximum frequency:", max_continent)

In [None]:
# c. Largest and average value of the 'total_cases' column
total_cases = df['total_cases']
max_total_cases, mean_total_cases = total_cases.max(), total_cases.mean()
print("\nMaximum total cases:", max_total_cases)
print("Mean total cases:", mean_total_cases)

In [None]:
# d. Lower quartile, median and upper quartile of 'total_deaths'
quartile_levels = [0.25, 0.50, 0.75]
quartiles_total_deaths = df['total_deaths'].quantile(quartile_levels)
print("\nQuartile values for 'total_deaths':\n", quartiles_total_deaths)

In [None]:
# e. Locate the row with the highest 'human_development_index' and read its continent
max_hdi_row = df['human_development_index'].idxmax()
max_hdi_continent = df.loc[max_hdi_row, 'continent']
print("\nContinent with maximum 'human_development_index':", max_hdi_continent)

In [None]:
# f. Locate the row with the lowest 'gdp_per_capita' and read its continent
min_gdp_row = df['gdp_per_capita'].idxmin()
min_gdp_continent = df.loc[min_gdp_row, 'continent']
print("Continent with minimum 'gdp_per_capita':", min_gdp_continent)

4. Filter the dataframe to keep only these columns ['continent', 'location', 'date', 'total_cases', 'total_deaths', 'gdp_per_capita', 'human_development_index'] and update the data frame.

In [22]:
# Columns kept for the rest of the analysis; everything else is dropped.
selected_columns = [
    'continent',
    'location',
    'date',
    'total_cases',
    'total_deaths',
    'gdp_per_capita',
    'human_development_index',
]
df_filtered = df.loc[:, selected_columns]

5. Data Cleaning

In [None]:
# a. Drop rows that are exact duplicates of an earlier row
df_filtered = df_filtered.drop_duplicates()

# b. Count the missing entries in each column (reported before any rows are removed for nulls)
missing_values = df_filtered.isna().sum()
print("\nMissing values in columns:\n", missing_values)

# c. Discard rows without a continent, then
# d. replace every remaining missing value with 0
df_filtered = (
    df_filtered
    .dropna(subset=['continent'])
    .fillna(0)
)

6. Date time format

In [26]:
# a. Parse the 'date' column into datetime values, and
# b. derive the calendar month number from the parsed dates as a new 'month' column
df_filtered = df_filtered.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    month=lambda frame: frame['date'].dt.month,
)

7. Data Aggregation

In [None]:
# a. Find max value in all columns using groupby function on 'continent' column
# Aggregate the filtered and cleaned frame (df_filtered), not the raw df:
# grouping the raw frame would discard the column selection, the duplicate/NaN
# handling and the 'month' column produced by the previous steps, and would
# carry every raw column into df_groupby.
df_groupby = df_filtered.groupby('continent').max().reset_index()

# b. Display the resulting dataframe 'df_groupby'
print("\nDataframe after aggregation:\n", df_groupby)

8. Feature Engineering

In [30]:
# a. New feature 'total_deaths_to_total_cases': element-wise 'total_deaths' divided by 'total_cases'
deaths = df_groupby['total_deaths']
cases = df_groupby['total_cases']
df_groupby['total_deaths_to_total_cases'] = deaths.div(cases)

9. Data Visualization

In [None]:
# a. Perform Univariate analysis on 'gdp_per_capita' column by plotting a histogram.
# sns.distplot is deprecated and scheduled for removal from seaborn; sns.displot
# with kind='hist' is its documented replacement and draws a plain count
# histogram (no KDE curve), matching the previous kde=False behaviour.
sns.displot(data=df_groupby, x='gdp_per_capita', kind='hist')
plt.title('Histogram of GDP per Capita')
plt.show()

In [None]:
# b. Bivariate view: 'total_cases' (y) against 'gdp_per_capita' (x)
scatter_ax = sns.scatterplot(data=df_groupby, x='gdp_per_capita', y='total_cases')
scatter_ax.set_title('Scatter Plot of Total Cases vs GDP per Capita')
plt.show()

In [None]:
# c. Pairwise relationships between the columns of df_groupby
pair_grid = sns.pairplot(data=df_groupby)
plt.suptitle('Pairplot of df_groupby Dataset')
plt.show()

In [None]:
# d. One bar per continent, bar height given by 'total_cases'
bar_grid = sns.catplot(
    data=df_groupby,
    kind='bar',
    x='continent',
    y='total_cases',
)
plt.title('Bar Plot of Total Cases by Continent')
plt.show()

10. Save the df_groupby dataframe to your local drive using the pandas `DataFrame.to_csv` function.

In [None]:
# Write the aggregated frame to the working directory, leaving out the row index.
output_path = 'df_groupby.csv'
df_groupby.to_csv(output_path, index=False)