In [1]:
# Import all the important libraies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
# Load the data and overview it
df = pd.read_csv("country_wise_latest.csv")
df.head()


Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


The raw dataset contains country-level data with multiple numerical indicators.
Before analysis, column names, missing values, and data consistency must be checked.


In [4]:
# Standardize column names
df.columns = (
    df.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)
df.head()


Unnamed: 0,country/region,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered,deaths_/_100_cases,recovered_/_100_cases,deaths_/_100_recovered,confirmed_last_week,1_week_change,1_week_%_increase,who_region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


In [6]:
# Check missing values
df.isnull().sum()

# Fill or handle missing values
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(0)


Missing numerical values are filled with 0 to ensure consistency
for aggregation and analysis.


In [7]:
# Remove duplicates
df = df.drop_duplicates()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country/region          187 non-null    object 
 1   confirmed               187 non-null    int64  
 2   deaths                  187 non-null    int64  
 3   recovered               187 non-null    int64  
 4   active                  187 non-null    int64  
 5   new_cases               187 non-null    int64  
 6   new_deaths              187 non-null    int64  
 7   new_recovered           187 non-null    int64  
 8   deaths_/_100_cases      187 non-null    float64
 9   recovered_/_100_cases   187 non-null    float64
 10  deaths_/_100_recovered  187 non-null    float64
 11  confirmed_last_week     187 non-null    int64  
 12  1_week_change           187 non-null    int64  
 13  1_week_%_increase       187 non-null    float64
 14  who_region              187 non-null    ob

In [9]:
# structured analytical view
clean_df = df[
    [
        "country/region",
        "who_region",
        "confirmed",
        "deaths",
        "recovered",
        "active",
        "new_cases",
        "new_deaths",
        "new_recovered"
    ]
]
clean_df.head()



Unnamed: 0,country/region,who_region,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered
0,Afghanistan,Eastern Mediterranean,36263,1269,25198,9796,106,10,18
1,Albania,Europe,4880,144,2745,1991,117,6,63
2,Algeria,Africa,27973,1163,18837,7973,616,8,749
3,Andorra,Europe,907,52,803,52,10,0,0
4,Angola,Africa,950,41,242,667,18,1,0


In [10]:
# Summary statistics
clean_df.describe()


Unnamed: 0,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered
count,187.0,187.0,187.0,187.0,187.0,187.0,187.0
mean,88130.94,3497.518717,50631.48,34001.94,1222.957219,28.957219,933.812834
std,383318.7,14100.002482,190188.2,213326.2,5710.37479,120.037173,4197.719635
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1114.0,18.5,626.5,141.5,4.0,0.0,0.0
50%,5059.0,108.0,2815.0,1600.0,49.0,1.0,22.0
75%,40460.5,734.0,22606.0,9149.0,419.5,6.0,221.0
max,4290259.0,148011.0,1846641.0,2816444.0,56336.0,1076.0,33728.0


In [None]:
# Simple ranking
top_countries = clean_df.sort_values(
    by="confirmed",
    ascending=False
).head(10)

top_countries
#Save clean output
clean_df.to_csv("clean_data.csv", index=False) 



Unnamed: 0,country/region,who_region,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered
0,Afghanistan,Eastern Mediterranean,36263,1269,25198,9796,106,10,18
1,Albania,Europe,4880,144,2745,1991,117,6,63
2,Algeria,Africa,27973,1163,18837,7973,616,8,749
3,Andorra,Europe,907,52,803,52,10,0,0
4,Angola,Africa,950,41,242,667,18,1,0
5,Antigua and Barbuda,Americas,86,3,65,18,4,0,5
6,Argentina,Americas,167416,3059,72575,91782,4890,120,2057
7,Armenia,Europe,37390,711,26665,10014,73,6,187
8,Australia,Western Pacific,15303,167,9311,5825,368,6,137
9,Austria,Europe,20558,713,18246,1599,86,1,37
