# Data Preprocessing

Preparing the data before building the model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Begin by loading the source data into a DataFrame and inspecting it

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
file_path = "./data/world_population_data.csv"

# Parse the CSV into a DataFrame; the bare name on the last line renders it.
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,rank,cca3,country,continent,2023 population,2022 population,2020 population,2015 population,2010 population,2000 population,1990 population,1980 population,1970 population,area (km²),density (km²),growth rate,world percentage
0,1,IND,India,Asia,1428627663,1417173173,1396387127,1322866505,1240613620,1059633675,870452165,696828385,557501301,3287590.00,481,0.81%,17.85%
1,2,CHN,China,Asia,1425671352,1425887337,1424929781,1393715448,1348191368,1264099069,1153704252,982372466,822534450,9706961.00,151,-0.02%,17.81%
2,3,USA,United States,North America,339996563,338289857,335942003,324607776,311182845,282398554,248083732,223140018,200328340,9372610.00,37,0.50%,4.25%
3,4,IDN,Indonesia,Asia,277534122,275501339,271857970,259091970,244016173,214072421,182159874,148177096,115228394,1904569.00,148,0.74%,3.47%
4,5,PAK,Pakistan,Asia,240485658,235824862,227196741,210969298,194454498,154369924,115414069,80624057,59290872,881912.00,312,1.98%,3.00%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,230,MSR,Montserrat,North America,4386,4390,4500,5059,4938,5138,10805,11452,11402,102.00,43,-0.09%,0.00%
230,231,FLK,Falkland Islands,South America,3791,3780,3747,3408,3187,3080,2332,2240,2274,12173.00,0,0.29%,0.00%
231,232,NIU,Niue,Oceania,1935,1934,1942,1847,1812,2074,2533,3637,5185,261.00,7,0.05%,0.00%
232,233,TKL,Tokelau,Oceania,1893,1871,1827,1454,1367,1666,1669,1647,1714,12.00,189,1.18%,0.00%


Check for missing values

In [3]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rank              234 non-null    int64  
 1   cca3              234 non-null    object 
 2   country           234 non-null    object 
 3   continent         234 non-null    object 
 4   2023 population   234 non-null    int64  
 5   2022 population   234 non-null    int64  
 6   2020 population   234 non-null    int64  
 7   2015 population   234 non-null    int64  
 8   2010 population   234 non-null    int64  
 9   2000 population   234 non-null    int64  
 10  1990 population   234 non-null    int64  
 11  1980 population   234 non-null    int64  
 12  1970 population   234 non-null    int64  
 13  area (km²)        234 non-null    float64
 14  density (km²)     234 non-null    int64  
 15  growth rate       234 non-null    object 
 16  world percentage  234 non-null    object 
dtypes: float64(1), int64(11), object(5)

In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,rank,2023 population,2022 population,2020 population,2015 population,2010 population,2000 population,1990 population,1980 population,1970 population,area (km²),density (km²)
count,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0
mean,117.5,34374420.0,34074410.0,33501070.0,31729960.0,29845240.0,26269470.0,22710220.0,18984620.0,15786910.0,581450.0,451.282051
std,67.694165,137386400.0,136766400.0,135589900.0,130405000.0,124218500.0,111698200.0,97832170.0,81785190.0,67795090.0,1761841.0,1979.398922
min,1.0,518.0,510.0,520.0,564.0,596.0,651.0,700.0,733.0,752.0,0.44,0.0
25%,59.25,422598.2,419738.5,415284.5,404676.0,393149.0,327242.0,264115.8,229614.2,155997.0,2650.0,39.5
50%,117.5,5643895.0,5559944.0,5493074.0,5307400.0,4942770.0,4292907.0,3825410.0,3141146.0,2604830.0,81199.5,97.5
75%,175.75,23245370.0,22476500.0,21447980.0,19730850.0,19159570.0,15762300.0,11869230.0,9826054.0,8817329.0,430425.8,242.75
max,234.0,1428628000.0,1425887000.0,1424930000.0,1393715000.0,1348191000.0,1264099000.0,1153704000.0,982372500.0,822534400.0,17098240.0,21403.0


In [5]:
# Count the null entries in every column; an all-zero result means nothing
# needs to be imputed or dropped.
missing_values = df.isnull().sum()

# Pass sep="\n" so the header sits on its own line. Embedding "\n" in the
# header string makes print() insert its default space separator right after
# the newline, which shifts the first row of the report one column to the right.
print("Missing Values:", missing_values, sep="\n")

Missing Values:
 rank                0
cca3                0
country             0
continent           0
2023 population     0
2022 population     0
2020 population     0
2015 population     0
2010 population     0
2000 population     0
1990 population     0
1980 population     0
1970 population     0
area (km²)          0
density (km²)       0
growth rate         0
world percentage    0
dtype: int64


No missing values detected

## Exploratory Data Analysis (EDA):

In [6]:
# Add up the 2023 population column across every row of the dataset
# and print the resulting total.
total_population_2023 = df.loc[:, '2023 population'].sum()
print(total_population_2023)

8043615390


## Feature Engineering


Encoding Categorical Variables

In [8]:
from sklearn.preprocessing import LabelEncoder

# Turn each continent name into an integer code and keep the codes in a new
# column next to the original labels. LabelEncoder numbers the sorted class
# labels starting at 0.
# NOTE(review): integer codes imply an ordering between continents; confirm
# the downstream model treats this column as categorical rather than ordinal.
le = LabelEncoder()
continent_codes = le.fit_transform(df['continent'])
df['continent_encoded'] = continent_codes

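The continents are encoded as integers from 0 to 5 (six classes): `LabelEncoder` numbers the sorted labels starting at 0, so Asia is 1 and South America is 5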

In [9]:
# Re-display the frame to confirm that the new continent_encoded column was added.
df

Unnamed: 0,rank,cca3,country,continent,2023 population,2022 population,2020 population,2015 population,2010 population,2000 population,1990 population,1980 population,1970 population,area (km²),density (km²),growth rate,world percentage,continent_encoded
0,1,IND,India,Asia,1428627663,1417173173,1396387127,1322866505,1240613620,1059633675,870452165,696828385,557501301,3287590.00,481,0.81%,17.85%,1
1,2,CHN,China,Asia,1425671352,1425887337,1424929781,1393715448,1348191368,1264099069,1153704252,982372466,822534450,9706961.00,151,-0.02%,17.81%,1
2,3,USA,United States,North America,339996563,338289857,335942003,324607776,311182845,282398554,248083732,223140018,200328340,9372610.00,37,0.50%,4.25%,3
3,4,IDN,Indonesia,Asia,277534122,275501339,271857970,259091970,244016173,214072421,182159874,148177096,115228394,1904569.00,148,0.74%,3.47%,1
4,5,PAK,Pakistan,Asia,240485658,235824862,227196741,210969298,194454498,154369924,115414069,80624057,59290872,881912.00,312,1.98%,3.00%,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,230,MSR,Montserrat,North America,4386,4390,4500,5059,4938,5138,10805,11452,11402,102.00,43,-0.09%,0.00%,3
230,231,FLK,Falkland Islands,South America,3791,3780,3747,3408,3187,3080,2332,2240,2274,12173.00,0,0.29%,0.00%,5
231,232,NIU,Niue,Oceania,1935,1934,1942,1847,1812,2074,2533,3637,5185,261.00,7,0.05%,0.00%,4
232,233,TKL,Tokelau,Oceania,1893,1871,1827,1454,1367,1666,1669,1647,1714,12.00,189,1.18%,0.00%,4
