### Loading and inspecting dataset

In [2]:
#import pandas as pd
import pandas as pd

In [3]:
#read the csv file 
filepath = "DataFiles/global_unemployment_data.csv"
df = pd.read_csv(filepath)

In [4]:
#Load the first rows 
df.head()

Unnamed: 0,country_name,indicator_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,Unemployment rate by sex and age,Female,15-24,Youth,13.34,15.974,18.57,21.137,20.649,20.154,21.228,21.64,30.561,32.2,33.332
1,Afghanistan,Unemployment rate by sex and age,Female,25+,Adults,8.576,9.014,9.463,9.92,11.223,12.587,14.079,14.415,23.818,26.192,28.298
2,Afghanistan,Unemployment rate by sex and age,Female,Under 15,Children,10.306,11.552,12.789,14.017,14.706,15.418,16.783,17.134,26.746,29.193,30.956
3,Afghanistan,Unemployment rate by sex and age,Male,15-24,Youth,9.206,11.502,13.772,16.027,15.199,14.361,14.452,15.099,16.655,18.512,19.77
4,Afghanistan,Unemployment rate by sex and age,Male,25+,Adults,6.463,6.879,7.301,7.728,7.833,7.961,8.732,9.199,11.357,12.327,13.087


In [5]:
#check the number of rows and columns in our dataset
df.shape

(1134, 16)

In [6]:
#Check the column names
df.columns

Index(['country_name', 'indicator_name', 'sex', 'age_group', 'age_categories',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023', '2024'],
      dtype='object')

In [7]:
#Check the values
df.values

array([['Afghanistan', 'Unemployment rate by sex and age', 'Female', ...,
        30.561, 32.2, 33.332],
       ['Afghanistan', 'Unemployment rate by sex and age', 'Female', ...,
        23.818, 26.192, 28.298],
       ['Afghanistan', 'Unemployment rate by sex and age', 'Female', ...,
        26.746, 29.193, 30.956],
       ...,
       ['Zimbabwe', 'Unemployment rate by sex and age', 'Male', ...,
        13.966, 13.675, 13.485],
       ['Zimbabwe', 'Unemployment rate by sex and age', 'Male', ...,
        6.858, 6.757, 6.703],
       ['Zimbabwe', 'Unemployment rate by sex and age', 'Male', ...,
        8.842, 8.713, 8.632]], dtype=object)

In [8]:
#statistical summary of the dataset
df.describe()

Unnamed: 0,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
count,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1128.0,1122.0,1122.0
mean,11.3878,11.272444,11.122963,10.863516,10.516499,10.311452,11.851285,11.422645,10.340361,9.985181,9.940089
std,11.119002,10.915942,10.742947,10.64098,10.527773,10.297952,11.23158,10.873412,10.26481,9.987778,9.977512
min,0.027,0.034,0.038,0.035,0.044,0.036,0.056,0.064,0.067,0.063,0.06
25%,3.9335,3.9935,3.94525,3.7475,3.67275,3.5385,4.3345,4.1535,3.55525,3.4775,3.45975
50%,7.6975,7.5475,7.5045,7.1405,6.706,6.6275,8.0675,7.5425,6.5715,6.466,6.364
75%,15.05075,14.76625,14.4675,14.142,13.343,13.2855,15.31625,14.8815,13.41,12.9145,12.68775
max,74.485,74.655,74.72,75.416,76.395,77.173,83.99,82.135,78.776,78.541,78.644


### Data Cleaning

In [10]:
#Checking for null values
df.isnull().sum()

country_name       0
indicator_name     0
sex                0
age_group          0
age_categories     0
2014               0
2015               0
2016               0
2017               0
2018               0
2019               0
2020               0
2021               0
2022               6
2023              12
2024              12
dtype: int64

In [11]:
#Drop null values
df= df.dropna()

In [12]:
df.isnull().sum()

country_name      0
indicator_name    0
sex               0
age_group         0
age_categories    0
2014              0
2015              0
2016              0
2017              0
2018              0
2019              0
2020              0
2021              0
2022              0
2023              0
2024              0
dtype: int64

In [13]:
#Check for duplicates
df[df.duplicated()]

Unnamed: 0,country_name,indicator_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024


In [14]:
#Check data type
df.dtypes

country_name       object
indicator_name     object
sex                object
age_group          object
age_categories     object
2014              float64
2015              float64
2016              float64
2017              float64
2018              float64
2019              float64
2020              float64
2021              float64
2022              float64
2023              float64
2024              float64
dtype: object

In [15]:
#drop column
df=df.drop(columns = ["indicator_name"])

In [16]:
df

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,Female,15-24,Youth,13.340,15.974,18.570,21.137,20.649,20.154,21.228,21.640,30.561,32.200,33.332
1,Afghanistan,Female,25+,Adults,8.576,9.014,9.463,9.920,11.223,12.587,14.079,14.415,23.818,26.192,28.298
2,Afghanistan,Female,Under 15,Children,10.306,11.552,12.789,14.017,14.706,15.418,16.783,17.134,26.746,29.193,30.956
3,Afghanistan,Male,15-24,Youth,9.206,11.502,13.772,16.027,15.199,14.361,14.452,15.099,16.655,18.512,19.770
4,Afghanistan,Male,25+,Adults,6.463,6.879,7.301,7.728,7.833,7.961,8.732,9.199,11.357,12.327,13.087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,Zimbabwe,Female,25+,Adults,4.262,4.586,4.773,4.900,5.027,5.309,6.345,7.792,7.704,7.597,7.528
1130,Zimbabwe,Female,Under 15,Children,5.174,5.666,6.014,6.289,6.555,6.990,8.494,9.853,9.670,9.518,9.414
1131,Zimbabwe,Male,15-24,Youth,6.007,7.131,8.119,9.029,9.927,11.048,13.191,14.261,13.966,13.675,13.485
1132,Zimbabwe,Male,25+,Adults,3.677,4.325,4.843,5.287,5.722,6.362,6.996,7.234,6.858,6.757,6.703


In [17]:
#check outliers for each years(2014)
q3 = df["2014"].quantile(0.75)
q1 = df["2014"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

31.332125000000005
-12.516875000000002


In [18]:
#remove outliers
df = df[(df["2014"]>= lower_bound) & (df["2014"]<=upper_bound)]

In [19]:
#check outliers for each years(2014)
q3 = df["2015"].quantile(0.75)
q1 = df["2015"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

df = df[(df["2015"]>= lower_bound) & (df["2015"]<=upper_bound)]

25.384999999999998
-9.255


In [20]:
#check outliers for each years(2015)
q3 = df["2015"].quantile(0.75)
q1 = df["2015"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2015"]>= lower_bound) & (df["2015"]<=upper_bound)]

23.822625000000002
-8.400375


In [21]:
#check outliers for each years(2016)
q3 = df["2016"].quantile(0.75)
q1 = df["2016"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2016"]>= lower_bound) & (df["2016"]<=upper_bound)]

22.911500000000004
-7.940500000000002


In [22]:
#check outliers for each years(2017)
q3 = df["2017"].quantile(0.75)
q1 = df["2017"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2017"]>= lower_bound) & (df["2017"]<=upper_bound)]

21.8125
-7.495499999999999


In [23]:
#check outliers for each years(2018)
q3 = df["2018"].quantile(0.75)
q1 = df["2018"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2018"]>= lower_bound) & (df["2018"]<=upper_bound)]

20.560000000000002
-7.020000000000001


In [24]:
#check outliers for each years(2019)
q3 = df["2019"].quantile(0.75)
q1 = df["2019"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2019"]>= lower_bound) & (df["2019"]<=upper_bound)]

19.683
-6.593


In [25]:
#check outliers for each years(2020)
q3 = df["2020"].quantile(0.75)
q1 = df["2020"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2020"]>= lower_bound) & (df["2020"]<=upper_bound)]

22.311249999999994
-7.134749999999998


In [26]:
#check outliers for each years(2021)
q3 = df["2021"].quantile(0.75)
q1 = df["2021"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2021"]>= lower_bound) & (df["2021"]<=upper_bound)]

21.067
-6.533000000000001


In [27]:
#check outliers for each years(2022)
q3 = df["2022"].quantile(0.75)
q1 = df["2022"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2022"]>= lower_bound) & (df["2022"]<=upper_bound)]

18.24425
-5.80175


In [28]:
#check outliers for each years(2023)
q3 = df["2023"].quantile(0.75)
q1 = df["2023"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2023"]>= lower_bound) & (df["2023"]<=upper_bound)]

16.749499999999998
-5.1324999999999985


In [29]:
#check outliers for each years(2024)
q3 = df["2024"].quantile(0.75)
q1 = df["2024"].quantile(0.25)

iqr= q3-q1

upper_bound = q3 + 1.5 *iqr
lower_bound = q1 - 1.5 * iqr

print(upper_bound)
print(lower_bound)

#remove outliers
df = df[(df["2024"]>= lower_bound) & (df["2024"]<=upper_bound)]

16.2565
-4.845499999999999


In [30]:
#check the remaining number of rows
df.shape

(896, 15)

### Data Manipulation

In [32]:
#Filter country by Afghanistan
Afghanistan=df[df["country_name"]== "Afghanistan"]

In [33]:
Afghanistan

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
4,Afghanistan,Male,25+,Adults,6.463,6.879,7.301,7.728,7.833,7.961,8.732,9.199,11.357,12.327,13.087
5,Afghanistan,Male,Under 15,Children,7.34,8.355,9.362,10.36,10.137,9.93,10.458,10.97,13.17,14.411,15.296


In [34]:
#Filter country by Albanian
Albania=df[df["country_name"]== "Albania"]
Albania

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
7,Albania,Female,25+,Adults,13.6,14.499,12.142,10.875,10.312,9.662,10.743,10.736,10.073,9.66,9.488
8,Albania,Female,Under 15,Children,15.476,17.115,14.477,12.511,11.876,11.324,12.536,12.337,11.475,10.938,10.68
10,Albania,Male,25+,Adults,16.316,13.874,13.311,11.972,10.201,9.43,10.826,10.717,9.882,9.476,9.324
11,Albania,Male,Under 15,Children,19.853,17.245,16.148,14.452,12.625,11.584,13.065,12.791,11.751,11.198,10.951


In [35]:
df

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
4,Afghanistan,Male,25+,Adults,6.463,6.879,7.301,7.728,7.833,7.961,8.732,9.199,11.357,12.327,13.087
5,Afghanistan,Male,Under 15,Children,7.340,8.355,9.362,10.360,10.137,9.930,10.458,10.970,13.170,14.411,15.296
7,Albania,Female,25+,Adults,13.600,14.499,12.142,10.875,10.312,9.662,10.743,10.736,10.073,9.660,9.488
8,Albania,Female,Under 15,Children,15.476,17.115,14.477,12.511,11.876,11.324,12.536,12.337,11.475,10.938,10.680
10,Albania,Male,25+,Adults,16.316,13.874,13.311,11.972,10.201,9.430,10.826,10.717,9.882,9.476,9.324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,Zimbabwe,Female,25+,Adults,4.262,4.586,4.773,4.900,5.027,5.309,6.345,7.792,7.704,7.597,7.528
1130,Zimbabwe,Female,Under 15,Children,5.174,5.666,6.014,6.289,6.555,6.990,8.494,9.853,9.670,9.518,9.414
1131,Zimbabwe,Male,15-24,Youth,6.007,7.131,8.119,9.029,9.927,11.048,13.191,14.261,13.966,13.675,13.485
1132,Zimbabwe,Male,25+,Adults,3.677,4.325,4.843,5.287,5.722,6.362,6.996,7.234,6.858,6.757,6.703


In [72]:
df

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
4,Afghanistan,Male,25+,Adults,6.463,6.879,7.301,7.728,7.833,7.961,8.732,9.199,11.357,12.327,13.087
5,Afghanistan,Male,Under 15,Children,7.340,8.355,9.362,10.360,10.137,9.930,10.458,10.970,13.170,14.411,15.296
7,Albania,Female,25+,Adults,13.600,14.499,12.142,10.875,10.312,9.662,10.743,10.736,10.073,9.660,9.488
8,Albania,Female,Under 15,Children,15.476,17.115,14.477,12.511,11.876,11.324,12.536,12.337,11.475,10.938,10.680
10,Albania,Male,25+,Adults,16.316,13.874,13.311,11.972,10.201,9.430,10.826,10.717,9.882,9.476,9.324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,Zimbabwe,Female,25+,Adults,4.262,4.586,4.773,4.900,5.027,5.309,6.345,7.792,7.704,7.597,7.528
1130,Zimbabwe,Female,Under 15,Children,5.174,5.666,6.014,6.289,6.555,6.990,8.494,9.853,9.670,9.518,9.414
1131,Zimbabwe,Male,15-24,Youth,6.007,7.131,8.119,9.029,9.927,11.048,13.191,14.261,13.966,13.675,13.485
1132,Zimbabwe,Male,25+,Adults,3.677,4.325,4.843,5.287,5.722,6.362,6.996,7.234,6.858,6.757,6.703


In [82]:
Kenya = df[df["country_name"] == "Kenya"]

In [84]:
Kenya

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
522,Kenya,Female,15-24,Youth,7.304,7.341,7.371,9.379,11.284,13.167,15.919,16.147,16.172,16.178,16.112
523,Kenya,Female,25+,Adults,1.778,1.783,1.787,2.331,2.831,3.32,4.307,5.458,5.47,5.377,5.303
524,Kenya,Female,Under 15,Children,2.967,2.956,2.945,3.765,4.519,5.252,6.582,7.518,7.515,7.443,7.367
525,Kenya,Male,15-24,Youth,7.335,7.364,7.378,8.877,10.281,11.672,10.697,7.936,8.075,8.446,8.644
526,Kenya,Male,25+,Adults,1.345,1.348,1.347,1.989,2.585,3.165,3.294,2.935,2.815,2.696,2.609
527,Kenya,Male,Under 15,Children,2.606,2.596,2.584,3.364,4.08,4.773,4.696,3.878,3.795,3.774,3.743


In [94]:
KenyaFemale = Kenya[Kenya["sex"] == "Female"]

In [96]:
KenyaFemale

Unnamed: 0,country_name,sex,age_group,age_categories,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
522,Kenya,Female,15-24,Youth,7.304,7.341,7.371,9.379,11.284,13.167,15.919,16.147,16.172,16.178,16.112
523,Kenya,Female,25+,Adults,1.778,1.783,1.787,2.331,2.831,3.32,4.307,5.458,5.47,5.377,5.303
524,Kenya,Female,Under 15,Children,2.967,2.956,2.945,3.765,4.519,5.252,6.582,7.518,7.515,7.443,7.367


In [98]:
KenyaFemale2014 =df["2014"].mean()

In [100]:
KenyaFemale2014

7.121001116071428

In [122]:
KenyaFemale2015 = df["2015"].mean()
KenyaFemale2015

6.973352678571429

In [124]:
KenyaFemale2016 = df["2016"].mean()
KenyaFemale2016

6.768997767857143

In [126]:
KenyaFemale2017= df["2017"].mean()
KenyaFemale2017

6.499544642857143

In [128]:
KenyaFemale2018 = df["2018"].mean()
KenyaFemale2018

6.203744419642858

In [132]:
KenyaFemale2019 = df["2019"].mean()
KenyaFemale2019

6.094204241071429

In [140]:
KenyaFemale2020 = df["2020"].mean()
KenyaFemale2020

7.273703124999999

In [142]:
KenyaFemale2021 = df["2021"].mean()
KenyaFemale2021 

6.960870535714286

In [144]:
KenyaFemale2022 = df["2022"].mean()
KenyaFemale2022

6.119141741071428

In [146]:
KenyaFemale2023 = df["2023"].mean()
KenyaFemale2023

5.960043526785715

In [148]:
KenyaFemale2024 = df["2024"].mean()
KenyaFemale2024

5.928079241071428