# Unemployment rate

http://unstats.un.org/unsd/gender/data.html

Gender Info 2007 is a global database of gender statistics and indicators on a wide range of policy areas, including: population, families, health, education, work, and political participation. It can be used by governments, international organizations, advocacy groups, researchers and others in need of statistics for planning, analysis, advocacy and awareness-raising. 


# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

path = os.getcwd()                                  # get current working directory

# Options and Settings

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.autolayout'] = True
plt.rcParams['font.size'] = 12                                       
warnings.simplefilter('ignore')

# Import Data

In [3]:
data_path = os.path.join(path, 'datasets', 'Unemployment rate.csv')

df = pd.read_csv(data_path)
df

Unnamed: 0,Country or Area,Subgroup,Year,Source,Unit,Value,Value Footnotes
0,Afghanistan,Female 15+ yr,2005.0,ILO_KILM Database 5ed_2007,Percent,9.5,1.0
1,Afghanistan,Male 15+ yr,2005.0,ILO_KILM Database 5ed_2007,Percent,7.6,1.0
2,Albania,Female 15+ yr,2004.0,ILO_KILM Database 5ed_2007,Percent,17.5,2.0
3,Albania,Female 15+ yr,2003.0,ILO_KILM Database 5ed_2007,Percent,18.3,2.0
4,Albania,Female 15+ yr,2001.0,ILO_KILM Database 5ed_2007,Percent,28.4,3.0
...,...,...,...,...,...,...,...
6195,426,Primary source: Household or labour force surv...,,,,,
6196,427,Primary source: Household or labour force surv...,,,,,
6197,428,Primary source: Household or labour force surv...,,,,,
6198,429,Primary source: Population census; Repository:...,,,,,


# Head and Tail

In [4]:
df = df[:5769]                                   # select index position 0-5769
df

Unnamed: 0,Country or Area,Subgroup,Year,Source,Unit,Value,Value Footnotes
0,Afghanistan,Female 15+ yr,2005.0,ILO_KILM Database 5ed_2007,Percent,9.5,1.0
1,Afghanistan,Male 15+ yr,2005.0,ILO_KILM Database 5ed_2007,Percent,7.6,1.0
2,Albania,Female 15+ yr,2004.0,ILO_KILM Database 5ed_2007,Percent,17.5,2.0
3,Albania,Female 15+ yr,2003.0,ILO_KILM Database 5ed_2007,Percent,18.3,2.0
4,Albania,Female 15+ yr,2001.0,ILO_KILM Database 5ed_2007,Percent,28.4,3.0
...,...,...,...,...,...,...,...
5764,Zimbabwe,Male 15+ yr,1987.0,ILO_KILM Database 5ed_2007,Percent,6.5,76.0
5765,Zimbabwe,Male 15-24 yr,2002.0,ILO_KILM Database 5ed_2007,Percent,28.2,59.0
5766,Zimbabwe,Male 15-24 yr,1999.0,ILO_KILM Database 5ed_2007,Percent,17.0,77.0
5767,Zimbabwe,Male 15-24 yr,1997.0,ILO_KILM Database 5ed_2007,Percent,20.7,34.0


In [5]:
df['Year'] = df['Year'].astype(int)                # convert year to integer
df

Unnamed: 0,Country or Area,Subgroup,Year,Source,Unit,Value,Value Footnotes
0,Afghanistan,Female 15+ yr,2005,ILO_KILM Database 5ed_2007,Percent,9.5,1.0
1,Afghanistan,Male 15+ yr,2005,ILO_KILM Database 5ed_2007,Percent,7.6,1.0
2,Albania,Female 15+ yr,2004,ILO_KILM Database 5ed_2007,Percent,17.5,2.0
3,Albania,Female 15+ yr,2003,ILO_KILM Database 5ed_2007,Percent,18.3,2.0
4,Albania,Female 15+ yr,2001,ILO_KILM Database 5ed_2007,Percent,28.4,3.0
...,...,...,...,...,...,...,...
5764,Zimbabwe,Male 15+ yr,1987,ILO_KILM Database 5ed_2007,Percent,6.5,76.0
5765,Zimbabwe,Male 15-24 yr,2002,ILO_KILM Database 5ed_2007,Percent,28.2,59.0
5766,Zimbabwe,Male 15-24 yr,1999,ILO_KILM Database 5ed_2007,Percent,17.0,77.0
5767,Zimbabwe,Male 15-24 yr,1997,ILO_KILM Database 5ed_2007,Percent,20.7,34.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5769 entries, 0 to 5768
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country or Area  5769 non-null   object 
 1   Subgroup         5769 non-null   object 
 2   Year             5769 non-null   int32  
 3   Source           5769 non-null   object 
 4   Unit             5769 non-null   object 
 5   Value            5769 non-null   float64
 6   Value Footnotes  5769 non-null   float64
dtypes: float64(2), int32(1), object(4)
memory usage: 293.1+ KB


In [7]:
df.rename(columns={'Country or Area': 'country_or_area', 'Value': 'rate'}, inplace=True) 

df.columns = [col.lower() for col in df.columns]
df.head()  

Unnamed: 0,country_or_area,subgroup,year,source,unit,rate,value footnotes
0,Afghanistan,Female 15+ yr,2005,ILO_KILM Database 5ed_2007,Percent,9.5,1.0
1,Afghanistan,Male 15+ yr,2005,ILO_KILM Database 5ed_2007,Percent,7.6,1.0
2,Albania,Female 15+ yr,2004,ILO_KILM Database 5ed_2007,Percent,17.5,2.0
3,Albania,Female 15+ yr,2003,ILO_KILM Database 5ed_2007,Percent,18.3,2.0
4,Albania,Female 15+ yr,2001,ILO_KILM Database 5ed_2007,Percent,28.4,3.0


In [8]:
df_sub = df[['country_or_area', 'subgroup', 'year', 'unit', 'rate']]
df_sub

Unnamed: 0,country_or_area,subgroup,year,unit,rate
0,Afghanistan,Female 15+ yr,2005,Percent,9.5
1,Afghanistan,Male 15+ yr,2005,Percent,7.6
2,Albania,Female 15+ yr,2004,Percent,17.5
3,Albania,Female 15+ yr,2003,Percent,18.3
4,Albania,Female 15+ yr,2001,Percent,28.4
...,...,...,...,...,...
5764,Zimbabwe,Male 15+ yr,1987,Percent,6.5
5765,Zimbabwe,Male 15-24 yr,2002,Percent,28.2
5766,Zimbabwe,Male 15-24 yr,1999,Percent,17.0
5767,Zimbabwe,Male 15-24 yr,1997,Percent,20.7


# Descriptive Statistics

In [9]:
df_sub['rate'].describe()

count    5769.000000
mean       13.631340
std        10.154543
min         0.000000
25%         6.300000
50%        10.800000
75%        18.300000
max        74.200000
Name: rate, dtype: float64

# Discretization and quantiling

In [10]:
df_sub['rate_cat'] = pd.cut(df_sub['rate'], [0, 5, 10, 20, np.inf], labels=['very low', 'low', 'high', 'very high'])
df_sub

Unnamed: 0,country_or_area,subgroup,year,unit,rate,rate_cat
0,Afghanistan,Female 15+ yr,2005,Percent,9.5,low
1,Afghanistan,Male 15+ yr,2005,Percent,7.6,low
2,Albania,Female 15+ yr,2004,Percent,17.5,high
3,Albania,Female 15+ yr,2003,Percent,18.3,high
4,Albania,Female 15+ yr,2001,Percent,28.4,very high
...,...,...,...,...,...,...
5764,Zimbabwe,Male 15+ yr,1987,Percent,6.5,low
5765,Zimbabwe,Male 15-24 yr,2002,Percent,28.2,very high
5766,Zimbabwe,Male 15-24 yr,1999,Percent,17.0,high
5767,Zimbabwe,Male 15-24 yr,1997,Percent,20.7,very high


In [11]:
df_sub['rate_cat'].value_counts()

high         1878
low          1665
very high    1213
very low     1012
Name: rate_cat, dtype: int64

# Dataframe Grouping

In [12]:
cnt_area_grp = df_sub.groupby('country_or_area')
cnt_area_grp.head()

Unnamed: 0,country_or_area,subgroup,year,unit,rate,rate_cat
0,Afghanistan,Female 15+ yr,2005,Percent,9.5,low
1,Afghanistan,Male 15+ yr,2005,Percent,7.6,low
2,Albania,Female 15+ yr,2004,Percent,17.5,high
3,Albania,Female 15+ yr,2003,Percent,18.3,high
4,Albania,Female 15+ yr,2001,Percent,28.4,very high
...,...,...,...,...,...,...
5751,Zimbabwe,Female 15+ yr,2002,Percent,6.1,low
5752,Zimbabwe,Female 15+ yr,1999,Percent,4.6,very low
5753,Zimbabwe,Female 15+ yr,1997,Percent,5.1,low
5754,Zimbabwe,Female 15+ yr,1994,Percent,3.0,very low


In [13]:
mean_unemp =  round(cnt_area_grp['rate'].agg([np.mean]).rename(columns={'mean': 'avg_unemp_rate'}), 2)
mean_unemp         

Unnamed: 0_level_0,avg_unemp_rate
country_or_area,Unnamed: 1_level_1
Afghanistan,8.55
Albania,22.16
Algeria,26.94
Anguilla,10.17
Argentina,16.72
...,...
Venezuela,15.57
Vietnam,3.49
Yemen,10.35
Zambia,16.27


In [14]:
mean_unemp['avg_unemp_rate'].sort_values(ascending=False).head(20)      # top 10 country/area with the highest unemployment rate

country_or_area
Macedonia                        47.33
Djibouti                         44.30
Lesotho                          38.12
Armenia                          36.20
Reunion                          35.11
Seychelles                       34.40
Swaziland                        33.35
South Africa                     32.87
Namibia                          30.56
St Vincent and the Grenadines    29.08
Guadeloupe                       27.66
Saint Lucia                      27.64
Algeria                          26.94
New Caledonia                    26.70
Grenada                          26.50
French Guiana                    26.33
Spain                            26.06
Martinique                       25.31
Botswana                         24.94
St. Helena                       24.90
Name: avg_unemp_rate, dtype: float64

In [15]:
mean_unemp['avg_unemp_rate'].sort_values(ascending=True).head(20)      # top 10 country/area with the lowest unemployment rate

country_or_area
Chad                    0.70
Rwanda                  0.81
Malawi                  0.83
Burkina Faso            1.83
Maldives                2.45
Guatemala               2.94
Isle of Man             3.02
Lao PDR                 3.03
Guinea                  3.15
Uganda                  3.20
Thailand                3.41
Vietnam                 3.49
United Arab Emirates    3.50
Bangladesh              4.28
Tanzania                4.32
Malaysia                4.62
Mexico                  4.63
Austria                 4.65
Nepal                   4.67
Switzerland             4.69
Name: avg_unemp_rate, dtype: float64