# Wages and salaries

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Options and Settings

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.autolayout'] = True
plt.rcParams['font.size'] = 12
path = os.getcwd()                                  # get current working directory
warnings.simplefilter('ignore')

# Import Data

In [3]:
# Location of the CSV, relative to the working directory stored in `path`.
csv_parts = (path, 'datasets', 'Wages and salaries.csv')
filepath = os.path.join(*csv_parts)

# Read the whole file with pandas' default parsing options.
df = pd.read_csv(filepath)

# Head and Tail

In [4]:
# Preview the raw frame; for a long frame pandas renders the first and last rows.
df

Unnamed: 0,Country or Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011.0,Wages and salaries paid to employees,335228000.0,
1,Afghanistan,Afghani,154 Other food products,2010.0,Wages and salaries paid to employees,322938000.0,
2,Afghanistan,Afghani,154 Other food products,2009.0,Wages and salaries paid to employees,343440000.0,
3,Afghanistan,Afghani,154 Other food products,2008.0,Wages and salaries paid to employees,322376000.0,
4,Afghanistan,Afghani,154 Other food products,2007.0,Wages and salaries paid to employees,305280000.0,
...,...,...,...,...,...,...,...
21801,128,"273 includes 2710, 2720",,,,,
21802,129,"359 includes 351, 3520, 3530",,,,,
21803,130,"2021 includes 2022, 2023",,,,,
21804,131,"2211 includes 2213, 2219, 2221, 2222",,,,,


In [5]:
# The tail of the raw frame appears to be a footnote table (rows such as
# "273 includes 2710, 2720" with no Year/Value), so only the leading data
# rows are kept.
# N_DATA_ROWS is the number of data rows: iloc[:N_DATA_ROWS] selects
# positions 0 through N_DATA_ROWS - 1 (i.e. 0-21672, not 0-21673).
# .copy() makes df1 an independent frame, so later column assignments act on
# df1 itself rather than on a slice of df.
N_DATA_ROWS = 21673
df1 = df.iloc[:N_DATA_ROWS].copy()
df1

Unnamed: 0,Country or Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011.0,Wages and salaries paid to employees,3.352280e+08,
1,Afghanistan,Afghani,154 Other food products,2010.0,Wages and salaries paid to employees,3.229380e+08,
2,Afghanistan,Afghani,154 Other food products,2009.0,Wages and salaries paid to employees,3.434400e+08,
3,Afghanistan,Afghani,154 Other food products,2008.0,Wages and salaries paid to employees,3.223760e+08,
4,Afghanistan,Afghani,154 Other food products,2007.0,Wages and salaries paid to employees,3.052800e+08,
...,...,...,...,...,...,...,...
21668,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2009.0,Wages and salaries paid to employees,4.332000e+06,
21669,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2008.0,Wages and salaries paid to employees,1.444000e+06,
21670,Yemen,Rials,D Total manufacturing,2009.0,Wages and salaries paid to employees,4.845004e+10,
21671,Yemen,Rials,D Total manufacturing,2008.0,Wages and salaries paid to employees,3.902640e+10,


In [6]:
# Convert Year from float to 64-bit integer.
# Series.astype is the pandas-native cast and raises if a value is NaN/inf
# instead of producing an arbitrary integer. assign() returns a new frame that
# is rebound to df1, so nothing is written into a slice of df and the cell
# gives the same result when re-run.
df1 = df1.assign(Year=df1['Year'].astype('int64'))
df1

Unnamed: 0,Country or Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011,Wages and salaries paid to employees,3.352280e+08,
1,Afghanistan,Afghani,154 Other food products,2010,Wages and salaries paid to employees,3.229380e+08,
2,Afghanistan,Afghani,154 Other food products,2009,Wages and salaries paid to employees,3.434400e+08,
3,Afghanistan,Afghani,154 Other food products,2008,Wages and salaries paid to employees,3.223760e+08,
4,Afghanistan,Afghani,154 Other food products,2007,Wages and salaries paid to employees,3.052800e+08,
...,...,...,...,...,...,...,...
21668,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2009,Wages and salaries paid to employees,4.332000e+06,
21669,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2008,Wages and salaries paid to employees,1.444000e+06,
21670,Yemen,Rials,D Total manufacturing,2009,Wages and salaries paid to employees,4.845004e+10,
21671,Yemen,Rials,D Total manufacturing,2008,Wages and salaries paid to employees,3.902640e+10,


In [7]:
# (rows, columns) of the trimmed frame.
df1.shape

(21673, 7)

In [8]:
# Per-column dtype and non-null count, plus memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21673 entries, 0 to 21672
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country or Area  21673 non-null  object 
 1   Currency         21673 non-null  object 
 2   ISIC Rev 3       21673 non-null  object 
 3   Year             21673 non-null  int64  
 4   Measure          21673 non-null  object 
 5   Value            21673 non-null  float64
 6   Value Footnotes  962 non-null    float64
dtypes: float64(2), int64(1), object(4)
memory usage: 1.2+ MB


In [9]:
# Give the country column an identifier-friendly name.
# rename() returns a new frame that is rebound to df1; this avoids
# inplace=True on a frame that was derived from df by slicing.
df1 = df1.rename(columns={'Country or Area': 'Country_or_Area'})
df1.head()

Unnamed: 0,Country_or_Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011,Wages and salaries paid to employees,335228000.0,
1,Afghanistan,Afghani,154 Other food products,2010,Wages and salaries paid to employees,322938000.0,
2,Afghanistan,Afghani,154 Other food products,2009,Wages and salaries paid to employees,343440000.0,
3,Afghanistan,Afghani,154 Other food products,2008,Wages and salaries paid to employees,322376000.0,
4,Afghanistan,Afghani,154 Other food products,2007,Wages and salaries paid to employees,305280000.0,


In [10]:
# Keep only the columns used in the analysis that follows; the remaining
# columns stay available in df1.
analysis_columns = ['Country_or_Area', 'Currency', 'Year', 'Measure', 'Value']
df2 = df1.loc[:, analysis_columns]
df2

Unnamed: 0,Country_or_Area,Currency,Year,Measure,Value
0,Afghanistan,Afghani,2011,Wages and salaries paid to employees,3.352280e+08
1,Afghanistan,Afghani,2010,Wages and salaries paid to employees,3.229380e+08
2,Afghanistan,Afghani,2009,Wages and salaries paid to employees,3.434400e+08
3,Afghanistan,Afghani,2008,Wages and salaries paid to employees,3.223760e+08
4,Afghanistan,Afghani,2007,Wages and salaries paid to employees,3.052800e+08
...,...,...,...,...,...
21668,Yemen,Rials,2009,Wages and salaries paid to employees,4.332000e+06
21669,Yemen,Rials,2008,Wages and salaries paid to employees,1.444000e+06
21670,Yemen,Rials,2009,Wages and salaries paid to employees,4.845004e+10
21671,Yemen,Rials,2008,Wages and salaries paid to employees,3.902640e+10


In [11]:
# Summary (count / unique / top / freq) of the object-typed columns.
df2.describe(include='object')

Unnamed: 0,Country_or_Area,Currency,Measure
count,21673,21673,21673
unique,92,49,1
top,Eritrea,Euros,Wages and salaries paid to employees
freq,759,3915,21673


In [12]:
# Number of rows per country or area, most frequent first.
df2['Country_or_Area'].value_counts()

Eritrea                             759
Ukraine                             748
The f. Yugosl. Rep of Macedonia     715
Russian Federation                  707
Jordan                              608
                                   ... 
Tunisia                              17
Curaçao                               4
Netherlands Antilles                  2
Armenia                               2
Cook Islands                          1
Name: Country_or_Area, Length: 92, dtype: int64

In [13]:
# Row counts per currency label (most frequent first), limited to the first ten.
currency_counts = df2['Currency'].value_counts()
currency_counts.iloc[:10]

Euros         3915
Rials         1167
US Dollars    1014
Pesos          996
Dinars         884
Dollars        864
Roubles        782
Nakfa          759
Hryvnias       748
Denars         715
Name: Currency, dtype: int64

In [14]:
# Number of rows recorded for each year.
df2['Year'].value_counts()

2007    8121
2008    6836
2009    3167
2010    2481
2011    1068
Name: Year, dtype: int64

In [15]:
# Descriptive statistics of Value across all rows.
# NOTE(review): rows carry different currencies, so these pooled statistics
# mix units — confirm they are only meant as a rough sanity check.
df2['Value'].describe()

count    2.167300e+04
mean     1.448376e+11
std      2.430735e+12
min      0.000000e+00
25%      8.000000e+06
50%      1.810000e+08
75%      2.061000e+09
max      1.647257e+14
Name: Value, dtype: float64

# Missing Values Exploration

In [16]:
# Count of missing entries in each column.
df2.isna().sum()

Country_or_Area    0
Currency           0
Year               0
Measure            0
Value              0
dtype: int64

# Grouping and Aggregation

In [17]:
# Group the rows by their currency label; reused by the per-currency sections.
currency_grouped = df2.groupby(by='Currency')

# First ten rows of every currency group, in original row order.
currency_grouped.head(n=10)

Unnamed: 0,Country_or_Area,Currency,Year,Measure,Value
0,Afghanistan,Afghani,2011,Wages and salaries paid to employees,3.352280e+08
1,Afghanistan,Afghani,2010,Wages and salaries paid to employees,3.229380e+08
2,Afghanistan,Afghani,2009,Wages and salaries paid to employees,3.434400e+08
3,Afghanistan,Afghani,2008,Wages and salaries paid to employees,3.223760e+08
4,Afghanistan,Afghani,2007,Wages and salaries paid to employees,3.052800e+08
...,...,...,...,...,...
20902,Viet Nam,Dongs,2009,Wages and salaries paid to employees,4.926080e+11
20903,Viet Nam,Dongs,2008,Wages and salaries paid to employees,4.443230e+11
20904,Viet Nam,Dongs,2007,Wages and salaries paid to employees,3.346300e+11
20905,Viet Nam,Dongs,2010,Wages and salaries paid to employees,6.586146e+12


# Country or Area with Euros as Currency

In [18]:
# Rows whose currency label is 'Euros'.
euro_df = currency_grouped.get_group('Euros')

# Regroup those rows per country or area.
euro_country = euro_df.groupby(by='Country_or_Area')

# First five rows of each country group.
euro_country.head(n=5)

Unnamed: 0,Country_or_Area,Currency,Year,Measure,Value
536,Austria,Euros,2008,Wages and salaries paid to employees,5.480000e+08
537,Austria,Euros,2008,Wages and salaries paid to employees,3.920000e+08
538,Austria,Euros,2007,Wages and salaries paid to employees,3.760000e+08
539,Austria,Euros,2008,Wages and salaries paid to employees,4.000000e+06
540,Austria,Euros,2008,Wages and salaries paid to employees,1.360000e+08
...,...,...,...,...,...
17327,Spain,Euros,2008,Wages and salaries paid to employees,3.171000e+09
17328,Spain,Euros,2007,Wages and salaries paid to employees,3.018000e+09
17329,Spain,Euros,2008,Wages and salaries paid to employees,1.832000e+09
17330,Spain,Euros,2007,Wages and salaries paid to employees,1.717000e+09


In [19]:
# Per-country summary of 'Value' for the euro-denominated rows.
# Aggregations are named by string ('min', 'mean', ...) rather than passed as
# NumPy callables: handing np.min/np.mean/np.max/np.sum to groupby.agg is
# deprecated in recent pandas releases, while the string names select the
# built-in groupby reductions.
# NOTE(review): each country contributes rows for several years and, judging
# by the raw data preview, both individual industries and a
# 'D Total manufacturing' row, so total_value may double-count — confirm
# before reading it as a national wage bill.
agg_stats_euro_country = euro_country.agg(
    min_value=pd.NamedAgg(column='Value', aggfunc='min'),
    mean_value=pd.NamedAgg(column='Value', aggfunc='mean'),
    max_value=pd.NamedAgg(column='Value', aggfunc='max'),
    total_value=pd.NamedAgg(column='Value', aggfunc='sum'),
)

agg_stats_euro_country

Unnamed: 0_level_0,min_value,mean_value,max_value,total_value
Country_or_Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austria,0.0,431695200.0,23282000000.0,116126000000.0
Belgium,1000000.0,410215100.0,21910000000.0,114450000000.0
Cyprus,0.0,13217410.0,610861000.0,1705046000.0
Estonia,0.0,24682470.0,1039114000.0,2295470000.0
Finland,1000000.0,301888500.0,15640000000.0,78491000000.0
France,16000000.0,1959290000.0,112322000000.0,581909000000.0
Germany,28000000.0,4827523000.0,271467000000.0,1366189000000.0
Greece,1000000.0,120301400.0,6635000000.0,17564000000.0
Ireland,0.0,185524500.0,8401000000.0,37847000000.0
Italy,0.0,1779650000.0,100960000000.0,523217000000.0


In [20]:
# Top 10 countries/areas by total value, largest first.
euro_totals_desc = agg_stats_euro_country['total_value'].sort_values(ascending=False)
euro_totals_desc.iloc[:10]

Country_or_Area
Germany         1.366189e+12
France          5.819090e+11
Italy           5.232170e+11
Spain           3.283860e+11
Netherlands     1.454430e+11
Austria         1.161260e+11
Belgium         1.144500e+11
Finland         7.849100e+10
Portugal        4.212200e+10
Ireland         3.784700e+10
Name: total_value, dtype: float64

In [21]:
# Bottom 10 countries/areas by total value, smallest first.
euro_totals_asc = agg_stats_euro_country['total_value'].sort_values(ascending=True)
euro_totals_asc.iloc[:10]

Country_or_Area
Cyprus         1.705046e+09
Malta          2.024270e+09
Latvia         2.216297e+09
Estonia        2.295470e+09
Luxembourg     5.201000e+09
Slovenia       9.440000e+09
Slovakia       1.682000e+10
Greece         1.756400e+10
Ireland        3.784700e+10
Portugal       4.212200e+10
Name: total_value, dtype: float64

# Country or Area with US Dollars as Currency

In [22]:
# Rows whose currency label is 'US Dollars'.
usd_df = currency_grouped.get_group('US Dollars')

# Regroup those rows per country or area.
usd_country = usd_df.groupby(by='Country_or_Area')

# First five rows of each country group.
usd_country.head(n=5)

Unnamed: 0,Country_or_Area,Currency,Year,Measure,Value
4048,Ecuador,US Dollars,2008,Wages and salaries paid to employees,158113000.0
4049,Ecuador,US Dollars,2007,Wages and salaries paid to employees,173356000.0
4050,Ecuador,US Dollars,2008,Wages and salaries paid to employees,16795000.0
4051,Ecuador,US Dollars,2007,Wages and salaries paid to employees,16533000.0
4052,Ecuador,US Dollars,2008,Wages and salaries paid to employees,86640000.0
11137,Lebanon,US Dollars,2007,Wages and salaries paid to employees,29460000.0
11138,Lebanon,US Dollars,2007,Wages and salaries paid to employees,10810000.0
11139,Lebanon,US Dollars,2007,Wages and salaries paid to employees,0.0
11140,Lebanon,US Dollars,2007,Wages and salaries paid to employees,17301000.0
11141,Lebanon,US Dollars,2007,Wages and salaries paid to employees,1349000.0


In [23]:
# Per-country summary of 'Value' for the rows labelled 'US Dollars'.
# Aggregations are named by string ('min', 'mean', ...) rather than passed as
# NumPy callables: handing np.min/np.mean/np.max/np.sum to groupby.agg is
# deprecated in recent pandas releases, while the string names select the
# built-in groupby reductions.
# NOTE(review): each country contributes rows for several years and possibly
# both individual industries and an overall manufacturing total, so
# total_value may double-count — confirm before interpreting it.
agg_stats_usd_country = usd_country.agg(
    min_value=pd.NamedAgg(column='Value', aggfunc='min'),
    mean_value=pd.NamedAgg(column='Value', aggfunc='mean'),
    max_value=pd.NamedAgg(column='Value', aggfunc='max'),
    total_value=pd.NamedAgg(column='Value', aggfunc='sum'),
)

agg_stats_usd_country

Unnamed: 0_level_0,min_value,mean_value,max_value,total_value
Country_or_Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ecuador,0.0,16307320.0,932256000.0,4957426000.0
Lebanon,0.0,9696689.0,540662000.0,1464200000.0
State of Palestine,2000.0,4653814.0,206747000.0,1326337000.0
United States of America,100278000.0,11707070000.0,611872900000.0,3207737000000.0


In [24]:
# Total value per country/area for 'US Dollars' rows, largest first.
usd_totals = agg_stats_usd_country['total_value']
usd_totals.sort_values(ascending=False)

Country_or_Area
United States of America     3.207737e+12
Ecuador                      4.957426e+09
Lebanon                      1.464200e+09
State of Palestine           1.326337e+09
Name: total_value, dtype: float64

# Country or Area with Dollars as Currency

In [25]:
# Rows whose currency label is 'Dollars'.
# NOTE(review): the label seems to cover several distinct national dollars
# (Australia, Brunei Darussalam, ... appear below) — confirm that values are
# comparable before ranking across countries.
dollars_df = currency_grouped.get_group('Dollars')

# Regroup those rows per country or area.
dollars_country = dollars_df.groupby(by='Country_or_Area')

# First five rows of each country group.
dollars_country.head(n=5)

Unnamed: 0,Country_or_Area,Currency,Year,Measure,Value
147,Australia,Dollars,2011,Wages and salaries paid to employees,2743000000.0
148,Australia,Dollars,2010,Wages and salaries paid to employees,2612000000.0
149,Australia,Dollars,2009,Wages and salaries paid to employees,2820000000.0
150,Australia,Dollars,2008,Wages and salaries paid to employees,2711000000.0
151,Australia,Dollars,2007,Wages and salaries paid to employees,2492000000.0
1803,Brunei Darussalam,Dollars,2010,Wages and salaries paid to employees,1041800.0
1804,Brunei Darussalam,Dollars,2010,Wages and salaries paid to employees,440300.0
1805,Brunei Darussalam,Dollars,2010,Wages and salaries paid to employees,22300.0
1806,Brunei Darussalam,Dollars,2010,Wages and salaries paid to employees,94800.0
1807,Brunei Darussalam,Dollars,2010,Wages and salaries paid to employees,5789600.0


In [26]:
# Per-country summary of 'Value' for the rows labelled 'Dollars'.
# Aggregations are named by string ('min', 'mean', ...) rather than passed as
# NumPy callables: handing np.min/np.mean/np.max/np.sum to groupby.agg is
# deprecated in recent pandas releases, while the string names select the
# built-in groupby reductions.
# NOTE(review): each country contributes rows for several years and possibly
# both individual industries and an overall manufacturing total, so
# total_value may double-count — confirm before interpreting it.
agg_stats_dollars_country = dollars_country.agg(
    min_value=pd.NamedAgg(column='Value', aggfunc='min'),
    mean_value=pd.NamedAgg(column='Value', aggfunc='mean'),
    max_value=pd.NamedAgg(column='Value', aggfunc='max'),
    total_value=pd.NamedAgg(column='Value', aggfunc='sum'),
)

agg_stats_dollars_country

Unnamed: 0_level_0,min_value,mean_value,max_value,total_value
Country_or_Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,12000000.0,1366589000.0,53158000000.0,531603000000.0
Brunei Darussalam,22300.0,7284240.0,127474200.0,254948400.0
Canada,8540000.0,1622582000.0,83468320000.0,386174400000.0
Cook Islands,4973000.0,4973000.0,4973000.0,4973000.0
New Zealand,89834000.0,1512010000.0,12005330000.0,45360300000.0
Singapore,1447000.0,471254200.0,17736320000.0,80584470000.0


In [27]:
# Top 10 countries/areas by total value, largest first.
dollars_totals_desc = agg_stats_dollars_country['total_value'].sort_values(ascending=False)
dollars_totals_desc.iloc[:10]

Country_or_Area
Australia             5.316030e+11
Canada                3.861744e+11
Singapore             8.058447e+10
New Zealand           4.536030e+10
Brunei Darussalam     2.549484e+08
Cook Islands          4.973000e+06
Name: total_value, dtype: float64