# Wages and salaries

- Data source: http://data.un.org/Data.aspx?d=UNIDO&f=tableCode%3a05
- Notes on the dataset: http://data.un.org/_Docs/INDSTAT2014notes.xls

INDSTAT4-Rev.3 contains time series data on five selected data items for the period covering 2007 - 2011. The data are arranged at the 3- and 4-digit level of ISIC (Revision 3) pertaining to the manufacturing sector, which comprises 151 manufacturing categories.

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Options and Settings

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.autolayout'] = True
plt.rcParams['font.size'] = 12
path = os.getcwd()                                  # get current working directory
warnings.simplefilter('ignore')

# Import Data

In [3]:
# The CSV is expected in a `datasets` folder next to the notebook
# (the path is relative to the current working directory).
filepath = os.path.join('datasets', 'Wages and salaries.csv')
# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=filepath)

# Head and Tail

In [4]:
# Display the freshly loaded frame; the notebook shows a truncated view with
# the first and last rows, which is enough to inspect both ends of the file.
df

Unnamed: 0,Country or Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011.0,Wages and salaries paid to employees,335228000.0,
1,Afghanistan,Afghani,154 Other food products,2010.0,Wages and salaries paid to employees,322938000.0,
2,Afghanistan,Afghani,154 Other food products,2009.0,Wages and salaries paid to employees,343440000.0,
3,Afghanistan,Afghani,154 Other food products,2008.0,Wages and salaries paid to employees,322376000.0,
4,Afghanistan,Afghani,154 Other food products,2007.0,Wages and salaries paid to employees,305280000.0,
...,...,...,...,...,...,...,...
21801,128,"273 includes 2710, 2720",,,,,
21802,129,"359 includes 351, 3520, 3530",,,,,
21803,130,"2021 includes 2022, 2023",,,,,
21804,131,"2211 includes 2213, 2219, 2221, 2222",,,,,


In [5]:
# Keep the first 21673 rows, i.e. positions 0-21672 (the slice end is exclusive).
# The tail of the raw file holds footnote text such as
# "273 includes 2710, 2720" instead of wage observations.
# .copy() makes the result an independent frame, so later column assignments
# do not operate on a slice of the raw data.
df = df.iloc[:21673].copy()
df

Unnamed: 0,Country or Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011.0,Wages and salaries paid to employees,3.352280e+08,
1,Afghanistan,Afghani,154 Other food products,2010.0,Wages and salaries paid to employees,3.229380e+08,
2,Afghanistan,Afghani,154 Other food products,2009.0,Wages and salaries paid to employees,3.434400e+08,
3,Afghanistan,Afghani,154 Other food products,2008.0,Wages and salaries paid to employees,3.223760e+08,
4,Afghanistan,Afghani,154 Other food products,2007.0,Wages and salaries paid to employees,3.052800e+08,
...,...,...,...,...,...,...,...
21668,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2009.0,Wages and salaries paid to employees,4.332000e+06,
21669,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2008.0,Wages and salaries paid to employees,1.444000e+06,
21670,Yemen,Rials,D Total manufacturing,2009.0,Wages and salaries paid to employees,4.845004e+10,
21671,Yemen,Rials,D Total manufacturing,2008.0,Wages and salaries paid to employees,3.902640e+10,


In [6]:
# Store Year as int64 instead of float (2011.0 -> 2011).
# astype() raises on NaN/inf instead of silently producing a meaningless
# integer, and assign() returns a new frame rather than writing a column into
# what may be a slice of the raw data. Column order is unchanged.
df = df.assign(Year=df['Year'].astype('int64'))
df

Unnamed: 0,Country or Area,Currency,ISIC Rev 3,Year,Measure,Value,Value Footnotes
0,Afghanistan,Afghani,154 Other food products,2011,Wages and salaries paid to employees,3.352280e+08,
1,Afghanistan,Afghani,154 Other food products,2010,Wages and salaries paid to employees,3.229380e+08,
2,Afghanistan,Afghani,154 Other food products,2009,Wages and salaries paid to employees,3.434400e+08,
3,Afghanistan,Afghani,154 Other food products,2008,Wages and salaries paid to employees,3.223760e+08,
4,Afghanistan,Afghani,154 Other food products,2007,Wages and salaries paid to employees,3.052800e+08,
...,...,...,...,...,...,...,...
21668,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2009,Wages and salaries paid to employees,4.332000e+06,
21669,Yemen,Rials,3720 Recycling of non-metal waste and scrap,2008,Wages and salaries paid to employees,1.444000e+06,
21670,Yemen,Rials,D Total manufacturing,2009,Wages and salaries paid to employees,4.845004e+10,
21671,Yemen,Rials,D Total manufacturing,2008,Wages and salaries paid to employees,3.902640e+10,


In [7]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21673 entries, 0 to 21672
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country or Area  21673 non-null  object 
 1   Currency         21673 non-null  object 
 2   ISIC Rev 3       21673 non-null  object 
 3   Year             21673 non-null  int64  
 4   Measure          21673 non-null  object 
 5   Value            21673 non-null  float64
 6   Value Footnotes  962 non-null    float64
dtypes: float64(2), int64(1), object(4)
memory usage: 1.2+ MB


In [8]:
# Working subset for the analysis: the sector ('ISIC Rev 3') and the
# footnote column are left out.
# .copy() gives an independent frame, so the renaming steps that follow
# modify df_sub only and cannot be mistaken for edits to a view of df.
df_sub = df[['Country or Area', 'Currency', 'Year', 'Measure', 'Value']].copy()
df_sub

Unnamed: 0,Country or Area,Currency,Year,Measure,Value
0,Afghanistan,Afghani,2011,Wages and salaries paid to employees,3.352280e+08
1,Afghanistan,Afghani,2010,Wages and salaries paid to employees,3.229380e+08
2,Afghanistan,Afghani,2009,Wages and salaries paid to employees,3.434400e+08
3,Afghanistan,Afghani,2008,Wages and salaries paid to employees,3.223760e+08
4,Afghanistan,Afghani,2007,Wages and salaries paid to employees,3.052800e+08
...,...,...,...,...,...
21668,Yemen,Rials,2009,Wages and salaries paid to employees,4.332000e+06
21669,Yemen,Rials,2008,Wages and salaries paid to employees,1.444000e+06
21670,Yemen,Rials,2009,Wages and salaries paid to employees,4.845004e+10
21671,Yemen,Rials,2008,Wages and salaries paid to employees,3.902640e+10


In [9]:
# Give the country column a name without spaces.
# Reassigning the result (instead of inplace=True) keeps the cell safe to
# re-run: rename() ignores keys that are no longer present, and no in-place
# mutation happens on a frame derived from df.
df_sub = df_sub.rename(columns={'Country or Area': 'country_or_area'})
df_sub.head()

Unnamed: 0,country_or_area,Currency,Year,Measure,Value
0,Afghanistan,Afghani,2011,Wages and salaries paid to employees,335228000.0
1,Afghanistan,Afghani,2010,Wages and salaries paid to employees,322938000.0
2,Afghanistan,Afghani,2009,Wages and salaries paid to employees,343440000.0
3,Afghanistan,Afghani,2008,Wages and salaries paid to employees,322376000.0
4,Afghanistan,Afghani,2007,Wages and salaries paid to employees,305280000.0


In [10]:
# Normalise every column label to lower case using the vectorised string
# accessor of the column index.
df_sub.columns = df_sub.columns.str.lower()
df_sub

Unnamed: 0,country_or_area,currency,year,measure,value
0,Afghanistan,Afghani,2011,Wages and salaries paid to employees,3.352280e+08
1,Afghanistan,Afghani,2010,Wages and salaries paid to employees,3.229380e+08
2,Afghanistan,Afghani,2009,Wages and salaries paid to employees,3.434400e+08
3,Afghanistan,Afghani,2008,Wages and salaries paid to employees,3.223760e+08
4,Afghanistan,Afghani,2007,Wages and salaries paid to employees,3.052800e+08
...,...,...,...,...,...
21668,Yemen,Rials,2009,Wages and salaries paid to employees,4.332000e+06
21669,Yemen,Rials,2008,Wages and salaries paid to employees,1.444000e+06
21670,Yemen,Rials,2009,Wages and salaries paid to employees,4.845004e+10
21671,Yemen,Rials,2008,Wages and salaries paid to employees,3.902640e+10


In [11]:
# Count / unique / most frequent value / frequency for the object-dtype columns.
df_sub.describe(include=['object'])

Unnamed: 0,country_or_area,currency,measure
count,21673,21673,21673
unique,92,49,1
top,Eritrea,Euros,Wages and salaries paid to employees
freq,759,3915,21673


In [12]:
# Number of rows per country or area, most frequent first.
df_sub.country_or_area.value_counts()

Eritrea                             759
Ukraine                             748
The f. Yugosl. Rep of Macedonia     715
Russian Federation                  707
Jordan                              608
                                   ... 
Tunisia                              17
Curaçao                               4
Netherlands Antilles                  2
Armenia                               2
Cook Islands                          1
Name: country_or_area, Length: 92, dtype: int64

In [13]:
# Number of rows per currency name, most frequent first.
df_sub.currency.value_counts()

Euros           3915
Rials           1167
US Dollars      1014
Pesos            996
Dinars           884
Dollars          864
Roubles          782
Nakfa            759
Hryvnias         748
Denars           715
Dongs            605
Soms             589
Yuan             579
Dirhams          561
Rupees           503
Lari             501
Kroner           488
Manat            452
Rupiahs          441
Birr             430
Kwacha           376
Pounds           373
Lei              369
Litas            304
Forints          301
Ringgits         298
Zlotys           296
Kronor           270
Liras            248
Togrog           224
New Sheqalim     172
Riyals           156
Yen              142
Shillings        139
Leva             139
Rand             136
Pula             131
Koruny           111
Leks              95
Patacas           80
Francs            75
Reais             61
Maloti            55
Afghani           50
Tenge             24
Kunas             24
New Soles         23
Guilders     

# Descriptive Statistics

In [14]:
# Count, mean, spread and quartiles of the reported amounts.
# NOTE(review): the rows use many different currencies, so these pooled
# figures mix units -- confirm that is acceptable for this overview.
df_sub.loc[:, 'value'].describe()

count    2.167300e+04
mean     1.448376e+11
std      2.430735e+12
min      0.000000e+00
25%      8.000000e+06
50%      1.810000e+08
75%      2.061000e+09
max      1.647257e+14
Name: value, dtype: float64

# Dataframe Grouping

In [15]:
# Bucket the rows by country and currency; the GroupBy object is kept in a
# variable so it can be reused for the aggregation.
cnt_area_grp = df_sub.groupby(by=['country_or_area', 'currency'])
# Preview the first five rows of every group.
cnt_area_grp.head(5)

Unnamed: 0,country_or_area,currency,year,measure,value
0,Afghanistan,Afghani,2011,Wages and salaries paid to employees,3.352280e+08
1,Afghanistan,Afghani,2010,Wages and salaries paid to employees,3.229380e+08
2,Afghanistan,Afghani,2009,Wages and salaries paid to employees,3.434400e+08
3,Afghanistan,Afghani,2008,Wages and salaries paid to employees,3.223760e+08
4,Afghanistan,Afghani,2007,Wages and salaries paid to employees,3.052800e+08
...,...,...,...,...,...
21502,Yemen,Rials,2007,Wages and salaries paid to employees,1.138600e+10
21503,Yemen,Rials,2009,Wages and salaries paid to employees,3.264150e+08
21504,Yemen,Rials,2008,Wages and salaries paid to employees,3.224170e+08
21505,Yemen,Rials,2009,Wages and salaries paid to employees,1.635780e+08


In [16]:
# Summary statistics of `value` for every (country_or_area, currency) group.
# Named aggregation with string aliases sets the output column names
# explicitly. Passing NumPy callables and renaming afterwards depends on the
# auto-generated labels ('amin'/'amax' vs 'min'/'max'), which vary with the
# installed NumPy/pandas versions and make the rename silently do nothing.
# 'var' and 'std' are the pandas group-by statistics (sample versions, ddof=1).
# NOTE(review): all rows of a group are pooled, i.e. every year and every
# ISIC row including 'D Total manufacturing', so agg_sum probably counts the
# same wages more than once -- confirm this is intended.
cnt_area_grp_stats = cnt_area_grp['value'].agg(
    agg_min='min',
    agg_max='max',
    agg_mean='mean',
    agg_sum='sum',
    var='var',
    std='std',
)

cnt_area_grp_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,agg_min,agg_max,agg_mean,agg_sum,var,std
country_or_area,currency,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,Afghani,9.216000e+07,2.448030e+09,4.491617e+08,2.245809e+10,3.880294e+17,6.229200e+08
Albania,Leks,2.596200e+07,1.958858e+10,1.852373e+09,1.759754e+11,1.473922e+19,3.839170e+09
Armenia,Drams,7.183400e+10,8.349100e+10,7.766250e+10,1.553250e+11,6.794282e+19,8.242744e+09
Australia,Dollars,1.200000e+07,5.315800e+10,1.366589e+09,5.316030e+11,3.423873e+19,5.851387e+09
Austria,Euros,0.000000e+00,2.328200e+10,4.316952e+08,1.161260e+11,3.875231e+18,1.968561e+09
...,...,...,...,...,...,...,...
United Republic of Tanzania,Shillings,4.169533e+06,1.919870e+11,1.070052e+10,3.745183e+11,1.061712e+21,3.258393e+10
United States of America,US Dollars,1.002780e+08,6.118729e+11,1.170707e+10,3.207737e+12,2.714315e+21,5.209909e+10
Uruguay,Pesos,0.000000e+00,1.998436e+10,6.114221e+08,5.319372e+10,4.659671e+18,2.158627e+09
Viet Nam,Dongs,0.000000e+00,1.647257e+14,1.980671e+12,1.198306e+15,1.050501e+26,1.024939e+13


In [17]:
# First five rows per country, restricted to the columns of interest
# (the constant `measure` column is left out).
(
    df_sub[['country_or_area', 'currency', 'year', 'value']]
    .groupby('country_or_area')
    .head()
)

Unnamed: 0,country_or_area,currency,year,value
0,Afghanistan,Afghani,2011,3.352280e+08
1,Afghanistan,Afghani,2010,3.229380e+08
2,Afghanistan,Afghani,2009,3.434400e+08
3,Afghanistan,Afghani,2008,3.223760e+08
4,Afghanistan,Afghani,2007,3.052800e+08
...,...,...,...,...
21502,Yemen,Rials,2007,1.138600e+10
21503,Yemen,Rials,2009,3.264150e+08
21504,Yemen,Rials,2008,3.224170e+08
21505,Yemen,Rials,2009,1.635780e+08
