In [40]:
import pandas as pd
import numpy as np
import re #Regex lib
from forex_python.converter import CurrencyRates #currencies lib
from datetime import datetime
import matplotlib

### Read in the dataframes and create a dict {(year, dataframe)}


In [41]:
# Create a list of filenames for the datasets

data_frames_dict = {}

data_frames_dict[2017] = pd.read_csv('datasets/2017.csv', encoding = "ISO-8859-1").filter(items=['Country', 'Currency',  'Salary', 'FormalEducation']).dropna(thresh=4) 
data_frames_dict[2018] = pd.read_csv('datasets/2018.csv', encoding = "ISO-8859-1").filter(items=['Country',  'ConvertedSalary', 'FormalEducation']).dropna(thresh=3) 
data_frames_dict[2019] = pd.read_csv('datasets/2019.csv', encoding = "ISO-8859-1").filter (items=['Country',  'CompTotal', 'EdLevel']).dropna(thresh=3) 
data_frames_dict[2020] = pd.read_csv('datasets/2020.csv', encoding = "ISO-8859-1").filter (items=['Country',  'CompTotal', 'EdLevel']).dropna(thresh=3) 
data_frames_dict[2021] = pd.read_csv('datasets/2021.csv', encoding = "ISO-8859-1").filter (items=['Country','CompTotal', 'EdLevel']).dropna(thresh=3) 


#preprocess 2021
def check_countryType(country, US_state, UK_country):
    
    if  isinstance(US_state, str):
        return 'United States'
    elif isinstance(UK_country, str) or country == 'United Kingdom of Great Britain and Northern Ireland':
        return 'United Kingdom'
    else:
        return country


df_2021 = pd.read_csv('datasets/2021.csv', encoding = "ISO-8859-1").filter (items=['Country', 'US_State', 'UK_Country',  'CompTotal', 'EdLevel'])
df_2021 = df_2021.dropna(subset = ['CompTotal', 'EdLevel'])
df_2021.Country =  df_2021.apply(lambda row : check_countryType(row['Country'],row['US_State'],  
                     row['UK_Country']), axis=1)
df_2021 = df_2021.drop(['US_State', 'UK_Country'], axis = 1).dropna(thresh=3)
data_frames_dict[2021]  = df_2021
                                 


  data_frames_dict[2018] = pd.read_csv('datasets/2018.csv', encoding = "ISO-8859-1").filter(items=['Country',  'ConvertedSalary', 'FormalEducation']).dropna(thresh=3)


In [42]:
ed_dict =  {'Secondary school': 'Secondary school',
 "Some college/university study without earning a bachelor's degree" : "college/university without bachelor's degree",
 "Bachelor's degree": "Bachelor's degree",
 'Doctoral degree': 'Doctoral degree',
 "Master's degree": "Master's degree",
 'Professional degree': "Professional degree",
 'Primary/elementary school': 'Primary/elementary school',
 'I prefer not to answer': 'I prefer not to answer',
 'I never completed any formal education': 'No formal education', 
 'Bachelorâ\x80\x99s degree (BA, BS, B.Eng., etc.)': "Bachelor's degree",
 'Associate degree': 'Associate degree',
 'Some college/university study without earning a degree': "college/university without bachelor's degree",
 'Masterâ\x80\x99s degree (MA, MS, M.Eng., MBA, etc.)': "Master's degree",
 'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 'Secondary school',
 'Professional degree (JD, MD, etc.)': "Professional degree",
 'I never completed any formal education': 'No formal education', 
 'Other doctoral degree (Ph.D, Ed.D., etc.)':  'Doctoral degree',
 'Associate degree (A.A., A.S., etc.)': 'Associate degree',
 'Something else': 'Something else', 
 'Masterâ\x80\x99s degree (M.A., M.S., M.Eng., MBA, etc.)': "Master's degree",
 'Bachelorâ\x80\x99s degree (B.A., B.S., B.Eng., etc.)' : "Bachelor's degree",
 'Other doctoral degree (Ph.D., Ed.D., etc.)' : 'Doctoral degree'
}



for year, df in data_frames_dict.items():
    print(year)
    if 'EdLevel'not in df:
        print("here")
        df.columns = df.columns.str.replace("FormalEducation","EdLevel")
    print(df.columns)
    df["EdLevel"] = df.EdLevel.apply(lambda x: ed_dict[x])
        


2017
here
Index(['Country', 'Currency', 'Salary', 'EdLevel'], dtype='object')
2018
here
Index(['Country', 'ConvertedSalary', 'EdLevel'], dtype='object')
2019
Index(['Country', 'CompTotal', 'EdLevel'], dtype='object')
2020
Index(['Country', 'CompTotal', 'EdLevel'], dtype='object')
2021
Index(['Country', 'CompTotal', 'EdLevel'], dtype='object')


# Section 1: Salary preprocessing

### The following currencies were found in the datasets and could not be converted to their equivalent currency symbols

In [43]:
currency_dict = {'Australian dollars (A$)': 'AUD',
 'Bitcoin (btc)': 'btc',
 'Brazilian reais (R$)': 'BRL',
 'British pounds sterling (Â£)': 'GBP',
 'Canadian dollars (C$)': 'CAD',
 'Chinese yuan renminbi (Â¥)': 'CNY',
 'Euros (â\x82¬)': 'EUR',
 'Indian rupees (?)': 'INR',
 'Japanese yen (Â¥)': 'JPY',
 'Polish zloty (zl)': 'PLN',
 'Russian rubles (?)': 'RUB',
 'Singapore dollars (S$)': 'SGD',
 'South African rands (R)': 'ZAR',
 'Swiss francs': 'CHF',
 'U.S. dollars ($)': 'USD',
 'none\tCook Islands dollar': 'NZD'}

### Compute currency symbols and add them to the 'currency_dict'

In [44]:
#helper function to covert currency to its symbol 
def add_currencies_to_dict(currency, currency_dict):
        currency_symbol = re.findall(r'[ˆA-Z]{3}', currency )

        if currency_symbol:
            # redundnat to add symbol to a dict {symbol -> symbol}but helps to treat all currencies the same
            currency_dict[currency] = currency_symbol[0]
        else:
            if currency not in currency_dict:
                print("WARN: there is no conversion available for this currency {}".format(currency))


# go through the  'CurrencySymbol' column if exists, otherwise 'Currency' 
# add all currencies that are not NaN to the 'currency_dict'
for year , df in data_frames_dict.items():
    
    # Check if Currecny column is present in df
    if 'CurrencySymbol' in df:
        print("{} year dataframe has a 'CurrencySymbol' column ".format(year))
        for currency_symbol in df.CurrencySymbol:
            #check for NaN
            if not pd.isna(currency_symbol):
                # redundnat to add symbol to a dict {symbol -> symbol}but helps to treat all currencies the same
                currency_dict[currency_symbol] = currency_symbol
    elif 'Currency' in df:
        print("{} year dataframe has a 'Currency' column ".format(year))
        for currency in df.Currency:
            #check for NaN
            if not pd.isna(currency):
                # add to dict {currency -> currency_symbol}
                add_currencies_to_dict(currency, currency_dict)   
                

2017 year dataframe has a 'Currency' column 


### Preporcess the salary related columns (currency, salary, salary type etc ) for datasets

#### Auxilliary/helper methods for currency conversions

In [45]:

def convert_currency(amount, currency, factor):
    return available_currencies_for_conversion[currency] * amount * factor


currency_converter = CurrencyRates()
date = datetime(2017, 12, 31, 18, 36, 28, 151012)

available_currencies_for_conversion = currency_converter.get_rates('EUR', date)

available_currencies_for_conversion['EUR']  = 1

# Take care of outlier salaries 
# max montly income in euros
max_montly_salary = 20000

# minimum mothly income in euros
min_montly_salary = 0

In [46]:
#Dictionary to hold processed payments/salary dataframes for yeas 2017 - 2021

clean_payment_dataframes_2017_to_2020 ={}

### Process 2017 payments

In [47]:
df_2017 = data_frames_dict[2017]

# annual salary divide by 12
annual_rate = float (1/12)


# for each currency entry, replace it with its currency symbol
df_2017.Currency = df_2017.Currency.apply(lambda x: currency_dict[x])


# Filter out the currencies that we cannot convert
df_2017 = df_2017[df_2017.Currency.apply(lambda c:   c  in available_currencies_for_conversion)]
    
# Covert salary to a euros and divide it by 12  
df_2017['Monthly_Sal_EUR'] = df_2017.apply(lambda row : convert_currency(row['Salary'],
                     row['Currency'], annual_rate), axis = 1)

# drop Currency, Salary columns 
# drop c salaries less than min_salary and greater than max salary 
df_2017 = df_2017.drop(['Currency','Salary' ], axis=1)
df_2017 = df_2017[(df_2017.Monthly_Sal_EUR > min_montly_salary) & (df_2017.Monthly_Sal_EUR < max_montly_salary)]



#drop any NA 
df_2017= df_2017.dropna(thresh=3)

clean_payment_dataframes_2017_to_2020[2017] = df_2017
df_2017

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2017['Monthly_Sal_EUR'] = df_2017.apply(lambda row : convert_currency(row['Salary'],


Unnamed: 0,Country,EdLevel,Monthly_Sal_EUR
2,United Kingdom,Bachelor's degree,8410.201042
14,United Kingdom,Professional degree,7393.583333
17,United States,Bachelor's degree,12992.416667
18,United States,Bachelor's degree,8245.187500
22,Israel,Bachelor's degree,10070.522100
...,...,...,...
51371,Netherlands,Bachelor's degree,6182.795699
51378,Cyprus,Secondary school,8960.573477
51382,France,Master's degree,2688.172043
51387,United States,Bachelor's degree,5796.616667


### Process 2018 payments

In [48]:
df_2018 = data_frames_dict[2018]

print(df_2018.head())

# convert the converted annual USD salary to monthly salary 
df_2018['Monthly_Sal_EUR'] = df_2018.apply(
    lambda row : convert_currency(row['ConvertedSalary'],'USD', annual_rate), axis = 1)

# We are no longer interested in the ConvertedSalary
# Also drop outlier salaries 
df_2018 = df_2018.drop('ConvertedSalary', axis=1)[(df_2018.Monthly_Sal_EUR > min_montly_salary) & (df_2018.Monthly_Sal_EUR < max_montly_salary)]



# drop any NA at this point 
df_2018= df_2018.dropna(thresh=3)

clean_payment_dataframes_2017_to_2020[2018] = df_2018
df_2018

          Country  ConvertedSalary  \
1  United Kingdom          70841.0   
4    South Africa          21426.0   
5  United Kingdom          41671.0   
6   United States         120000.0   
8   United States         250000.0   

                                        EdLevel  
1                             Bachelor's degree  
4  college/university without bachelor's degree  
5                             Bachelor's degree  
6  college/university without bachelor's degree  
8  college/university without bachelor's degree  


Unnamed: 0,Country,EdLevel,Monthly_Sal_EUR
1,United Kingdom,Bachelor's degree,7079.967608
4,South Africa,college/university without bachelor's degree,2141.350150
5,United Kingdom,Bachelor's degree,4164.669192
6,United States,college/university without bachelor's degree,11993.000000
17,Sweden,Master's degree,4787.605600
...,...,...,...
89911,United States,Secondary school,999.416667
89913,Mexico,Professional degree,2577.295700
89932,United Kingdom,Bachelor's degree,9717.628075
89938,India,Bachelor's degree,194.286600


### Covert 2019 - 2021 - these data frames have the same salary related columns 

In [49]:
df_2019_2021 = [data_frames_dict[2019], data_frames_dict[2020], data_frames_dict[2021]]
years = [2019, 2020, 2021]

annual_rate = 1/12

for year in years:
    df = data_frames_dict[year]

    # convert the converted annual USD salary to monthly salary 
    df['Monthly_Sal_EUR'] = df.apply(
        lambda row : convert_currency(row['CompTotal'],'USD', annual_rate), axis = 1)
    
    # drop CompTotal column
    # Also drop outlier salaries 
    df = df.drop('CompTotal', axis=1)[(df.Monthly_Sal_EUR > min_montly_salary) & (df.Monthly_Sal_EUR < max_montly_salary)]
    df= df.dropna(thresh=3)
    print(df.head())
    clean_payment_dataframes_2017_to_2020[year] = df
    


          Country                                       EdLevel  \
2        Thailand                             Bachelor's degree   
3   United States                             Bachelor's degree   
5          Canada                             Bachelor's degree   
8     New Zealand  college/university without bachelor's degree   
12  United States                               Master's degree   

    Monthly_Sal_EUR  
2       2298.658333  
3       6096.441667  
5       3997.666667  
8      13791.950000  
12      8994.750000  
           Country                                       EdLevel  \
7    United States                             Bachelor's degree   
9   United Kingdom                               Master's degree   
10  United Kingdom                             Bachelor's degree   
11           Spain  college/university without bachelor's degree   
12     Netherlands                              Secondary school   

    Monthly_Sal_EUR  
7      11593.233333  
9       2498

In [50]:
import codecs
import json
summary_df = None
i = 0
for year, df in clean_payment_dataframes_2017_to_2020.items():
    df = df.filter(items=['Country','Monthly_Sal_EUR', 'EdLevel'])
    
    
    # compute mean salary per country and reset the resulting df index
    
    
    for country in df.Country.unique():
        df2 = df[df['Country'] == country]
        print(df2.head())
        df2.to_csv("processed_data/salaries/{}-{}-income_education_per_country.csv".format(country,year))
   

            Country  Monthly_Sal_EUR              EdLevel
2    United Kingdom      8410.201042    Bachelor's degree
14   United Kingdom      7393.583333  Professional degree
110  United Kingdom      2402.914583    Bachelor's degree
113  United Kingdom      2218.075000    Bachelor's degree
135  United Kingdom      2402.914583    Bachelor's degree
          Country  Monthly_Sal_EUR  \
17  United States     12992.416667   
18  United States      8245.187500   
25  United States     17489.791667   
70  United States      6496.208333   
72  United States     11993.000000   

                                         EdLevel  
17                             Bachelor's degree  
18                             Bachelor's degree  
25                               Master's degree  
70  college/university without bachelor's degree  
72  college/university without bachelor's degree  
     Country  Monthly_Sal_EUR                                       EdLevel
22    Israel     10070.522100            

      Country  Monthly_Sal_EUR            EdLevel
992      Iran       199.883333    Master's degree
7453     Iran      3997.666667    Master's degree
7792     Iran      1199.300000  Bachelor's degree
9811     Iran       449.737500  Bachelor's degree
11762    Iran      3597.900000  Bachelor's degree
       Country  Monthly_Sal_EUR                                       EdLevel
1007  Bulgaria      2315.860215                             Bachelor's degree
1534  Bulgaria       215.053763  college/university without bachelor's degree
3471  Bulgaria       179.211470                             Bachelor's degree
3643  Bulgaria      2508.960573                             Bachelor's degree
5388  Bulgaria      2798.366667                             Bachelor's degree
       Country  Monthly_Sal_EUR            EdLevel
1077  Pakistan        19.988333  Bachelor's degree
2900  Pakistan       922.461583  Bachelor's degree
3256  Pakistan      1516.374046  Bachelor's degree
4693  Pakistan       183.039

      Country  Monthly_Sal_EUR                                       EdLevel
12996   Malta      1971.326165                             Bachelor's degree
21935   Malta      2164.247312                             Bachelor's degree
21974   Malta      2150.537634                             Bachelor's degree
25817   Malta      1926.523297  college/university without bachelor's degree
26718   Malta      2419.354839  college/university without bachelor's degree
      Country  Monthly_Sal_EUR                                       EdLevel
13080  Cyprus      4032.258065                             Bachelor's degree
18472  Cyprus      1281.362007                             Bachelor's degree
22590  Cyprus      2688.172043                             Bachelor's degree
49625  Cyprus      4435.483871  college/university without bachelor's degree
51378  Cyprus      8960.573477                              Secondary school
         Country  Monthly_Sal_EUR            EdLevel
13240  Nicaragua      6

    Country  Monthly_Sal_EUR            EdLevel
17   Sweden      4787.605600    Master's degree
26   Sweden      7256.864358    Master's degree
116  Sweden      6359.088367  Bachelor's degree
136  Sweden      5535.968800    Master's degree
173  Sweden      5984.507000   Secondary school
       Country  Monthly_Sal_EUR                                       EdLevel
20   Australia      9591.201867                             Bachelor's degree
142  Australia      5994.501167                               Master's degree
143  Australia      9191.535142                             Bachelor's degree
147  Australia      8791.968358  college/university without bachelor's degree
212  Australia      6154.407833                             Bachelor's degree
   Country  Monthly_Sal_EUR                                       EdLevel
24   India      1095.160783                               Master's degree
41   India     12391.167600  college/university without bachelor's degree
45   India       657.2

    Country  Monthly_Sal_EUR                                       EdLevel
179  Canada      7242.672642                             Bachelor's degree
190  Canada      7443.855217                             Bachelor's degree
346  Canada      5230.846892  college/university without bachelor's degree
359  Canada      4345.663550                              Associate degree
365  Canada      4828.481742                             Bachelor's degree
    Country  Monthly_Sal_EUR                                       EdLevel
188   Spain      2935.586575  college/university without bachelor's degree
339   Spain      1056.583300                               Master's degree
507   Spain      3424.900975  college/university without bachelor's degree
735   Spain      3669.458233                             Bachelor's degree
745   Spain      2201.914800                             Bachelor's degree
     Country  Monthly_Sal_EUR            EdLevel
207  Ukraine         1043.391    Master's degree
67

      Country  Monthly_Sal_EUR                                       EdLevel
681   Hungary        3071.4073                               Master's degree
1201  Hungary        1619.0550                              Secondary school
2013  Hungary        4584.9239                              Secondary school
2681  Hungary        2935.8864                             Bachelor's degree
2948  Hungary        1428.3663  college/university without bachelor's degree
                   Country  Monthly_Sal_EUR            EdLevel
725   United Arab Emirates      3337.951725  Bachelor's degree
1389  United Arab Emirates      5996.500000  Bachelor's degree
1863  United Arab Emirates      1599.066667  Bachelor's degree
2081  United Arab Emirates      4244.322700    Master's degree
2670  United Arab Emirates      3997.266900  Bachelor's degree
       Country  Monthly_Sal_EUR                                       EdLevel
764   Malaysia         151.1118                             Bachelor's degree
1392

                      Country  Monthly_Sal_EUR  \
4382   Bosnia and Herzegovina       976.230200   
4524   Bosnia and Herzegovina      3512.549817   
6419   Bosnia and Herzegovina      1174.114700   
9191   Bosnia and Herzegovina      6115.830350   
11443  Bosnia and Herzegovina      1501.523600   

                                            EdLevel  
4382   college/university without bachelor's degree  
4524                              Bachelor's degree  
6419   college/university without bachelor's degree  
9191   college/university without bachelor's degree  
11443                             Bachelor's degree  
          Country  Monthly_Sal_EUR  \
4432   Kazakhstan        1145.3315   
5997   Kazakhstan         911.4680   
6805   Kazakhstan        2031.6142   
8031   Kazakhstan         369.3844   
15235  Kazakhstan         479.7200   

                                            EdLevel  
4432                              Bachelor's degree  
5997   college/university without bach

          Country  Monthly_Sal_EUR            EdLevel
26350  Tajikistan          419.755  Bachelor's degree
61006  Tajikistan         5396.850    Master's degree
       Country  Monthly_Sal_EUR              EdLevel
27767  Somalia           959.44    Bachelor's degree
39928  Somalia          3597.90  Professional degree
49209  Somalia          1559.09    Bachelor's degree
76486  Somalia           239.86     Secondary school
       Country  Monthly_Sal_EUR                                       EdLevel
32368  Jamaica      1302.439800                             Bachelor's degree
76201  Jamaica       347.797000  college/university without bachelor's degree
81392  Jamaica     19988.333333                     Primary/elementary school
      Country  Monthly_Sal_EUR            EdLevel
33738  Bhutan         470.1256  Bachelor's degree
52039  Bhutan         299.8250  Bachelor's degree
      Country  Monthly_Sal_EUR            EdLevel
35256    Iraq         507.3039   Secondary school
40019    Ir

    Country  Monthly_Sal_EUR            EdLevel
117   Italy      3098.191667  Bachelor's degree
313   Italy      2298.658333  Bachelor's degree
376   Italy      2378.611667  Bachelor's degree
607   Italy       159.906667   Secondary school
676   Italy      2998.250000    Master's degree
      Country  Monthly_Sal_EUR                                       EdLevel
121   Estonia       279.836667                               Master's degree
599   Estonia       149.912500  college/university without bachelor's degree
1315  Estonia       299.825000                             Bachelor's degree
2160  Estonia       289.830833                              Secondary school
2221  Estonia       249.854167  college/university without bachelor's degree
    Country  Monthly_Sal_EUR            EdLevel
122  Turkey       799.533333  Bachelor's degree
632  Turkey       999.416667    Master's degree
696  Turkey       999.416667  Bachelor's degree
715  Turkey     15990.666667  Bachelor's degree
800  Turke

       Country  Monthly_Sal_EUR            EdLevel
1279  Slovakia       159.906667  Bachelor's degree
2850  Slovakia       499.708333    Master's degree
4483  Slovakia       299.825000  Bachelor's degree
4723  Slovakia      4497.375000  Bachelor's degree
4846  Slovakia       259.848333    Master's degree
     Country  Monthly_Sal_EUR                                       EdLevel
1384   Kenya      4997.083333                             Bachelor's degree
1561   Kenya      5596.733333                             Bachelor's degree
2616   Kenya      2998.250000                             Bachelor's degree
4377   Kenya      4497.375000                             Bachelor's degree
5285   Kenya      5996.500000  college/university without bachelor's degree
                                   Country  Monthly_Sal_EUR  \
1431  Venezuela, Bolivarian Republic of...      1798.950000   
1783  Venezuela, Bolivarian Republic of...        11.993000   
4236  Venezuela, Bolivarian Republic of...       

                      Country  Monthly_Sal_EUR            EdLevel
5993   Libyan Arab Jamahiriya       139.918333  Bachelor's degree
16568  Libyan Arab Jamahiriya       649.620833  Bachelor's degree
32819  Libyan Arab Jamahiriya      2498.541667    Master's degree
59431  Libyan Arab Jamahiriya       499.708333  Bachelor's degree
      Country  Monthly_Sal_EUR            EdLevel
6641     Cuba      1249.270833  Bachelor's degree
8347     Cuba        29.982500  Bachelor's degree
9587     Cuba       234.862917  Bachelor's degree
18666    Cuba       749.562500  Bachelor's degree
21345    Cuba        49.970833  Bachelor's degree
          Country  Monthly_Sal_EUR  \
6673   Montenegro      5996.500000   
24633  Montenegro      1998.833333   
35941  Montenegro       119.930000   
39485  Montenegro       199.883333   
54348  Montenegro        29.982500   

                                            EdLevel  
6673                                Master's degree  
24633  college/university without

       Country  Monthly_Sal_EUR                    EdLevel
51570  Andorra        14991.250        No formal education
82601  Andorra          299.825  Primary/elementary school
      Country  Monthly_Sal_EUR                                       EdLevel
52368   Haiti       1499.12500  college/university without bachelor's degree
65470   Haiti      10089.11125                             Bachelor's degree
        Country  Monthly_Sal_EUR                                       EdLevel
58912  Maldives      1699.008333                             Bachelor's degree
66514  Maldives      2398.600000                             Bachelor's degree
72876  Maldives      1099.358333  college/university without bachelor's degree
73510  Maldives      1898.891667                             Bachelor's degree
80986  Maldives      3398.016667                             Bachelor's degree
                                Country  Monthly_Sal_EUR            EdLevel
60602  Saint Vincent and the Grenadines   

      Country  Monthly_Sal_EUR            EdLevel
149   Ukraine      1998.833333  Bachelor's degree
641   Ukraine      5996.500000  Bachelor's degree
1535  Ukraine      9994.166667    Master's degree
1762  Ukraine     11993.000000  Bachelor's degree
1772  Ukraine       949.445833  Bachelor's degree
          Country  Monthly_Sal_EUR            EdLevel
150   Switzerland       999.416667    Master's degree
339   Switzerland     13991.833333   Secondary school
654   Switzerland      7495.625000  Bachelor's degree
950   Switzerland     11993.000000  Bachelor's degree
1378  Switzerland      9994.166667  Bachelor's degree
       Country  Monthly_Sal_EUR                                       EdLevel
184   Slovenia       219.871667  college/university without bachelor's degree
2485  Slovenia       349.795833                               Master's degree
2866  Slovenia       299.825000  college/university without bachelor's degree
2879  Slovenia        49.970833                              Sec

     Country  Monthly_Sal_EUR            EdLevel
805    Nepal      4997.083333  Bachelor's degree
1349   Nepal      4997.083333  Bachelor's degree
1813   Nepal     17989.500000  Bachelor's degree
5766   Nepal     12492.708333    Master's degree
5969   Nepal      6995.916667  Bachelor's degree
                                    Country  Monthly_Sal_EUR  \
820    Venezuela, Bolivarian Republic of...        29.982500   
4192   Venezuela, Bolivarian Republic of...       119.930000   
5251   Venezuela, Bolivarian Republic of...        99.941667   
5591   Venezuela, Bolivarian Republic of...         7.995333   
14906  Venezuela, Bolivarian Republic of...        29.982500   

                                            EdLevel  
820                                Secondary school  
4192                              Bachelor's degree  
5251   college/university without bachelor's degree  
5591   college/university without bachelor's degree  
14906                             Bachelor's degree

         Country  Monthly_Sal_EUR  \
5888   Nicaragua        69.959167   
16488  Nicaragua       169.900833   
16801  Nicaragua        79.953333   
18074  Nicaragua       299.825000   
18078  Nicaragua      8195.216667   

                                            EdLevel  
5888                            Professional degree  
16488  college/university without bachelor's degree  
16801                             Bachelor's degree  
18074  college/university without bachelor's degree  
18078                             Bachelor's degree  
        Country  Monthly_Sal_EUR            EdLevel
6009   Maldives      1699.008333  Bachelor's degree
13309  Maldives      2998.250000  Bachelor's degree
40333  Maldives      1798.950000  Bachelor's degree
        Country  Monthly_Sal_EUR            EdLevel
6056   Cambodia        39.976667  Bachelor's degree
20024  Cambodia        89.947500  Bachelor's degree
21799  Cambodia       231.764725  Bachelor's degree
40854  Cambodia        64.962083  Bac

      Country  Monthly_Sal_EUR            EdLevel
36321    Fiji       4362.45375  Bachelor's degree
       Country  Monthly_Sal_EUR            EdLevel
39464  Grenada       349.795833  Bachelor's degree
          Country  Monthly_Sal_EUR            EdLevel
40821  Mauritania        19.988333    Master's degree
63721  Mauritania      3497.958333  Bachelor's degree
        Country  Monthly_Sal_EUR            EdLevel
42573  Barbados        535.48745  Bachelor's degree
60455  Barbados        299.82500  Bachelor's degree
      Country  Monthly_Sal_EUR                                       EdLevel
44910   Haiti        19.988333  college/university without bachelor's degree
58910   Haiti        69.959167                             Bachelor's degree
      Country  Monthly_Sal_EUR           EdLevel
48750    Togo          29.9825  Secondary school
      Country  Monthly_Sal_EUR            EdLevel
49671  Monaco         190.6887  Bachelor's degree
      Country  Monthly_Sal_EUR           EdLevel
51

       Country  Monthly_Sal_EUR                                       EdLevel
436    Bolivia       999.416667                           Professional degree
631    Bolivia       299.825000                             Bachelor's degree
8754   Bolivia      1049.387500  college/university without bachelor's degree
9495   Bolivia       699.591667                              Secondary school
10660  Bolivia       149.912500                             Bachelor's degree
           Country  Monthly_Sal_EUR            EdLevel
470   South Africa      5296.908333  Bachelor's degree
777   South Africa      3897.725000  Bachelor's degree
1001  South Africa      4497.375000   Secondary school
1195  South Africa      3997.666667  Bachelor's degree
1581  South Africa         0.099942  Bachelor's degree
     Country  Monthly_Sal_EUR            EdLevel
580   Serbia       399.766667  Bachelor's degree
1290  Serbia       199.883333  Bachelor's degree
1553  Serbia       599.650000    Master's degree
2633  

        Country  Monthly_Sal_EUR                                       EdLevel
3298   Cameroon     14991.250000                               Master's degree
27139  Cameroon       199.883333  college/university without bachelor's degree
32349  Cameroon        19.988333                             Bachelor's degree
33942  Cameroon     12992.416667                           Professional degree
35179  Cameroon      6995.916667                             Bachelor's degree
     Country  Monthly_Sal_EUR            EdLevel
3507  Norway      2998.250000   Secondary school
3899  Norway     11993.000000  Bachelor's degree
6945  Norway      3997.666667  Bachelor's degree
7589  Norway      5196.966667  Bachelor's degree
9419  Norway      7422.567642  Bachelor's degree
         Country  Monthly_Sal_EUR  \
3614  Azerbaijan       411.659725   
5120  Azerbaijan       329.807500   
7284  Azerbaijan       199.883333   
8059  Azerbaijan       309.819167   
8444  Azerbaijan       599.650000   

         

      Country  Monthly_Sal_EUR            EdLevel
19668  Belize       299.825000  Bachelor's degree
66772  Belize      4557.340000  Bachelor's degree
79999  Belize      4597.316667    Master's degree
      Country  Monthly_Sal_EUR            EdLevel
19974   Benin        39.976667    Master's degree
74063   Benin     14991.250000  Bachelor's degree
83174   Benin        21.987167  Bachelor's degree
83435   Benin     19988.333333  Bachelor's degree
      Country  Monthly_Sal_EUR            EdLevel
20639  Bhutan       199.883333  Bachelor's degree
59424  Bhutan      2998.250000  Bachelor's degree
         Country  Monthly_Sal_EUR            EdLevel
21262  Palestine       269.842500  Bachelor's degree
28176  Palestine       539.685000  Bachelor's degree
32230  Palestine       149.912500  Bachelor's degree
33543  Palestine       469.725833  Bachelor's degree
39570  Palestine      2298.658333   Associate degree
                 Country  Monthly_Sal_EUR           EdLevel
22032  Republic of Kor

### Compute a summary pdf - average of montly salaries per country 

In [None]:
import codecs
import json
summary_df = None
i = 0
for year, df in clean_payment_dataframes_2017_to_2020.items():
    df = df.filter(items=['Country','Monthly_Sal_EUR'])
    
    df2 = df.copy(deep=True)
    
    # compute mean salary per country and reset the resulting df index
    df =  df.groupby('Country').mean().round(2).reset_index()
    
    # add column names 
    df.columns = ['Country', year]
    
    # merge the resulting dataframes into a single df columns = years, rows = countries
    if i > 0:
        summary_df = pd.DataFrame.merge(summary_df,df,on='Country')
        i+=1 
    else:
        summary_df =  df
        i+=1
        
    df2.columns = ['Country', 'respondentsCount']

    df2 = df2.groupby(by=['Country'] , as_index=False).count()
    
    print(df2.shape)
        
    print(df2.head())
    
    json_dict2 = {}
    for index, row in df2.iterrows():
        json_dict2[row["Country"]] = row["respondentsCount"]
  

    
  
    with codecs.open("processed_data/salaries/{}-income_respondents_count_per_country.json".format(year), "w", encoding='utf-8') as outfile:
        json.dump(json_dict2, outfile, indent = 4, ensure_ascii=False)

# Transpose the summary_df, now  columns = countries and rows = years 
summary_df = summary_df.T

# Pick all the countries -> first row of the transposed matrix
column_names = summary_df.iloc[0]
column_names = list(column_names) 

# drop the first row containing countries' names, instead make the countries the column names 
summary_df = summary_df.drop('Country')
summary_df.columns = column_names

# You can finally do some simple plots 
summary_df.plot( xticks = [2017, 2018, 2019, 2020, 2021], title= "A dirty plot of the world :P ")
summary_df[['Uzbekistan','France'] ].plot( xticks = [2017, 2018, 2019, 2020, 2021], title = "Uzbekistan vs. France ")
summary_df.to_json(path_or_buf = 'processed_data/salaries/average_montly_salary_in_Euros_per_country_per_year.json', index=True, indent = 4)


In [None]:
summary_df

In [None]:
df = clean_payment_dataframes_2017_to_2020[2017]
df[df.Monthly_Sal_EUR > max_montly_salary]

In [None]:
summary_df = None
i = 0
for year, df in clean_payment_dataframes_2017_to_2020.items():
    df = df.filter(items=['Country', 'YearsCodePro'] )
    print(df.head())
    # compute mean salary per country and reset the resulting df index
    df =  df.groupby('Country').mean().astype(int).reset_index()
    print(df.head())

    # add column names 
    df.columns = ['Country', year]
    
    # merge the resulting dataframes into a single df columns = years, rows = countries
    if i > 0:
        summary_df = pd.DataFrame.merge(summary_df,df,on='Country')
        i+=1 
    else:
        summary_df =  df
        i+=1

# Transpose the summary_df, now  columns = countries and rows = years 
summary_df = summary_df.T

# Pick all the countries -> first row of the transposed matrix
column_names = summary_df.iloc[0]
column_names = list(column_names) 

# drop the first row containing countries' names, instead make the countries the column names 
summary_df = summary_df.drop('Country')
summary_df.columns = column_names

# You can finally do some simple plots 
summary_df.plot( xticks = [2017, 2018, 2019, 2020, 2021], title= "A dirty plot of the world :P ")
summary_df[['Uzbekistan','France'] ].plot( xticks = [2017, 2018, 2019, 2020, 2021], title = "Uzbekistan vs. France ")

summary_df.to_json(path_or_buf = 'processed_data/experience/average_experience_per_country_per_year.json', index=True, indent=4)

In [None]:
summary_df
