### Import libraries and dependencies

In [1]:
import pandas as pd

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation and chained-assignment warnings - consider narrowing it.
warnings.filterwarnings("ignore")

# Display floats with thousands separators and no decimal places.
pd.options.display.float_format = '{:,.0f}'.format

In [2]:
# Ordered (old, new) substitutions applied to the cells of the first header row.
# The order matters: the longer, more specific phrases must be rewritten before
# the shorter prefixes they contain, and the parentheses are stripped last.
_HEADER_REPLACEMENTS = (
    ('International migrant stock at mid-year by age (both sexes)', 'both_sexes'),
    ('International migrant stock at mid-year by age ', ''),
    ('Total population of both sexes at mid-year (thousands)g', 'both_sexes'),
    ('Total female population at mid-year (thousands)', 'female'),
    ('Total male population at mid-year (thousands)', 'male'),
    ('International migrant stock as percentage of the total population ', ''),
    ('Percentage distribution of the international migrant stock ', ''),
    ('Female migrants as a percentage of the international migrant stock', 'FemaleMigr_%_IMS'),
    ('International migrant stock at mid-year ', ''),
    ('Annual rate of change of the migrant stock ', ''),
    ('Estimated refugee stock (including asylum seekers) at mid-year (both sexes)', 'both_sexes'),
    ('Refugees and asylum seekers as a percentage of the international migrant stock', 'RefAsylum_%_IMS'),
    ('Annual rate of change of the refugee stock (including asylum seekers)', 'ARchange_ofRefAsylum'),
    ('(', ''),
    (')', ''),
)


def _clean_header_label(label):
    '''Shorten one verbose header label by applying _HEADER_REPLACEMENTS in order.'''
    for old, new in _HEADER_REPLACEMENTS:
        label = label.replace(old, new)
    return label


def _tidy_sheet(raw):
    '''
    Turn a sheet parsed with header=None into a dataframe with flat column names.

    The first two rows of `raw` are treated as a two-level header: the labels of
    the first row are shortened, both rows are forward filled across columns and
    joined with '_' into one name per column. The two header rows and all rows
    that contain only NaN are then removed. `raw` itself is not modified.
    '''
    df = raw.copy()

    # Shorten the string labels of the first row. Each cell is addressed with a
    # single .iat call: chained indexing (df.iloc[:1][i][0] = ...) only writes
    # through when the intermediate objects happen to be views.
    for pos in range(df.shape[1]):
        cell = df.iat[0, pos]
        if isinstance(cell, str):
            df.iat[0, pos] = _clean_header_label(cell)

    # Forward fill the header rows across columns so that a label spanning
    # several columns reaches each of them, then blank out what is still missing.
    df.iloc[0:2] = df.iloc[0:2].ffill(axis=1).fillna('')

    # Join the non-empty parts of the two header rows into one column name.
    df.columns = [
        '_'.join(str(part) for part in df.iloc[0:2, pos] if part)
        for pos in range(df.shape[1])
    ]

    # Drop the two header rows and the rows holding only NaN values.
    df = df.iloc[2:].dropna(how='all').copy()

    # Convert the year to integer only when a column is named exactly 'Year';
    # a substring test would raise KeyError on sheets whose only match is a
    # longer label that merely contains the word.
    if 'Year' in df.columns:
        df['Year'] = df['Year'].astype(int)

    return df


def excel_parser(file_path):
    '''
    Read the data sheets of an Excel workbook into cleaned dataframes.

    Every sheet except the first one and the last two is parsed without a header
    after skipping the first 14 rows, and its two header rows are flattened into
    single-level column names (see _tidy_sheet).

    Parameters
    ----------
    file_path : path of the Excel workbook to read.

    Returns
    -------
    list of dict
        One single-entry dictionary per parsed sheet, in workbook order. The key
        is 'df_' followed by the sheet name with spaces removed, the value is
        the cleaned dataframe (its index still carries the original row
        positions).
    '''
    all_sheets_df = []

    # The context manager closes the workbook even when parsing a sheet fails.
    with pd.ExcelFile(file_path) as xl:
        for sheet in xl.sheet_names[1:-2]:
            # NOTE(review): convert_float was deprecated in pandas 1.3 - confirm
            # that the installed pandas still honours it. It is kept because the
            # existing column names (e.g. 'both sexes_1990.0') depend on it.
            raw = xl.parse(sheet, skiprows=14, convert_float=False, header=None)
            df_name = 'df_' + sheet.replace(" ", "")
            all_sheets_df.append({df_name: _tidy_sheet(raw)})

    return all_sheets_df

## I. Read Migrant Stock Datasets

### i. Reading dataset of _UN Migrant Stock By Age And Sex 2019_

In [3]:
# Parse all data sheets of the "migrant stock by age and sex" workbook.
file_path_AgeSex = "./data/UN_data/UN_MigrantStockByAgeAndSex_2019.xlsx"
MigrStock_ageSex = excel_parser(file_path_AgeSex)

# excel_parser returns one single-entry dict per sheet, in workbook order;
# the index is reset because the parser keeps the original row positions.
df_Table1_AgeSex = MigrStock_ageSex[0]['df_Table1'].reset_index(drop=True)
df_Table2_AgeSex = MigrStock_ageSex[1]['df_Table2'].reset_index(drop=True)
df_Table3_AgeSex = MigrStock_ageSex[2]['df_Table3'].reset_index(drop=True)
df_Table4_AgeSex = MigrStock_ageSex[3]['df_Table4'].reset_index(drop=True)
df_Table5_AgeSex = MigrStock_ageSex[4]['df_Table5'].reset_index(drop=True)

# The ANNEX sheet is read separately; the context manager closes the workbook
# handle once the sheet has been parsed.
with pd.ExcelFile(file_path_AgeSex) as xl_AgeSex:
    df_ANNEX_AgeSex = xl_AgeSex.parse('ANNEX', skiprows=15, convert_float=False).drop(['Index'], axis=1)


In [4]:
# Preview the first rows to check the flattened column names.
df_Table1_AgeSex.head()

Unnamed: 0,Year,Sort\norder,"Major area, region, country or area of destination",Notes,Code,Type of data a,both_sexes_0-4,both_sexes_5-9,both_sexes_10-14,both_sexes_ 15-19,...,female_35-39,female_40-44,female_45-49,female_50-54,female_55-59,female_60-64,female_65-69,female_70-74,female_75+,female_Total
0,1990,1990001,WORLD,,900.0,,4767961,6176471,7729769,9753361,...,6863925,6193801,5202241,4431846,3993850,3702649,2977746,2543862,4929603,75349784
1,1990,1990002,UN development groups,,,,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
2,1990,1990003,More developed regions,b,901.0,,1218097,2336625,3515354,4778797,...,4114643,3845971,3278864,2791841,2524115,2334839,1825701,1567961,3036174,42340418
3,1990,1990004,Less developed regions,c,902.0,,3549864,3839846,4214415,4974564,...,2749282,2347830,1923377,1640005,1469735,1367810,1152045,975901,1893429,33009366
4,1990,1990005,Least developed countries,d,941.0,,1016860,1130722,1102576,1206354,...,370929,289525,221254,190213,156055,141991,78103,51194,70448,5509988


### ii. Reading dataset of _UN Migrant Stock By Origin And Destination 2019_

In [5]:
# Parse all data sheets of the "migrant stock by origin and destination" workbook.
file_path_OriginDestination = "./data/UN_data/UN_MigrantStockByOriginAndDestination_2019.xlsx"
MigrStock_originDestination = excel_parser(file_path_OriginDestination)

# excel_parser returns one single-entry dict per sheet, in workbook order;
# the index is reset because the parser keeps the original row positions.
df_Table1_OriginDest = MigrStock_originDestination[0]['df_Table1'].reset_index(drop=True)
df_Table2_OriginDest = MigrStock_originDestination[1]['df_Table2'].reset_index(drop=True)
df_Table3_OriginDest = MigrStock_originDestination[2]['df_Table3'].reset_index(drop=True)

# The ANNEX sheet is read separately; the context manager closes the workbook
# handle once the sheet has been parsed.
with pd.ExcelFile(file_path_OriginDestination) as xl_OriginDest:
    df_ANNEX_OriginDest = xl_OriginDest.parse('ANNEX', skiprows=15, convert_float=False).drop(['Index'], axis=1)


In [6]:
# Preview the first rows to check the flattened column names.
df_Table1_OriginDest.head()

Unnamed: 0,Year,Sort\norder,"Major area, region, country or area of destination",Notes,Code,Type of data a,Type of data a_Total,Type of data a_Other South,Type of data a_Other North,Country or area of origin_Afghanistan,...,Country or area of origin_Uruguay,Country or area of origin_Uzbekistan,Country or area of origin_Vanuatu,Country or area of origin_Venezuela (Bolivarian Republic of),Country or area of origin_Viet Nam,Country or area of origin_Wallis and Futuna Islands,Country or area of origin_Western Sahara,Country or area of origin_Yemen,Country or area of origin_Zambia,Country or area of origin_Zimbabwe
0,1990,1990001,WORLD,,900.0,,153011473,6548526,2366800,6823350,...,237486,1428020,5060,185946,1237873,6484,168239,455492,85203,204365
1,1990,1990002,UN development groups,,,,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
2,1990,1990003,More developed regions,b,901.0,,82767216,3385103,1077179,119386,...,56838,1078563,1017,114991,1085310,884,333,11457,26062,40957
3,1990,1990004,Less developed regions,c,902.0,,70244257,3163423,1289621,6703964,...,180648,349457,4043,70955,152563,5600,167906,444035,59141,163408
4,1990,1990005,Least developed countries,d,941.0,,11060221,482753,239756,0,...,286,2027,9,2510,71579,0,0,357,26254,75122


### iii. Reading dataset of _UN Migrant Stock Total 2019_

In [7]:
# Parse all data sheets of the "migrant stock total" workbook.
file_path_StockTotal = "./data/UN_data/UN_MigrantStockTotal_2019.xlsx"
MigrStock_StockTotal = excel_parser(file_path_StockTotal)

# excel_parser returns one single-entry dict per sheet, in workbook order;
# the index is reset because the parser keeps the original row positions.
df_Table1_StockTotal = MigrStock_StockTotal[0]['df_Table1'].reset_index(drop=True)
df_Table2_StockTotal = MigrStock_StockTotal[1]['df_Table2'].reset_index(drop=True)
df_Table3_StockTotal = MigrStock_StockTotal[2]['df_Table3'].reset_index(drop=True)
df_Table4_StockTotal = MigrStock_StockTotal[3]['df_Table4'].reset_index(drop=True)
df_Table5_StockTotal = MigrStock_StockTotal[4]['df_Table5'].reset_index(drop=True)
df_Table6_StockTotal = MigrStock_StockTotal[5]['df_Table6'].reset_index(drop=True)

# The ANNEX sheet is read separately; the context manager closes the workbook
# handle once the sheet has been parsed.
with pd.ExcelFile(file_path_StockTotal) as xl_StockTotal:
    df_ANNEX_StockTotal = xl_StockTotal.parse('ANNEX', skiprows=15, convert_float=False).drop(['Index'], axis=1)


In [8]:
# Preview the first rows to check the flattened column names.
df_Table1_StockTotal.head()

Unnamed: 0,Sort\norder,"Major area, region, country or area of destination",Notes,Code,Type of data a,both sexes_1990.0,both sexes_1995.0,both sexes_2000.0,both sexes_2005.0,both sexes_2010.0,...,male_2010.0,male_2015.0,male_2019.0,female_1990.0,female_1995.0,female_2000.0,female_2005.0,female_2010.0,female_2015.0,female_2019.0
0,1,WORLD,,900.0,,153011473,161316895,173588441,191615574,220781909,...,114061680,128863389,141488004,75349784,79630779,85559220,93754736,106720229,119997907,130154101
1,2,UN development groups,,,,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
2,3,More developed regions,b,901.0,,82767216,92935095,103961989,116687616,130613460,...,63408858,67824389,73765353,42340418,47557507,53160091,59609215,67204602,72818928,78303908
3,4,Less developed regions,c,902.0,,70244257,68381800,69626452,74927958,90168449,...,50652822,61039000,67722651,33009366,32073272,32399129,34145521,39515627,47178979,51850193
4,5,Least developed countries,d,941.0,,11060221,11681777,10063948,9833150,10432671,...,5185496,6784461,8086158,5509988,5857700,5030016,4845613,5247175,6846888,8202865


## II. Read Migrant Flow Datasets

### i. migflows_allcountries_1990_2015.csv Data Details:

**Row for each migration corridor - period combination (200 origins x 200 destinations x 5 periods = 200,000).**

- year0 - first year of five year period
- orig - origin ISO three letter country code
- dest - destination ISO three letter country code

**Columns for estimates based on the following migration flow estimation methods:**

Stock Differencing Approaches: 
- sd_drop_neg - https://doi.org/10.1016/j.jdeveco.2009.11.004
- sd_rev_neg - https://doi.org/10.1111/sjoe.12098

Migration Rate Approach:
- mig_rate - https://doi.org/10.1002/9781118937464.ch7

Demographic Accounting Approaches:
- da_min_open - https://doi.org/10.4054/DemRes.2013.28.18
- da_min_closed - https://doi.org/10.1111/imre.12327
- da_pb_closed - https://doi.org/10.1073/PNAS.1722334116

In [9]:
# Load the migration-flow estimates and preview the first five rows.
df_MigrFlows = pd.read_csv(
    filepath_or_buffer="./data/Abel/migflows_allcountries_1990_2015.csv"
)
df_MigrFlows.head(n=5)

Unnamed: 0,year0,orig,dest,sd_drop_neg,sd_rev_neg,mig_rate,da_min_open,da_min_closed,da_pb_closed
0,1990,BDI,BDI,0,0,0,0,0,0
1,1990,COM,BDI,0,0,0,0,0,0
2,1990,DJI,BDI,0,0,0,0,0,0
3,1990,ERI,BDI,0,0,0,0,0,83
4,1990,ETH,BDI,0,0,0,0,0,2


### ii. migflows_allcountries_gender_separated_1990_2015.csv Data Details

**Column details:**
- stock: stock data source used for the estimated flow (un12, un13, un15 or wb11)
- demo: demographic data source used for the estimated flow (wpp2010, wpp2012, wpp2015)
- sex: gender of the estimated flow
- year0: first year of the period of the estimated flow (ranging from 1960 to 2010, and varying in lengths depending on the stock and demo data, as described in the paper)
- interval: the length of the period of the estimated flow 
- orig: ISO 3166-1 alpha-3 letter code for the origin country of the estimated flow
- dest: ISO 3166-1 alpha-3 letter code for the destination country of the estimated flow
- orig_code: ISO 3166-1 numeric code for the origin country of the estimated flow
- dest_code: ISO 3166-1 numeric code for the destination country of the estimated flow
- flow: estimated flow

More details on the country codes:
- https://www.iso.org/iso-3166-country-codes.html
- https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3
- https://en.wikipedia.org/wiki/ISO_3166-1_numeric

More details on tidy data:
- http://vita.had.co.nz/papers/tidy-data.pdf

In [10]:
# Load the gender-separated migration-flow estimates and preview the first five rows.
df_MigrFlowSex = pd.read_csv(
    filepath_or_buffer="./data/Abel/migflows_allcountries_gender_separated_1990_2015.csv"
)
df_MigrFlowSex.head(n=5)

Unnamed: 0,stock,demo,sex,year0,interval,orig,dest,orig_code,dest_code,flow
0,un12,wpp2010,b,1990,10,ABW,ABW,533,533,0
1,un12,wpp2010,b,1990,10,ABW,AFG,533,4,0
2,un12,wpp2010,b,1990,10,ABW,AGO,533,24,0
3,un12,wpp2010,b,1990,10,ABW,ALB,533,8,0
4,un12,wpp2010,b,1990,10,ABW,ANT,533,530,0


## III. Read Refugee population by country or territory of asylum Datasets

Useful information about some terms in the Refugee-Asylum Dataset:

- Refugees are people who are recognized as refugees under the 1951 Convention Relating to the Status of Refugees or its 1967 Protocol, the 1969 Organization of African Unity Convention Governing the Specific Aspects of Refugee Problems in Africa, people recognized as refugees in accordance with the UNHCR statute, people granted refugee-like humanitarian status, and people provided temporary protection. 

- Asylum seekers--people who have applied for asylum or refugee status and who have not yet received a decision or who are registered as asylum seekers--are excluded. 

- Palestinian refugees are people (and their descendants) whose residence was Palestine between June 1946 and May 1948 and who lost their homes and means of livelihood as a result of the 1948 Arab-Israeli conflict. Country of asylum is the country where an asylum claim was filed and granted.


__Source:__ Data before 2018 are from the United Nations High Commissioner for Refugees (UNHCR), Statistics Database, Statistical Yearbook and data files, complemented by statistics on Palestinian refugees under the mandate of the UNRWA as published on its website. Data from UNHCR are available online at: unhcr.org/en-us/figures-at-a-glance.html. The data for 2018 are from the UNHCR Global Trends report 2018 and the World Bank's estimates (for Palestinian refugees in the UNRWA areas of operation).


In [11]:
# Read the 'Data' sheet of the refugee workbook, skipping its first 3 rows;
# the context manager closes the workbook handle once the sheet is parsed.
file_path_Refugee = "./data/TheWorldBankRefugee/API_SM.POP.REFG_DS2_en_excel_v2_890428.xls"
with pd.ExcelFile(file_path_Refugee) as xl_Refugee:
    df_Refugee = xl_Refugee.parse('Data', skiprows=3)

In [12]:
# Columns in which every value is NaN carry no information for the further
# analysis: collect their labels, then drop them from the frame.
nan_columns = df_Refugee.columns[df_Refugee.isna().all(axis=0)]
df_Refugee = df_Refugee.drop(columns=nan_columns)

df_Refugee.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1990,1991,1992,1993,1994,1995,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Aruba,ABW,Refugee population by country or territory of ...,SM.POP.REFG,,,,,,,...,,,,,1.0,,2.0,1.0,,
1,Afghanistan,AFG,Refugee population by country or territory of ...,SM.POP.REFG,50.0,38.0,60025.0,32132.0,19131.0,19605.0,...,37.0,6434.0,3009.0,16187.0,16863.0,300423.0,257554.0,59771.0,75927.0,72231.0
2,Angola,AGO,Refugee population by country or territory of ...,SM.POP.REFG,11557.0,11022.0,11002.0,10878.0,10686.0,10884.0,...,14734.0,15155.0,16223.0,23413.0,23783.0,15474.0,15555.0,15555.0,41109.0,39865.0
3,Albania,ALB,Refugee population by country or territory of ...,SM.POP.REFG,,,3000.0,3000.0,3000.0,4720.0,...,70.0,76.0,82.0,86.0,93.0,104.0,104.0,138.0,89.0,131.0
4,Andorra,AND,Refugee population by country or territory of ...,SM.POP.REFG,,,,,,,...,,,,,,,,,,
