In [295]:
import pandas as pd
import altair as alt
import glob
import os
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [296]:
# create list of filenames
folder_path = '../data/data-unified/AA-NationaleVisa/tabula/'
files_AA_dvisas = glob.glob(os.path.join(folder_path, '*.csv*'))
files_AA_dvisas

['../data/data-unified/AA-NationaleVisa/tabula/tabula-AA_nationalvisa-2019.csv',
 '../data/data-unified/AA-NationaleVisa/tabula/tabula-AA_nationalvisa-2024.csv',
 '../data/data-unified/AA-NationaleVisa/tabula/tabula-AA_nationalvisa-2023.csv',
 '../data/data-unified/AA-NationaleVisa/tabula/tabula-AA_nationalvisa-2022.csv',
 '../data/data-unified/AA-NationaleVisa/tabula/tabula-AA_nationalvisa-2020.csv',
 '../data/data-unified/AA-NationaleVisa/tabula/tabula-AA_nationalvisa-2021.csv']

In [297]:
def read_csv_df(filename):
    df_year = pd.read_csv(filename, delimiter=';') #read in one file
    year_str= re.sub(r"(.*)(\d\d\d\d)(.*)",r"\2",filename) #grab year from filename
    df_year['year']=year_str #save year as a separate column to df
    return df_year

In [298]:
AA_DVISA_DATA=pd.DataFrame() # create empty dataframe

In [299]:
for file in files_AA_dvisas: #read each file
    df_yearly = read_csv_df(file) # into a df
    AA_DVISA_DATA=pd.concat([AA_DVISA_DATA,df_yearly]) #concat to empty dataframe

In [300]:
AA_DVISA_DATA.head() #inspect dataframe

Unnamed: 0,Land,Auslandsvertretung,Ehegattennachzug,Kindernachzug,Sonst_Familiennachzug,Studium,Sprachkurs_Schulbesuch,Erwerbstätigkeit,JüdischeZuwanderung,Spätaussiedler,HumanitäreAufnahmeResettlement,SonstigeAufenthaltszwecke,Gesamt,year,Elternnachzug
0,Afghanistan,Kabul,356,400,81,34,17,6,0,0,0,13,907,2019,
1,Ägypten,Kairo,1097,801,113,2394,46,1057,0,0,1107,101,6716,2019,
2,Albanien,Tirana,1343,1181,43,498,243,4914,0,0,0,38,8260,2019,
3,Algerien,Algier,300,55,34,33,145,155,0,0,0,85,807,2019,
4,Angola,Luanda,8,8,5,5,4,13,0,0,0,16,59,2019,


In [301]:
REGIONEN = pd.read_csv('../data/Welt_Laender_Code.csv') # reading in German country names + ISO2 codes
REGIONEN.head()

Unnamed: 0,Countryname_German,Country_Code,Region
0,Belgien,BE,EU
1,Bulgarien,BG,EU
2,Bosnien und Herzegowina,BA,EU
3,Montenegro,ME,EU
4,Tschechische Republik,CZ,EU


In [302]:
# merging ISO2 codes onto data
AA_DVISA_DATA_coded = AA_DVISA_DATA.merge(REGIONEN[['Countryname_German','Country_Code']], left_on='Land', right_on='Countryname_German')
AA_DVISA_DATA_coded.head()

Unnamed: 0,Land,Auslandsvertretung,Ehegattennachzug,Kindernachzug,Sonst_Familiennachzug,Studium,Sprachkurs_Schulbesuch,Erwerbstätigkeit,JüdischeZuwanderung,Spätaussiedler,HumanitäreAufnahmeResettlement,SonstigeAufenthaltszwecke,Gesamt,year,Elternnachzug,Countryname_German,Country_Code
0,Afghanistan,Kabul,356,400,81,34,17,6,0,0,0,13,907,2019,,Afghanistan,AF
1,Ägypten,Kairo,1097,801,113,2394,46,1057,0,0,1107,101,6716,2019,,Ägypten,EG
2,Albanien,Tirana,1343,1181,43,498,243,4914,0,0,0,38,8260,2019,,Albanien,AL
3,Algerien,Algier,300,55,34,33,145,155,0,0,0,85,807,2019,,Algerien,DZ
4,Angola,Luanda,8,8,5,5,4,13,0,0,0,16,59,2019,,Angola,AO


In [303]:
# check length is identical to pre-merge
if len(AA_DVISA_DATA) != len(AA_DVISA_DATA_coded):
    print('mismatch')
    before_merge_country_list = AA_DVISA_DATA.Land.unique()
    after_merge_country_list = AA_DVISA_DATA_coded.Land.unique()
    difference = list(set(before_merge_country_list) - set(after_merge_country_list))
    difference # see the mismatch between both lists
else:
    print(f'Identical lengths: {len(AA_DVISA_DATA)} and {len(AA_DVISA_DATA_coded)}')

Identical lengths: 1036 and 1036


In [304]:
# reading in ISO3 codes and English country names
REGIONS = pd.read_csv('../data/country-and-continent-codes-list.csv', delimiter=';')
REGIONS = REGIONS.drop_duplicates(subset='ISO-alpha2 code')
REGIONS.head()

Unnamed: 0,continent,region,subregion,country,M49 code,ISO-alpha3 code,ISO-alpha2 code,Other groupings
0,Asia,Southern Asia,Southern Asia,Afghanistan,4,AFG,AF,LDC LLDC
1,Europe,Southern Europe,Southern Europe,Albania,8,ALB,AL,
2,Americas,Northern America,Northern America,Antarctica,10,ATA,AQ,
3,Africa,Northern Africa,Northern Africa,Algeria,12,DZA,DZ,
4,Oceania,Polynesia,Polynesia,American Samoa,16,ASM,AS,SIDS


In [305]:
#merging English country names and ISO codes to dataframe
AA_DVISA_DATA_regionalized = AA_DVISA_DATA_coded.merge(REGIONS[['continent','region','country','ISO-alpha2 code','ISO-alpha3 code']], left_on='Country_Code', right_on='ISO-alpha2 code', how='left')
AA_DVISA_DATA_regionalized.head()

Unnamed: 0,Land,Auslandsvertretung,Ehegattennachzug,Kindernachzug,Sonst_Familiennachzug,Studium,Sprachkurs_Schulbesuch,Erwerbstätigkeit,JüdischeZuwanderung,Spätaussiedler,HumanitäreAufnahmeResettlement,SonstigeAufenthaltszwecke,Gesamt,year,Elternnachzug,Countryname_German,Country_Code,continent,region,country,ISO-alpha2 code,ISO-alpha3 code
0,Afghanistan,Kabul,356,400,81,34,17,6,0,0,0,13,907,2019,,Afghanistan,AF,Asia,Southern Asia,Afghanistan,AF,AFG
1,Ägypten,Kairo,1097,801,113,2394,46,1057,0,0,1107,101,6716,2019,,Ägypten,EG,Africa,Northern Africa,Egypt,EG,EGY
2,Albanien,Tirana,1343,1181,43,498,243,4914,0,0,0,38,8260,2019,,Albanien,AL,Europe,Southern Europe,Albania,AL,ALB
3,Algerien,Algier,300,55,34,33,145,155,0,0,0,85,807,2019,,Algerien,DZ,Africa,Northern Africa,Algeria,DZ,DZA
4,Angola,Luanda,8,8,5,5,4,13,0,0,0,16,59,2019,,Angola,AO,Africa,Sub-Saharan Africa,Angola,AO,AGO


In [306]:
#check length mismatch
print(len(AA_DVISA_DATA_coded))
print(len(AA_DVISA_DATA_regionalized))

1036
1036


In [316]:
#aggregate all visas issued by continent and year
GRANTED_DVISA_continent = AA_DVISA_DATA_regionalized.groupby(by=['continent','year'])['Gesamt'].sum().reset_index()
GRANTED_DVISA_continent

Unnamed: 0,continent,year,Gesamt
0,Africa,2019,28877
1,Africa,2020,16026
2,Africa,2021,30871
3,Africa,2022,39122
4,Africa,2023,43917
5,Africa,2024,50815
6,Americas,2019,23517
7,Americas,2020,11830
8,Americas,2021,18621
9,Americas,2022,23702


In [309]:
line = alt.Chart(GRANTED_DVISA_continent).mark_line().encode(
    x='year',
    y='Gesamt',
    color='continent'
).properties(width=600)

dots = alt.Chart(GRANTED_DVISA_continent).mark_point().encode(
    x='year',
    y='Gesamt',
    color='continent'
).properties(width=600, title='Granted national visa over time by continent (note that differences between continents might be due to the number of applications filed')

line+dots

In [310]:
#aggreating all visas both total granted (gesamt) and work visas (erwerbstätigkeit) per year and continent
GRANTED_DVISA_Erwerb_continent = AA_DVISA_DATA_regionalized.groupby(by=['continent','year']).agg({
    'Gesamt': 'sum',
    'Erwerbstätigkeit': 'sum',
}).reset_index()

GRANTED_DVISA_Erwerb_continent #reporting-relevant

Unnamed: 0,continent,year,Gesamt,Erwerbstätigkeit
0,Africa,2019,28877,5694
1,Africa,2020,16026,3078
2,Africa,2021,30871,7542
3,Africa,2022,39122,12314
4,Africa,2023,43917,16992
5,Africa,2024,50815,20545
6,Americas,2019,23517,10659
7,Americas,2020,11830,5013
8,Americas,2021,18621,8019
9,Americas,2022,23702,11491


In [311]:
#aggreating all visas both total granted (gesamt) and work visas (erwerbstätigkeit) per year and region
GRANTED_DVISA_Erwerb_continent = AA_DVISA_DATA_regionalized.groupby(by=['region','year']).agg({
    'Gesamt': 'sum',
    'Erwerbstätigkeit': 'sum',
}).reset_index()

GRANTED_DVISA_Erwerb_continent #reporting-relevant

Unnamed: 0,region,year,Gesamt,Erwerbstätigkeit
0,Australia and New Zealand,2019,1038,746
1,Australia and New Zealand,2020,548,354
2,Australia and New Zealand,2021,486,293
3,Australia and New Zealand,2022,734,537
4,Australia and New Zealand,2023,1266,868
5,Australia and New Zealand,2024,1539,1169
6,Central America,2019,5420,1512
7,Central America,2020,2305,636
8,Central America,2021,4397,1492
9,Central America,2022,5507,1944
