# Data Preprocessing - Parte 4 (Maps)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def load_data(file_path, sep=',', encoding='utf-8'):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file_path : path or file-like
        Forwarded unchanged to ``pd.read_csv``.
    sep : str, default ','
        Field delimiter forwarded to ``pd.read_csv``.
    encoding : str, default 'utf-8'
        Text encoding forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when any exception is raised while
        loading (the error is printed rather than propagated).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path, sep=sep, encoding=encoding)
        print("Data loaded successfully.")
    except Exception as err:
        # Best-effort loader: report the failure and signal it with None
        # so callers can test the result instead of handling exceptions.
        print(f"An error occurred while loading the data: {err}")
        frame = None
    return frame

def summarize_data(data):
    """Print and return the ``describe()`` statistics of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame or None
        Frame to summarize; ``None`` is accepted and reported.

    Returns
    -------
    pandas.DataFrame or None
        The result of ``data.describe()``, or ``None`` when ``data`` is
        ``None``.
    """
    # Guard clause: there is nothing to describe when no frame was supplied.
    if data is None:
        print("No data to summarize.")
        return None

    stats = data.describe()
    print("Data summary:")
    print(stats)
    return stats

In [5]:
# Read the cumulative migration CSV; load_data yields None if reading fails,
# in which case summarize_data reports that there is nothing to summarize.
data = load_data(
    "../data/section_4/migration_year_cumulative.csv",
    sep=',',
)
summary = summarize_data(data)

Data loaded successfully.
Data summary:
              year    population
count  2052.000000  2.052000e+03
mean   2013.667154  1.075480e+05
std       6.983775  6.799276e+05
min    2001.000000  5.000000e+00
25%    2008.000000  4.175000e+01
50%    2014.000000  2.595000e+02
75%    2020.000000  3.665000e+03
max    2024.000000  1.125695e+07


In [6]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,year,origin_location_code,asylum_location_code,population
0,2001,AFG,ARE,66
1,2001,AFG,AUS,6714
2,2001,AFG,AUT,1049
3,2001,AFG,AZE,3522
4,2001,AFG,BEL,714


In [None]:
# Inspect the rows for year 2001 whose origin_location_code is 'AFG',
# keeping only the asylum location and population columns.
data.loc[
    data['year'].eq(2001) & data['origin_location_code'].eq('AFG'),
    ['asylum_location_code', 'population'],
]


year
2001        30
2002        18
2003       190
2004        98
2005        86
2006       117
2007       129
2008       141
2009       150
2010      6600
2011      3303
2012     16475
2013     17184
2014    861128
2015    731923
2016    179679
2017    228426
2018    217517
2019    217422
2020    217335
2021    201600
2022    157131
2023    105228
2024     63708
Name: population, dtype: int64


In [None]:
# Total population per (year, origin_location_code) pair, as a flat frame.
pop_sum = (
    data
    .groupby(['year', 'origin_location_code'])['population']
    .sum()
    .reset_index()
)
# Yearly totals over every origin except AFG.
print(
    pop_sum.loc[pop_sum['origin_location_code'].ne('AFG')]
    .groupby('year')['population']
    .sum()
)


In [18]:
# Show the per-year totals for the AFG origin only.
print(pop_sum.loc[pop_sum['origin_location_code'].eq('AFG')])


    year origin_location_code  population
0   2001                  AFG     9994718
3   2002                  AFG     5145439
6   2003                  AFG     6215659
10  2004                  AFG     6397976
13  2005                  AFG     5247169
16  2006                  AFG     5135322
18  2007                  AFG     5789045
21  2008                  AFG     6316556
25  2009                  AFG     6613409
29  2010                  AFG     9542285
33  2011                  AFG     9111306
37  2012                  AFG     8797143
40  2013                  AFG     6915973
43  2014                  AFG     6591503
47  2015                  AFG     6988558
51  2016                  AFG     6655304
55  2017                  AFG     7958819
60  2018                  AFG     8178373
66  2019                  AFG     7945902
71  2020                  AFG     6746264
76  2021                  AFG     9976609
81  2022                  AFG    19882655
86  2023                  AFG    2