# Data Preparation

## Clean data and create new variables

### Libraries and settings

In [404]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/Users/almahalimi/Documents/GitHub/DataAnalyticsLN


### Importing data

In [405]:
# Show .csv - files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv('unhcr_refugee_detailed_data.csv', sep=',', encoding='utf-8')

# Show first records of data frame
df.head()


refugee_data_and_gdp.csv
prepared_refugee_data.csv
gdp_data_1998_2024.csv
combined.csv
unhcr_refugee_detailed_data.csv


Unnamed: 0,year,coo_id,coa_id,coo_name,coa_name,coo,coa,coo_iso,coa_iso,f_0_4,...,f_other,f_total,m_0_4,m_5_11,m_12_17,m_18_59,m_60,m_other,m_total,total
0,2001,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,0,0,0,7756
1,2002,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,0,0,0,9227
2,2003,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,0,0,0,9821
3,2004,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,0,0,0,9923
4,2005,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,0,0,0,8931


### Count number of rows and columns in the data frame

In [406]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (1008, 24)
Number of rows: 1008
Number of columns: 24


### Get data types

In [407]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

year         int64
coo_id       int64
coa_id       int64
coo_name    object
coa_name    object
coo         object
coa         object
coo_iso     object
coa_iso     object
f_0_4        int64
f_5_11       int64
f_12_17      int64
f_18_59      int64
f_60         int64
f_other      int64
f_total      int64
m_0_4        int64
m_5_11       int64
m_12_17      int64
m_18_59      int64
m_60         int64
m_other      int64
m_total      int64
total        int64
dtype: object

### Extract and save relevant information from raw data using regular expressions (regex)

### Calculate Decade

In [408]:
# Calculate the decade
df['decade'] = (df['year'] // 10) * 10

# Verify unique decades
print("Unique decades in data:", df['decade'].unique())

Unique decades in data: [2000 2010 2020]


### Calculate Male-to-Female Ratio

In [409]:
# Replace zeros in f_total to prevent division by zero
df['f_total'].replace(0, np.nan, inplace=True)

# Calculate male-to-female ratio
df['male_female_ratio'] = df['m_total'] / df['f_total']

# Replace infinite values with NaN
df['male_female_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)

# Verify calculation
print(df[['year', 'f_total', 'm_total', 'male_female_ratio']].head())


   year  f_total  m_total  male_female_ratio
0  2001      NaN        0                NaN
1  2002      NaN        0                NaN
2  2003      NaN        0                NaN
3  2004      NaN        0                NaN
4  2005      NaN        0                NaN


### Calculate Origin-Destination Total

In [410]:
# Aggregate total refugees per origin-destination pair
df['origin_destination_total'] = df.groupby(['coo_name', 'coa_name'])['total'].transform('sum')

# Verify calculation
print(df[['coo_name', 'coa_name', 'origin_destination_total']].head())


      coo_name                  coa_name  origin_destination_total
0  Afghanistan  United States of America                    160536
1  Afghanistan  United States of America                    160536
2  Afghanistan  United States of America                    160536
3  Afghanistan  United States of America                    160536
4  Afghanistan  United States of America                    160536


### New Variables

In [411]:
# Display key variables to verify correctness
print(df[['year', 'decade', 'male_female_ratio', 'origin_destination_total']].head())

   year  decade  male_female_ratio  origin_destination_total
0  2001    2000                NaN                    160536
1  2002    2000                NaN                    160536
2  2003    2000                NaN                    160536
3  2004    2000                NaN                    160536
4  2005    2000                NaN                    160536


### Get data types of all variables including the new ones

In [412]:
df.dtypes

year                          int64
coo_id                        int64
coa_id                        int64
coo_name                     object
coa_name                     object
coo                          object
coa                          object
coo_iso                      object
coa_iso                      object
f_0_4                         int64
f_5_11                        int64
f_12_17                       int64
f_18_59                       int64
f_60                          int64
f_other                       int64
f_total                     float64
m_0_4                         int64
m_5_11                        int64
m_12_17                       int64
m_18_59                       int64
m_60                          int64
m_other                       int64
m_total                       int64
total                         int64
decade                        int64
male_female_ratio           float64
origin_destination_total      int64
dtype: object

### Count and identify missing values (if any)

In [413]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values
df[df.isna().any(axis=1)].head()

year                          0
coo_id                        0
coa_id                        0
coo_name                      0
coa_name                      0
coo                           0
coa                           0
coo_iso                       0
coa_iso                       0
f_0_4                         0
f_5_11                        0
f_12_17                       0
f_18_59                       0
f_60                          0
f_other                       0
f_total                     553
m_0_4                         0
m_5_11                        0
m_12_17                       0
m_18_59                       0
m_60                          0
m_other                       0
m_total                       0
total                         0
decade                        0
male_female_ratio           553
origin_destination_total      0
dtype: int64


Unnamed: 0,year,coo_id,coa_id,coo_name,coa_name,coo,coa,coo_iso,coa_iso,f_0_4,...,m_5_11,m_12_17,m_18_59,m_60,m_other,m_total,total,decade,male_female_ratio,origin_destination_total
0,2001,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,7756,2000,,160536
1,2002,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,9227,2000,,160536
2,2003,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,9821,2000,,160536
3,2004,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,9923,2000,,160536
4,2005,2,202,Afghanistan,United States of America,AFG,USA,AFG,USA,0,...,0,0,0,0,0,0,8931,2000,,160536


### Count and identify duplicated values (if any)

In [414]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values:
df[df.duplicated()].head()


0


Unnamed: 0,year,coo_id,coa_id,coo_name,coa_name,coo,coa,coo_iso,coa_iso,f_0_4,...,m_5_11,m_12_17,m_18_59,m_60,m_other,m_total,total,decade,male_female_ratio,origin_destination_total


### Save data to file

In [415]:
df.to_csv('prepared_refugee_data.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

## Combine, Organize and Enrich

To see whether higher GDP correlates with more refugees being accepted.
We see the coa country in the "prepared_refugee_data.csv" file and the year, then we take the same country name and same year and give the GDP of the country as well as how many refugees in that year were accepted.

In [416]:
import pandas as pd

# Read the datasets
refugee_df = pd.read_csv('prepared_refugee_data.csv', sep=',', encoding='utf-8')
gdp_df = pd.read_csv('gdp_data_1998_2024.csv', sep=',', encoding='utf-8')

# Rename 'coa_name' to 'country_name' in the refugee dataset for merging
refugee_df.rename(columns={'coa_name': 'country_name'}, inplace=True)


In [417]:
# Select relevant columns
refugee_df = refugee_df[['country_name', 'year', 'total']]
gdp_df = gdp_df[['country_name', 'year', 'gdp']]


In [418]:
# Select relevant columns
refugee_df = refugee_df[['country_name', 'year', 'total']]
gdp_df = gdp_df[['country_name', 'year', 'gdp']]

# Rename columns for clarity
refugee_df.rename(columns={'total': 'refugee_count'}, inplace=True)

In [419]:
# Merge datasets
combined_df = pd.merge(refugee_df, gdp_df, on=['country_name', 'year'], how='inner')

# Inspect the merged dataset
print(combined_df.head())

  country_name  year  refugee_count           gdp
0       France  2001           1284  1.370377e+12
1       France  2002           1256  1.492428e+12
2       France  2003           1167  1.835096e+12
3       France  2004           1220  2.109792e+12
4       France  2005           1228  2.192146e+12


In [420]:
# Reorder columns for clarity
combined_df = combined_df[['country_name', 'year', 'gdp', 'refugee_count']]

# Save the combined dataset
combined_df.to_csv('combined.csv', index=False)
print("✅ Combined dataset saved as 'combined.csv'")

✅ Combined dataset saved as 'combined.csv'


## Final Dataset

Cleaned, new variable added, prepared and enriched

In [421]:
# Load datasets
prepared_refugee_df = pd.read_csv('prepared_refugee_data.csv')
combined_df = pd.read_csv('combined.csv')

# Rename 'coa_name' to 'country_name' for alignment
prepared_refugee_df.rename(columns={'coa_name': 'country_name'}, inplace=True)

# Outer join to preserve all data
ultimate_df = pd.merge(prepared_refugee_df, combined_df, on=['year', 'country_name'], how='outer')

In [422]:
# Fill missing GDP values with 0 or a placeholder
ultimate_df['gdp'].fillna(0, inplace=True)

# Fill missing refugee counts with 0 or a placeholder
ultimate_df['refugee_count'].fillna(0, inplace=True)

# Optionally, drop rows with critical missing data
ultimate_df.dropna(subset=['country_name', 'year'], inplace=True)

In [423]:
# Remove duplicate rows if any
ultimate_df.drop_duplicates(inplace=True)

In [424]:
# Reorder columns for clarity
ultimate_df = ultimate_df[['year', 'country_name', 'gdp', 'refugee_count', 'coo_name', 'total', 'male_female_ratio']]

In [425]:
# Save the final dataset
ultimate_df.to_csv('refugee_data_and_gdp.csv', index=False)
print("✅ Dataset saved as 'refugee_data_and_gdp.csv'")

✅ Dataset saved as 'refugee_data_and_gdp.csv'
