"Which countries are likely to experience high refugee counts based on GDP?"

## Libraries and settings

In [2]:
# Libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image

from sklearn import tree
from sklearn.metrics import RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

# Show version of scikit-learn
import sklearn
sklearn.__version__

/workspaces/DataAnalyticsLN/Classification as Modelling Method


'1.6.0'

## Import data

In [3]:
#Read and select variables
df_refugees_orig = pd.read_csv("refugee_data_and_gdp.csv",sep=',',encoding="utf-8")
#Number of rows and columns
print(df_refugees_orig.shape)

#first 5 rows
df_refugees_orig.head(5)

(2160, 7)


Unnamed: 0,year,country_name,gdp,total_refugees,coo_name,total,male_female_ratio
0,2001,France,1370377000000.0,6036.0,Afghanistan,1284.0,1.221453
1,2001,France,1370377000000.0,6036.0,Syrian Arab Rep.,192.0,2.918367
2,2001,France,1370377000000.0,6036.0,Iran (Islamic Rep. of),1987.0,1.806497
3,2001,France,1370377000000.0,6036.0,Iraq,1629.0,1.291139
4,2001,France,1370377000000.0,6036.0,Ukraine,407.0,1.014851


In [4]:
# Check the data information and summary statistics
print(df_refugees_orig.info())
print(df_refugees_orig.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               2160 non-null   int64  
 1   country_name       2160 non-null   object 
 2   gdp                2160 non-null   float64
 3   total_refugees     2160 non-null   float64
 4   coo_name           2114 non-null   object 
 5   total              2114 non-null   float64
 6   male_female_ratio  883 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 118.3+ KB
None
              year           gdp  total_refugees         total  \
count  2160.000000  2.160000e+03    2.160000e+03  2.114000e+03   
mean   2012.158796  1.213910e+12    1.883347e+05  2.893509e+04   
std       6.576959  2.085962e+12    6.145762e+05  2.281973e+05   
min    2001.000000  0.000000e+00    0.000000e+00  5.000000e+00   
25%    2007.000000  0.000000e+00    8.340000e+02  1.860000e+02 

### Variable description

### Count and remove missing values

In [5]:
# Step 3: Data Cleaning and Feature Engineering
# Drop any irrelevant or null columns
df = df_refugees_orig.dropna()  # Drop rows with missing values

# Total number of records
total_records = len(df_refugees_orig)

# Display total number of records
print(f"Total number of records: {total_records}")

# Count missing values
print(df_refugees_orig.isna().sum())

# Remove missing values
df_refugees_orig = df_refugees_orig.dropna(subset=['year', 'country_name', 'gdp', 'total_refugees', 'coo_name', 'total','male_female_ratio'])

Total number of records: 2160
year                    0
country_name            0
gdp                     0
total_refugees          0
coo_name               46
total                  46
male_female_ratio    1277
dtype: int64


In [6]:
# Total number of records
total_records = len(df_refugees_orig)

# Display total number of records
print(f"Total number of records after cleaning: {total_records}")

# Count after remove missing values
print(df_refugees_orig.isna().sum())

Total number of records after cleaning: 883
year                 0
country_name         0
gdp                  0
total_refugees       0
coo_name             0
total                0
male_female_ratio    0
dtype: int64


### Transform nominal variable to matrix with 0/1 values

In [12]:
# Categorize total_refugees into Low, Medium, High
bins = [0, 5000, 20000, float('inf')]
labels = ['Low', 'Medium', 'High']
df['refugee_category'] = pd.cut(df['total_refugees'], bins=bins, labels=labels)


### Barchart refugee count by country of origin

In [8]:
# Create a pivot table
table = df_refugees_orig[['total_refugees','coo']].pivot_table(index='total_refugees', 
                                        columns=['coo'], 
                                        aggfunc=len)

# Plot a stacked bar chart
table.plot(kind='bar', 
           stacked=True, 
           ylabel='Counts', 
           xlabel='Years',
           title='Survival Status Count by Gender', 
           rot=0,
           figsize=(6,4))

plt.show()

KeyError: "['coo'] not in index"

### Pivot table

In [25]:
# Using pivot_table to reshape the data and calculate means
pivot_table = pd.pivot_table(
    df_refugees_orig[['year', 'country_name', 'gdp', 'total_refugees', 'coo_name']],
    index=['country_name'],  # Rows
    values=['gdp', 'total_refugees'],  # Columns to aggregate
    aggfunc=np.mean  # Aggregation function
).round(0)

# Display the pivot table
print(pivot_table)


                                                             gdp  \
country_name                                                       
France                                              2.508484e+12   
Germany                                             3.866673e+12   
Greece                                              2.141001e+11   
Italy                                               2.183080e+12   
Portugal                                            2.641472e+11   
Spain                                               1.438017e+12   
Sweden                                              5.760937e+11   
Switzerland                                         6.056490e+11   
Türkiye                                             0.000000e+00   
United Kingdom of Great Britain and Northern Ir...  0.000000e+00   
United States of America                            0.000000e+00   

                                                    total_refugees  
country_name                                  

### Transform nominal variable to matrix with 0/1  values

### Create binary variable xxx

## Classification Tree

### Create train and test samples (train 80%, test 20% of the data)

### Fit the classification tree model and make predictions

### Show confusion matrix and classification report

### Print text representation of the classification tree

### Visualize the classification tree

## Random Forest Classifier

### Create train and test samples (train 80%, test 20% of the data)

### Fit the Random Forest Classifier

### Show confusion matrix and classification report

### Show feature importance

### ROC curve and AUC