In [4]:
# Data Preprocessing Script

import pandas as pd

In [6]:
# Load dataset
data = pd.read_csv('Global Missing Migrants Dataset.csv')

# Drop columns with excessive null values or irrelevant data
data = data.drop(columns=['Unnamed: 19', 'Migration route', 'Information Source'])

In [8]:
# Handle missing values
# Number of Dead is important, fill nulls with median as deaths can't reasonably be negative
data['Number of Dead'].fillna(data['Number of Dead'].median(), inplace=True)

# Region of Origin and Country of Origin: Fill missing values with 'Unknown'
data['Region of Origin'].fillna('Unknown', inplace=True)
data['Country of Origin'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Number of Dead'].fillna(data['Number of Dead'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Region of Origin'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [10]:
# Coordinates: Drop rows where coordinates are missing, essential for geospatial analysis
data.dropna(subset=['Coordinates'], inplace=True)

# Convert Coordinates into Latitude and Longitude
data[['Latitude', 'Longitude']] = data['Coordinates'].str.split(',', expand=True).astype(float)
data.drop(columns=['Coordinates'], inplace=True)

In [12]:
# Create a categorical feature for incident year
# Useful to capture any temporal patterns
data['Incident year'] = data['Incident year'].astype('category')

# Simplify Cause of Death: Group less common causes into 'Other'
cause_counts = data['Cause of Death'].value_counts()
less_common_causes = cause_counts[cause_counts < 100].index
data['Cause of Death'] = data['Cause of Death'].replace(less_common_causes, 'Other')

In [14]:
# Verify cleaned data
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 12984 entries, 0 to 13019
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   Incident Type                        12984 non-null  object  
 1   Incident year                        12984 non-null  category
 2   Reported Month                       12984 non-null  object  
 3   Region of Origin                     12984 non-null  object  
 4   Region of Incident                   12984 non-null  object  
 5   Country of Origin                    12984 non-null  object  
 6   Number of Dead                       12984 non-null  float64 
 7   Minimum Estimated Number of Missing  12984 non-null  int64   
 8   Total Number of Dead and Missing     12984 non-null  int64   
 9   Number of Survivors                  12984 non-null  int64   
 10  Number of Females                    12984 non-null  int64   
 11  Number of Males     