# Classification Analysis
This notebook contains work done for classification analysis on the Mozambique
dataset from IPUMS.

## Load Dependencies

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from src.utils.ipums_extract import load_ipums_from_pkl
from src.models.train import train_sklearn_model
from src.models.eval import evaluate_sklearn_model
from src.models.neural_net import (
    train_neural_net,
    evaluate_neural_net
)

## Load Data

In [2]:
# Pickled IPUMS extract, resolved relative to the current working directory.
PKL_PATH = Path("data/mozambique.pkl")

# The loader hands back two frames, one per migration response variable.
# NOTE(review): presumably this unpickles the file; only load extracts from a
# trusted source -- confirm in src.utils.ipums_extract.
mig1_df, mig5_df = load_ipums_from_pkl(PKL_PATH)

# Report (rows, columns) for each frame as a quick sanity check.
for frame in (mig1_df, mig5_df):
    print(frame.shape)

(5929529, 66)
(4974569, 66)


## Compute Class Distribution

In the cells below, we examine the class distribution for our two response
variables. From these results, we can see that we have an extremely imbalanced
dataset: only about 1.2% of the MIGRATE1 samples and about 4.6% of the MIGRATE5
samples belong to the minority class (label 1). Therefore, for the models
implemented in Scikit-Learn we will use class weights to automatically balance
the classes by assigning lower weights to majority class samples and higher
weights to minority class samples. For the neural network, we will employ a
weighted loss function to accomplish, in principle, the same goal, together
with dropout layers to limit overfitting. The next cell lists papers where
these techniques are discussed.

"A systematic study of the class imbalance problem in convolutional neural
networks" by Buda et al. (2018)

"Focal Loss for Dense Object Detection" by Lin et al. (2017)

"Learning from Imbalanced Data" by He and Garcia (2009)

"Dropout: A Simple Way to Prevent Neural Networks from Overfitting" by
Srivastava et al. (2014)

"Deep Learning for Imbalanced Multimedia Data Classification" by Yan et al.
(2015)

"Class-Balanced Loss Based on Effective Number of Samples" by Cui et al. (2019)

In [28]:
# Tally the samples per MIGRATE1 label to expose the class imbalance.
migrate1_counts = mig1_df['MIGRATE1'].value_counts()
print(migrate1_counts)

MIGRATE1
0    5860462
1      69067
Name: count, dtype: int64


In [29]:
# Tally the samples per MIGRATE5 label to expose the class imbalance.
migrate5_counts = mig5_df['MIGRATE5'].value_counts()
print(migrate5_counts)

MIGRATE5
0    4746396
1     228173
Name: count, dtype: int64


## Create Development Splits (Train/Val/Test)

In [18]:
# Split Hyperparameters
# SEED is passed as `random_state` to the splits and the scikit-learn grids.
SEED = 5523
# The three ratios are meant to partition the dataset: whatever is not held
# out for validation or testing is used for training.
VAL_RATIO = 0.15
TEST_RATIO = 0.15
TRAIN_RATIO = 1 - VAL_RATIO - TEST_RATIO

# Guard against hold-out ratios that leave no training data.
assert 0 < TRAIN_RATIO < 1, "VAL_RATIO + TEST_RATIO must be in (0, 1)"
# Compare with a tolerance: exact `==` on a float sum can fail for ratios that
# are not exactly representable in binary floating point.
assert np.isclose(TRAIN_RATIO + VAL_RATIO + TEST_RATIO, 1.0), \
    "train/val/test ratios must sum to 1"

In [None]:
# MIG1
# Features are every column except the MIGRATE1 response; `drop` returns a new
# frame, so mig1_df itself is left untouched.
# NOTE(review): mig1_df and mig5_df report the same column count -- confirm no
# other migration response (e.g. MIGRATE5) remains in X1 as a leaking feature.
X1 = mig1_df.drop(columns=['MIGRATE1'])
y1 = mig1_df['MIGRATE1'].values

# Step 1: hold out TEST_RATIO of all samples as the test set, stratified on the
# label so the rare positive class keeps its share.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1,
    test_size=TEST_RATIO,
    random_state=SEED,
    stratify=y1,
)

# Step 2: carve the validation set out of the remainder. The remainder is only
# (1 - TEST_RATIO) of the data, so VAL_RATIO is rescaled; passing VAL_RATIO
# directly would yield 0.15 * 0.85 = 12.75% validation and 72.25% training
# instead of the TRAIN/VAL/TEST ratios declared above.
X1_train, X1_val, y1_train, y1_val = train_test_split(
    X1_train, y1_train,
    test_size=VAL_RATIO / (1 - TEST_RATIO),
    random_state=SEED,
    stratify=y1_train,
)

In [27]:
# MIG5
# Features are every column except the MIGRATE5 response; `drop` returns a new
# frame, so mig5_df itself is left untouched.
# NOTE(review): mig1_df and mig5_df report the same column count -- confirm no
# other migration response (e.g. MIGRATE1) remains in X5 as a leaking feature.
X5 = mig5_df.drop(columns=['MIGRATE5'])
y5 = mig5_df['MIGRATE5'].values

# Step 1: hold out TEST_RATIO of all samples as the test set, stratified on the
# label so the rare positive class keeps its share.
X5_train, X5_test, y5_train, y5_test = train_test_split(
    X5, y5,
    test_size=TEST_RATIO,
    random_state=SEED,
    stratify=y5,
)

# Step 2: carve the validation set out of the remainder. The remainder is only
# (1 - TEST_RATIO) of the data, so VAL_RATIO is rescaled; passing VAL_RATIO
# directly would yield 0.15 * 0.85 = 12.75% validation and 72.25% training
# instead of the TRAIN/VAL/TEST ratios declared above.
X5_train, X5_val, y5_train, y5_val = train_test_split(
    X5_train, y5_train,
    test_size=VAL_RATIO / (1 - TEST_RATIO),
    random_state=SEED,
    stratify=y5_train,
)

## Random Forest
In this section, we apply a Random Forest implemented in Scikit-Learn to
classify census samples' migration status.

In [None]:
# Initialize Random Forest Classifiers
# One untuned estimator per response variable (MIGRATE1, MIGRATE5); the
# hyperparameters are supplied later through the parameter grid.
forest1, forest5 = RandomForestClassifier(), RandomForestClassifier()

In [None]:
# Define Parameter Grid for Random Forest
# Hyperparameters with more than one candidate value.
forest_searched = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
}
# Hyperparameters pinned to a single value; 'balanced' class weights address
# the class imbalance and SEED keeps the forests reproducible.
forest_fixed = {
    'max_features': ['sqrt'],
    'class_weight': ['balanced'],
    'bootstrap': [True],
    'random_state': [SEED],
}
# Merge in the original key order (searched entries first, then fixed ones).
param_grid = {**forest_searched, **forest_fixed}

In [None]:
# Train and Evaluate for MIG1
# Uses whichever `param_grid` is currently bound, so run this directly after
# the Random Forest grid cell (the SVC section later rebinds that name).
# NOTE(review): presumably the helper tunes on the train split and scores on
# the validation split -- confirm in src.models.train.
forest1, forest1_results = train_sklearn_model(
    forest1,
    param_grid,
    X1_train, y1_train,
    X1_val, y1_val,
)

# Score the returned model on the held-out MIG1 test split.
forest1_test_results = evaluate_sklearn_model(
    forest1,
    X1_test, y1_test,
)
print(f"\n{forest1_test_results}")

In [None]:
# Train and Evaluate for MIG5
# Uses whichever `param_grid` is currently bound, so run this directly after
# the Random Forest grid cell (the SVC section later rebinds that name).
# NOTE(review): presumably the helper tunes on the train split and scores on
# the validation split -- confirm in src.models.train.
forest5, forest5_results = train_sklearn_model(
    forest5,
    param_grid,
    X5_train, y5_train,
    X5_val, y5_val,
)

# Score the returned model on the held-out MIG5 test split.
forest5_test_results = evaluate_sklearn_model(
    forest5,
    X5_test, y5_test,
)
print(f"\n{forest5_test_results}")

## Support Vector Machine (SVM)
In this section, we apply an SVM model implemented in Scikit-Learn to classify
census samples' migration status.

In [None]:
# Initialize SVC
# One untuned estimator per response variable (MIGRATE1, MIGRATE5).
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples, and the frames loaded above have
# millions of rows -- confirm training is feasible or that the training helper
# subsamples.
svc1, svc5 = SVC(), SVC()

In [None]:
# Define Parameter Grid for SVC
# NOTE: this rebinds `param_grid`, replacing the Random Forest grid defined
# earlier; re-running the Random Forest training cells after this cell would
# pick up the SVC grid instead.
# Hyperparameters with more than one candidate value.
svc_searched = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
}
# Hyperparameters pinned to a single value; 'balanced' class weights address
# the class imbalance.
svc_fixed = {
    'gamma': ['scale'],
    'class_weight': ['balanced'],
    'random_state': [SEED],
}
# Merge in the original key order (searched entries first, then fixed ones).
param_grid = {**svc_searched, **svc_fixed}

In [None]:
# Train and Evaluate for MIG1
# Uses whichever `param_grid` is currently bound, so run this directly after
# the SVC grid cell.
# NOTE(review): presumably the helper tunes on the train split and scores on
# the validation split -- confirm in src.models.train.
svc1, svc1_results = train_sklearn_model(
    svc1,
    param_grid,
    X1_train, y1_train,
    X1_val, y1_val,
)

# Score the returned model on the held-out MIG1 test split.
svc1_test_results = evaluate_sklearn_model(
    svc1,
    X1_test, y1_test,
)
print(f"\n{svc1_test_results}")

In [None]:
# Train and Evaluate for MIG5
# Uses whichever `param_grid` is currently bound, so run this directly after
# the SVC grid cell.
# NOTE(review): presumably the helper tunes on the train split and scores on
# the validation split -- confirm in src.models.train.
svc5, svc5_results = train_sklearn_model(
    svc5,
    param_grid,
    X5_train, y5_train,
    X5_val, y5_val,
)

# Score the returned model on the held-out MIG5 test split.
svc5_test_results = evaluate_sklearn_model(
    svc5,
    X5_test, y5_test,
)
print(f"\n{svc5_test_results}")

## Neural Network
In this section, we apply an artificial neural network implemented in PyTorch to
classify census samples' migration status.

In [None]:
# Train and Evaluate for MIG1
# Training hyperparameters, forwarded to the trainer as keyword arguments.
nn1_hparams = dict(
    n_epochs=50,
    batch_size=256,
    learning_rate=0.001,
)

# Fit on the MIG1 train split, passing the validation split alongside it; the
# trainer returns the network together with a results object.
nn1, nn1_results = train_neural_net(
    X1_train, y1_train,
    X1_val, y1_val,
    **nn1_hparams,
)

# Score the trained network on the held-out MIG1 test split.
nn1_test_results = evaluate_neural_net(
    nn1,
    X1_test, y1_test,
)
print(f"\n{nn1_test_results}")

In [None]:
# Train and Evaluate for MIG5
# Training hyperparameters, forwarded to the trainer as keyword arguments.
nn5_hparams = dict(
    n_epochs=50,
    batch_size=256,
    learning_rate=0.001,
)

# Fit on the MIG5 train split, passing the validation split alongside it; the
# trainer returns the network together with a results object.
nn5, nn5_results = train_neural_net(
    X5_train, y5_train,
    X5_val, y5_val,
    **nn5_hparams,
)

# Score the trained network on the held-out MIG5 test split.
nn5_test_results = evaluate_neural_net(
    nn5,
    X5_test, y5_test,
)
print(f"\n{nn5_test_results}")