In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

In [2]:
try:
    data = pd.read_csv('to_model.csv')

except FileNotFoundError:
    url = 'https://raw.githubusercontent.com/deividvalerius/Multidimensional-Poverty-Predictor/master/Data/to_model.csv'
    data = pd.read_csv(url)

In [3]:
data.head()

Unnamed: 0,civil_status_married,civil_status_never_married,civil_status_separated,region_basque_country,region_castile–la_mancha,region_andalusia,region_castile_and_leon,region_cantabria,population_density_thinly-populated_area,citizenship_spain,...,occupation_technicians_and_associate_professionals,occupation_non-defined,bad_health_yes,bad_health_no,age,years_worked,hours_week_worked,adjusted_income,proportion_social_welfare,material_deprivation
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.764706,0.723077,0.0,0.222241,0.0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.735294,0.030769,0.0,0.222241,0.0,0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.794118,0.307692,0.0,0.258452,0.0,0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.617647,0.538462,0.0,0.258452,0.0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.529412,0.461538,0.0,0.124813,0.0,0


In [4]:
X = data.drop(['material_deprivation'], axis=1)
y = data.material_deprivation

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
#Imbalanced data

sgd = SGDClassifier()

sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8794
           1       1.00      0.00      0.00       392

    accuracy                           0.96      9186
   macro avg       0.98      0.50      0.49      9186
weighted avg       0.96      0.96      0.94      9186



In [6]:
#Balanced data - Undersample

undersampler = RandomUnderSampler(sampling_strategy='majority')
X_train_us, y_train_us = undersampler.fit_resample(X_train, y_train)

In [7]:
sgd = SGDClassifier()

sgd.fit(X_train_us, y_train_us)
y_pred = sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.84      0.91      8794
           1       0.17      0.71      0.27       392

    accuracy                           0.84      9186
   macro avg       0.58      0.78      0.59      9186
weighted avg       0.95      0.84      0.88      9186



In [8]:
#Balanced data - Oversample

oversampler = RandomOverSampler(sampling_strategy='minority')
X_train_os, y_train_os = oversampler.fit_resample(X_train, y_train)

In [9]:
sgd = SGDClassifier()

sgd.fit(X_train_os, y_train_os)
y_pred = sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.81      0.89      8794
           1       0.15      0.79      0.26       392

    accuracy                           0.81      9186
   macro avg       0.57      0.80      0.57      9186
weighted avg       0.95      0.81      0.86      9186



In [10]:
#Balanced data - SMOTE

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [12]:
sgd = SGDClassifier()

sgd.fit(X_train_smote, y_train_smote)
y_pred = sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.77      0.87      8794
           1       0.14      0.83      0.24       392

    accuracy                           0.77      9186
   macro avg       0.56      0.80      0.55      9186
weighted avg       0.95      0.77      0.84      9186

