In [1]:
# Import Dependencies

import os
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
from config import conn

from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [4]:
# Create database connection
engine = create_engine(conn)
# pd.read_sql_table('table_name', 'postgres:///db_name')  


data_df = pd.read_sql_table("Trafficking_Cleaned", conn)
data_df

Unnamed: 0,yearOfRegistration,Datasource,gender,citizenship,isForcedLabour,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,2012,Case Management,Female,LK,1,Threats,Other,Forced Labor,Domestic Work,Age 30-38
1,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38
2,2012,Case Management,Female,LK,1,Threats,Other,Forced Labor,Domestic Work,Age 30-38
3,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38
4,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...,...,...,...
14294,2018,Hotline,Male,US,0,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14295,2018,Hotline,Male,US,0,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14296,2018,Hotline,Male,US,0,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14297,2018,Hotline,Male,US,0,Other,Family/Relative,Sexual Exploitation,Unknown,Age 9-17


In [12]:
# Further reduce the number of features that we're using. Really hone in on the question asked.
model_df = data_df.drop(["yearOfRegistration", "Datasource", "isForcedLabour"], axis=1)
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,Female,LK,Threats,Other,Forced Labor,Domestic Work,Age 30-38
1,Female,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
2,Female,LK,Threats,Other,Forced Labor,Domestic Work,Age 30-38
3,Female,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
4,Female,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,Male,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14295,Male,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14296,Male,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14297,Male,US,Other,Family/Relative,Sexual Exploitation,Unknown,Age 9-17


In [13]:
# Check data for invalid entries carried over from ETL/EDA
model_df['citizenship'].value_counts()

0     8353
US    3590
UA    1016
BY     232
MM     186
KG     157
KH     139
NG      98
HT      89
LK      73
ID      69
UG      66
TH      53
PH      38
KE      29
NP      27
UZ      25
CN      23
MX      19
KR      12
AF       4
ER       1
Name: citizenship, dtype: int64

In [14]:
# Change 0 to null so rows can be dropped
model_df['citizenship'].replace({'0': np.nan}, inplace=True)
model_df['citizenship'].value_counts()

US    3590
UA    1016
BY     232
MM     186
KG     157
KH     139
NG      98
HT      89
LK      73
ID      69
UG      66
TH      53
PH      38
KE      29
NP      27
UZ      25
CN      23
MX      19
KR      12
AF       4
ER       1
Name: citizenship, dtype: int64

In [15]:
# Drop rows with null values
model_df.dropna(inplace=True)
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,Female,LK,Threats,Other,Forced Labor,Domestic Work,Age 30-38
1,Female,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
2,Female,LK,Threats,Other,Forced Labor,Domestic Work,Age 30-38
3,Female,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
4,Female,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,Male,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14295,Male,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14296,Male,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14297,Male,US,Other,Family/Relative,Sexual Exploitation,Unknown,Age 9-17


In [16]:
# Gender transformation
gender_transform = {
    "Male": 0, 
    "Female": 1
}

# Encode labeled data so the model can interpret it correctly
model_df['gender'] = model_df['gender'].apply(lambda x: gender_transform[x])
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,LK,Threats,Other,Forced Labor,Domestic Work,Age 30-38
1,1,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
2,1,LK,Threats,Other,Forced Labor,Domestic Work,Age 30-38
3,1,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
4,1,LK,Financial,Other,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,0,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14295,0,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14296,0,US,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14297,0,US,Other,Family/Relative,Sexual Exploitation,Unknown,Age 9-17


In [17]:
# Country transformation
country_transform = {
    "US": 840, 
    "UA": 804,
    "BY": 112, 
    "MM": 104, 
    "KG": 417, 
    "KH": 116, 
    "NG": 566,
    "HT": 332,  
    "LK": 144, 
    "ID": 360, 
    "UG": 800, 
    "TH": 764, 
    "PH": 608,
    "KE": 404, 
    "NP": 524, 
    "UZ": 860, 
    "CN": 156, 
    "MX": 484, 
    "KR": 410, 
    "AF": 4, 
    "ER": 232
}

# Encode labeled data so the model can interpret it correctly
model_df['citizenship'] = model_df['citizenship'].apply(lambda x: country_transform[x])
model_df


Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,144,Threats,Other,Forced Labor,Domestic Work,Age 30-38
1,1,144,Financial,Other,Forced Labor,Domestic Work,Age 30-38
2,1,144,Threats,Other,Forced Labor,Domestic Work,Age 30-38
3,1,144,Financial,Other,Forced Labor,Domestic Work,Age 30-38
4,1,144,Financial,Other,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,0,840,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14295,0,840,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14296,0,840,Threats,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14297,0,840,Other,Family/Relative,Sexual Exploitation,Unknown,Age 9-17


In [18]:
# Transform the ControlCategory
control_transform = {
    "Financial": 1, 
    "Threats": 2, 
    "Survival": 3, 
    "Physical": 4, 
    "Other": 5
}

# Encode data so the model can interpret it correctly
model_df['ControlCategory'] = model_df['ControlCategory'].apply(lambda x: control_transform[x])
model_df


Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,144,2,Other,Forced Labor,Domestic Work,Age 30-38
1,1,144,1,Other,Forced Labor,Domestic Work,Age 30-38
2,1,144,2,Other,Forced Labor,Domestic Work,Age 30-38
3,1,144,1,Other,Forced Labor,Domestic Work,Age 30-38
4,1,144,1,Other,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,0,840,2,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14295,0,840,2,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14296,0,840,2,Family/Relative,Sexual Exploitation,Unknown,Age 9-17
14297,0,840,5,Family/Relative,Sexual Exploitation,Unknown,Age 9-17


In [19]:
# Transform the Recruiter Category
recruiter_transform = {
    "Not Specified": 1, 
    "Other": 2, 
    "Friend/Acquaintance": 3, 
    "Family/Relative": 4, 
    "Intimate Partner": 5
}

# Encode data so the model can interpret it correctly
model_df['RecruiterCategory'] = model_df['RecruiterCategory'].apply(lambda x: recruiter_transform[x])
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,144,2,2,Forced Labor,Domestic Work,Age 30-38
1,1,144,1,2,Forced Labor,Domestic Work,Age 30-38
2,1,144,2,2,Forced Labor,Domestic Work,Age 30-38
3,1,144,1,2,Forced Labor,Domestic Work,Age 30-38
4,1,144,1,2,Forced Labor,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,0,840,2,4,Sexual Exploitation,Unknown,Age 9-17
14295,0,840,2,4,Sexual Exploitation,Unknown,Age 9-17
14296,0,840,2,4,Sexual Exploitation,Unknown,Age 9-17
14297,0,840,5,4,Sexual Exploitation,Unknown,Age 9-17


In [20]:
# Transform the Exploit Type
exploit_transform = {
    "Sexual Exploitation": 1, 
    "Forced Labor": 2, 
    "Other": 3, 
    "Slavery and similar practices": 4, 
    "Forced Marriage": 5
}

# Encode data so the model can interpret it correctly
model_df['ExploitType'] = model_df['ExploitType'].apply(lambda x: exploit_transform[x])
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,144,2,2,2,Domestic Work,Age 30-38
1,1,144,1,2,2,Domestic Work,Age 30-38
2,1,144,2,2,2,Domestic Work,Age 30-38
3,1,144,1,2,2,Domestic Work,Age 30-38
4,1,144,1,2,2,Domestic Work,Age 30-38
...,...,...,...,...,...,...,...
14294,0,840,2,4,1,Unknown,Age 9-17
14295,0,840,2,4,1,Unknown,Age 9-17
14296,0,840,2,4,1,Unknown,Age 9-17
14297,0,840,5,4,1,Unknown,Age 9-17


In [23]:
model_df["Labor_Type"].unique()

array(['Domestic Work', 'Other', 'Unknown', 'Agriculture',
       'Manufacturing', 'Construction', 'Begging', 'Aquafarming'],
      dtype=object)

In [24]:
# Transform the Labor Type
labor_transform = {
    "Domestic Work": 1, 
    "Other": 2, 
    "Unknown": 3, 
    "Agriculture": 4, 
    "Manufacturing": 5, 
    "Construction": 6, 
    "Begging": 7, 
    "Aquafarming": 8
}

# Encode data so the model can interpret it correctly
model_df['Labor_Type'] = model_df['Labor_Type'].apply(lambda x: labor_transform[x])
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,144,2,2,2,1,Age 30-38
1,1,144,1,2,2,1,Age 30-38
2,1,144,2,2,2,1,Age 30-38
3,1,144,1,2,2,1,Age 30-38
4,1,144,1,2,2,1,Age 30-38
...,...,...,...,...,...,...,...
14294,0,840,2,4,1,3,Age 9-17
14295,0,840,2,4,1,3,Age 9-17
14296,0,840,2,4,1,3,Age 9-17
14297,0,840,5,4,1,3,Age 9-17


In [25]:
model_df["ageCategories"].unique()

array(['Age 30-38', 'Age 39-47', 'Age 9-17', 'Age 27-29', 'Age 48+',
       'Age 21-23', 'Age 24-26', 'Age 0-8', 'Age 18-20'], dtype=object)

In [26]:
# Transform Age Categories
age_transform = {
    "Age 0-8": 1, 
    "Age 9-17": 2, 
    "Age 18-20": 3, 
    "Age 21-23": 4, 
    "Age 24-26": 5, 
    "Age 27-29": 6, 
    "Age 30-38": 7, 
    "Age 39-47": 8, 
    "Age 48+": 9
}

# Encode data so the model can interpret it correctly
model_df['ageCategories'] = model_df['ageCategories'].apply(lambda x: age_transform[x])
model_df

Unnamed: 0,gender,citizenship,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,1,144,2,2,2,1,7
1,1,144,1,2,2,1,7
2,1,144,2,2,2,1,7
3,1,144,1,2,2,1,7
4,1,144,1,2,2,1,7
...,...,...,...,...,...,...,...
14294,0,840,2,4,1,3,2
14295,0,840,2,4,1,3,2
14296,0,840,2,4,1,3,2
14297,0,840,5,4,1,3,2


In [27]:
# Create our train and test splits
y = model_df['ExploitType']
X = model_df.drop(['ExploitType'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=1, stratify=y)

In [28]:
# Create a pipeline and scale the input
# Per documentation on SGDClassifier, independently scaling didn't seem to work
trafficking_model = make_pipeline(StandardScaler(), SGDClassifier(max_iter=500, tol=1e-3))
trafficking_model.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier(max_iter=500))])

In [29]:
# Compare our model's predictions to our known outcomes
y_pred = trafficking_model.predict(X_test)

# Code to create a DataFrame to compare our actual vs. predicted if we need to dive deeper...
# results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
# results

In [30]:
# Check accuracy
accuracy_score(y_test, y_pred)

0.9690652320107599

In [31]:
# Check performance
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.99      0.98      0.99       934
           2       0.93      0.98      0.96       527
           3       1.00      0.57      0.73         7
           5       1.00      0.16      0.27        19

    accuracy                           0.97      1487
   macro avg       0.98      0.67      0.74      1487
weighted avg       0.97      0.97      0.97      1487

