In [1]:
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
# Local sample dataset on separate branch
# trafficking_df = pd.read_csv(os.path.join("DataBase", "human_traffick_sample_dataset.csv"))

# Dataset uploaded to GitHub
trafficking_df = pd.read_csv("https://raw.githubusercontent.com/gh-mrmoore/AnalyticsFinalProject/MattMoore/Data/human_traffick_sample_dataset_mrm.csv")

trafficking_df.head()

Unnamed: 0,registration_year,datasource,gender,age_range,majority_status,majority_status_at_exploit,majority_entry,citizenship,country_of_exploitation,control_financial,control_threats,control_physical,control_limit_necessities,control_other,labor,victim_purpose,trafficker_relationship,was_trafficked
0,2017,Case Management,Female,0--8,Minor,Minor,-99,-99,KH,-99,-99,1,-99,1,0,-99,-99,1
1,2017,Case Management,Female,0--8,Minor,Minor,-99,-99,KH,-99,1,1,-99,1,-99,6,4,1
2,2017,Case Management,Female,18--20,Adult,Adult,-99,KH,CN,-99,1,1,-99,-99,-99,3,3,1
3,2017,Case Management,Female,18--20,Adult,-99,-99,UA,UA,-99,-99,-99,-99,1,-99,-99,4,1
4,2017,Case Management,Female,21--23,Adult,Minor,-99,VN,VN,-99,1,-99,-99,-99,3,6,-99,1


In [3]:
trafficking_df.replace("-99", np.nan, inplace=True)
trafficking_df.replace(-99, np.nan, inplace=True)
trafficking_df.head()

Unnamed: 0,registration_year,datasource,gender,age_range,majority_status,majority_status_at_exploit,majority_entry,citizenship,country_of_exploitation,control_financial,control_threats,control_physical,control_limit_necessities,control_other,labor,victim_purpose,trafficker_relationship,was_trafficked
0,2017,Case Management,Female,0--8,Minor,Minor,,,KH,,,1.0,,1.0,0.0,,,1
1,2017,Case Management,Female,0--8,Minor,Minor,,,KH,,1.0,1.0,,1.0,,6.0,4.0,1
2,2017,Case Management,Female,18--20,Adult,Adult,,KH,CN,,1.0,1.0,,,,3.0,3.0,1
3,2017,Case Management,Female,18--20,Adult,,,UA,UA,,,,,1.0,,,4.0,1
4,2017,Case Management,Female,21--23,Adult,Minor,,VN,VN,,1.0,,,,3.0,6.0,,1


In [4]:
# Categories
categories = trafficking_df.dtypes[trafficking_df.dtypes == "object"].index.tolist()
categories

['datasource',
 'gender',
 'age_range',
 'majority_status',
 'majority_status_at_exploit',
 'majority_entry',
 'citizenship',
 'country_of_exploitation']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(trafficking_df[categories]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(categories)
encode_df.head()

Unnamed: 0,datasource_Case Management,datasource_Hotline,gender_Female,age_range_0--8,age_range_18--20,age_range_21--23,age_range_24--26,age_range_27--29,age_range_30--38,age_range_9--17,...,country_of_exploitation_BD,country_of_exploitation_CN,country_of_exploitation_IN,country_of_exploitation_KH,country_of_exploitation_MD,country_of_exploitation_MY,country_of_exploitation_SA,country_of_exploitation_UA,country_of_exploitation_US,country_of_exploitation_VN
0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals
trafficking_df = trafficking_df.merge(encode_df, left_index=True, right_index=True)
trafficking_df = trafficking_df.drop(categories,1)
trafficking_df.head()

Unnamed: 0,registration_year,control_financial,control_threats,control_physical,control_limit_necessities,control_other,labor,victim_purpose,trafficker_relationship,was_trafficked,...,country_of_exploitation_BD,country_of_exploitation_CN,country_of_exploitation_IN,country_of_exploitation_KH,country_of_exploitation_MD,country_of_exploitation_MY,country_of_exploitation_SA,country_of_exploitation_UA,country_of_exploitation_US,country_of_exploitation_VN
0,2017,,,1.0,,1.0,0.0,,,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2017,,1.0,1.0,,1.0,,6.0,4.0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017,,1.0,1.0,,,,3.0,3.0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017,,,,,1.0,,,4.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2017,,1.0,,,,3.0,6.0,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Skip binning and encoding for starters and see what happens
y = trafficking_df["was_trafficked"].values
X = trafficking_df.drop(columns=["was_trafficked"])

# Split for training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 52)

In [8]:
X_train

Unnamed: 0,registration_year,control_financial,control_threats,control_physical,control_limit_necessities,control_other,labor,victim_purpose,trafficker_relationship,datasource_Case Management,...,country_of_exploitation_BD,country_of_exploitation_CN,country_of_exploitation_IN,country_of_exploitation_KH,country_of_exploitation_MD,country_of_exploitation_MY,country_of_exploitation_SA,country_of_exploitation_UA,country_of_exploitation_US,country_of_exploitation_VN
205,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
177,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,2018,,1.0,,,1.0,,1.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
179,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,2018,,,,,,,,,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2018,,,,,,,,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
151,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13,2017,,,1.0,,,,3.0,2.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2017,,,,,,,3.0,,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Define deep neural net model
input_feature_number = len(X_train["registration_year"])
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 50

nn = tf.keras.models.Sequential()

# 1st hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=44, input_shape=(44,), activation="relu"))

# 2nd hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Summary check
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               4500      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 9,601
Trainable params: 9,601
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Checkpoints
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/Project_Checkpoints.hdf5"

In [11]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq='epoch')

# Train the model
fit_model = nn.fit(X_train,y_train,epochs=50,callbacks=[cp_callback])

Epoch 1/50

Epoch 00001: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 2/50

Epoch 00002: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 3/50

Epoch 00003: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 4/50

Epoch 00004: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 5/50

Epoch 00005: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 6/50

Epoch 00006: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 7/50

Epoch 00007: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 8/50

Epoch 00008: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 9/50

Epoch 00009: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 10/50

Epoch 00010: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 11/50

Epoch 00011: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 12/50

Epoch 00012: saving model to checkpoints\Project_Checkpoints.hdf5
Epoch 13/50

Epoch 00013: saving model to checkpoints\Project