In [2]:
# Import Dependencies

import os
import pandas as pd
import numpy as np
# from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [3]:
# Load the WEO unemployment table, using the ISO column as the row index
unemployment_csv = os.path.join("../../../Data", "WEO_Unemployment.csv")
unemployment_df = pd.read_csv(unemployment_csv, index_col=["ISO"])
unemployment_df

Unnamed: 0_level_0,WEO Subject Code,Country,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AFG,LUR,Afghanistan,,,,,,,,,,,,,,,,,
ALB,LUR,Albania,15.751,15.000,14.400,14.100,13.800,13.400,13.100,13.800,14.000,14.000,13.400,15.900,17.5,17.100,15.200,13.700,12.300
DZA,LUR,Algeria,25.664,23.716,17.656,15.265,12.512,13.793,11.343,10.167,9.961,9.971,10.969,9.829,10.6,11.214,10.498,11.709,11.731
AGO,LUR,Angola,,,,,,,,,,,,,,,,,
ATG,LUR,Antigua and Barbuda,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VNM,LUR,Vietnam,6.010,5.780,5.600,5.310,4.820,4.640,2.380,2.900,2.880,2.220,1.960,2.180,2.1,2.330,2.300,2.240,2.190
WBG,LUR,West Bank and Gaza,31.200,25.500,26.800,23.500,23.700,21.700,26.600,24.500,23.700,20.900,23.000,23.400,26.9,25.900,26.900,25.450,26.250
YEM,LUR,Yemen,,,,,,,,,,,,,,,,,
ZMB,LUR,Zambia,,,,,,,,,,,,,,,,,


In [4]:
# Eliminate unemployment data features we won't need, leaving only the year columns.
# The result is rebound rather than dropped in place, and errors="ignore" is passed,
# so re-running this cell after the columns are already gone is a no-op instead of
# raising KeyError.
unemployment_df = unemployment_df.drop(
    columns=["WEO Subject Code", "Country"], errors="ignore"
)
unemployment_df

Unnamed: 0_level_0,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AFG,,,,,,,,,,,,,,,,,
ALB,15.751,15.000,14.400,14.100,13.800,13.400,13.100,13.800,14.000,14.000,13.400,15.900,17.5,17.100,15.200,13.700,12.300
DZA,25.664,23.716,17.656,15.265,12.512,13.793,11.343,10.167,9.961,9.971,10.969,9.829,10.6,11.214,10.498,11.709,11.731
AGO,,,,,,,,,,,,,,,,,
ATG,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VNM,6.010,5.780,5.600,5.310,4.820,4.640,2.380,2.900,2.880,2.220,1.960,2.180,2.1,2.330,2.300,2.240,2.190
WBG,31.200,25.500,26.800,23.500,23.700,21.700,26.600,24.500,23.700,20.900,23.000,23.400,26.9,25.900,26.900,25.450,26.250
YEM,,,,,,,,,,,,,,,,,
ZMB,,,,,,,,,,,,,,,,,


In [5]:
# Spot-check a single value: row 'ALB', column '2002' (the year column labels are strings).
unemployment_df.loc['ALB', '2002']

15.751

In [6]:
# Put average unemployment into its own DataFrame.
# mean(axis=1) averages across the columns of each row; pandas skips NaN by default,
# so a row only comes out as NaN when it has no values at all.
avg_ue = unemployment_df.mean(axis=1)

# to_frame() names the single column directly and keeps the Series index, so the
# result does not depend on the Series being unnamed (the DataFrame constructor with
# columns=[...] would produce an all-NaN column for a Series carrying another name).
avg_ue_df = avg_ue.to_frame(name="avg_ue")
avg_ue_df

Unnamed: 0_level_0,avg_ue
ISO,Unnamed: 1_level_1
AFG,
ALB,14.497118
DZA,13.329294
AGO,
ATG,
...,...
VNM,3.402353
WBG,25.052941
YEM,
ZMB,


In [7]:
# Load the WEO GDP table, using the ISO column as the row index
gdp_csv = os.path.join("../../../Data", "WEO_GDP.csv")
gdp_df = pd.read_csv(gdp_csv, index_col=["ISO"])
gdp_df.head()

Unnamed: 0_level_0,WEO Subject Code,Country,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AFG,NGDPDPC,Afghanistan,233.433,233.755,254.259,294.396,320.674,381.502,447.746,511.374,631.49,714.7,784.611,754.402,747.622,711.337,616.234,636.693,582.323
ALB,NGDPDPC,Albania,1425.13,1846.12,2373.58,2673.77,2972.75,3595.05,4370.56,4114.09,4097.83,4439.89,4248.91,4415.6,4584.92,3953.61,4124.41,4542.76,5284.44
DZA,NGDPDPC,Algeria,1806.86,2128.39,2636.31,3141.03,3508.96,3986.56,4943.5,3886.06,4480.72,5453.89,5575.65,5477.06,5466.33,4153.32,3918.94,4079.65,4118.74
AGO,NGDPDPC,Angola,841.42,951.969,1222.04,1862.42,2561.86,3099.09,4081.69,3146.8,3641.44,4716.25,5245.02,5436.52,5625.74,4354.92,3676.83,4303.69,3620.59
ATG,NGDPDPC,Antigua and Barbuda,10482.01,10892.4,11581.32,12736.24,14262.1,15992.74,16510.08,14612.9,13564.68,13295.36,13811.08,13392.22,13951.99,14696.89,15556.01,15655.23,16860.96


In [8]:
# Eliminate GDP features we won't need, leaving only the year columns.
# The result is rebound rather than dropped in place, and errors="ignore" is passed,
# so re-running this cell after the columns are already gone is a no-op instead of
# raising KeyError.
gdp_df = gdp_df.drop(columns=["WEO Subject Code", "Country"], errors="ignore")
gdp_df

Unnamed: 0_level_0,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AFG,233.433,233.755,254.259,294.396,320.674,381.502,447.746,511.374,631.490,714.70,784.611,754.402,747.622,711.337,616.234,636.693,582.323
ALB,1425.130,1846.120,2373.580,2673.770,2972.750,3595.050,4370.560,4114.090,4097.830,4439.89,4248.910,4415.600,4584.920,3953.610,4124.410,4542.760,5284.440
DZA,1806.860,2128.390,2636.310,3141.030,3508.960,3986.560,4943.500,3886.060,4480.720,5453.89,5575.650,5477.060,5466.330,4153.320,3918.940,4079.650,4118.740
AGO,841.420,951.969,1222.040,1862.420,2561.860,3099.090,4081.690,3146.800,3641.440,4716.25,5245.020,5436.520,5625.740,4354.920,3676.830,4303.690,3620.590
ATG,10482.010,10892.400,11581.320,12736.240,14262.100,15992.740,16510.080,14612.900,13564.680,13295.36,13811.080,13392.220,13951.990,14696.890,15556.010,15655.230,16860.960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VNM,546.555,610.357,756.981,873.136,996.255,1152.300,1446.770,1481.810,1628.520,1950.46,2197.610,2370.090,2566.910,2582.390,2720.190,2957.930,3211.460
WBG,1102.500,1197.160,1350.910,1461.090,1480.700,1563.700,1913.320,2061.560,2406.260,2711.92,2888.600,3122.990,3158.600,3084.130,3325.850,3407.310,3353.230
YEM,560.031,597.771,682.148,797.704,881.648,971.326,1171.170,1060.930,1266.790,1302.30,1367.720,1515.950,1574.250,1500.690,1061.840,891.888,762.136
ZMB,376.468,429.010,530.540,691.554,1030.690,1103.690,1365.930,1135.010,1456.050,1635.17,1724.630,1839.330,1726.650,1310.460,1254.120,1500.960,1480.420


In [9]:
# NOTE(review): every line in this cell is commented out, so nothing here executes.
# It looks like an abandoned attempt to transpose the unemployment table — consider
# deleting the cell before sharing the notebook.
# transpose_ue_df = unemployment_df.T
# transpose_ue_df.reset_index()
# transpose_ue_df.set_index(unemployment_df.columns)
# transpose_ue_df

In [10]:
# Load the trafficking case records exported by an earlier step
trafficking_csv = os.path.join("../../../Exports", "eda5b.csv")
trafficking_df = pd.read_csv(trafficking_csv)
trafficking_df.head()

Unnamed: 0,yearOfRegistration,Datasource,gender,citizenship,isForcedLabour,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories
0,2012,Case Management,Female,LK,1,Threats,Other,Forced Labor,Domestic Work,Age 30-38
1,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38
2,2012,Case Management,Female,LK,1,Threats,Other,Forced Labor,Domestic Work,Age 30-38
3,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38
4,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38


In [11]:
# Dimensions of the trafficking data as (rows, columns).
trafficking_df.shape

(14299, 10)

In [12]:
# Load the country lookup table (name, ISO2, ISO3, UN code)
# NOTE(review): with pandas' default NA parsing, an ISO2 value of "NA" (Namibia)
# would be read as missing — confirm against countries.csv before relying on it.
countries_csv = os.path.join("../../../Data", "countries.csv")
countries_df = pd.read_csv(countries_csv)
countries_df.head()

Unnamed: 0,country_name,country_iso2,country_iso3,country_un_code
0,Afghanistan,AF,AFG,4
1,Aland Islands,AX,ALA,248
2,Albania,AL,ALB,8
3,Algeria,DZ,DZA,12
4,American Samoa,AS,ASM,16


In [13]:
# Attach country details to every case by matching the two-letter citizenship code.
# A left join keeps all trafficking rows, including those with no matching country.
trafficking_merge_df1 = pd.merge(
    trafficking_df,
    countries_df,
    how="left",
    left_on="citizenship",
    right_on="country_iso2",
)
trafficking_merge_df1

Unnamed: 0,yearOfRegistration,Datasource,gender,citizenship,isForcedLabour,ControlCategory,RecruiterCategory,ExploitType,Labor_Type,ageCategories,country_name,country_iso2,country_iso3,country_un_code
0,2012,Case Management,Female,LK,1,Threats,Other,Forced Labor,Domestic Work,Age 30-38,Sri Lanka,LK,LKA,144.0
1,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38,Sri Lanka,LK,LKA,144.0
2,2012,Case Management,Female,LK,1,Threats,Other,Forced Labor,Domestic Work,Age 30-38,Sri Lanka,LK,LKA,144.0
3,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38,Sri Lanka,LK,LKA,144.0
4,2012,Case Management,Female,LK,1,Financial,Other,Forced Labor,Domestic Work,Age 30-38,Sri Lanka,LK,LKA,144.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14294,2018,Hotline,Male,US,0,Threats,Family/Relative,Sexual Exploitation,12,Age 9-17,United States of America,US,USA,840.0
14295,2018,Hotline,Male,US,0,Threats,Family/Relative,Sexual Exploitation,12,Age 9-17,United States of America,US,USA,840.0
14296,2018,Hotline,Male,US,0,Threats,Family/Relative,Sexual Exploitation,12,Age 9-17,United States of America,US,USA,840.0
14297,2018,Hotline,Male,US,0,Other,Family/Relative,Sexual Exploitation,12,Age 9-17,United States of America,US,USA,840.0


In [14]:
# Count cases per (country, registration year); size() counts the rows in each group
# and reset_index turns the group keys back into ordinary columns.
case_groups = trafficking_merge_df1.groupby(["country_iso3", "yearOfRegistration"])
x = case_groups.size().reset_index(name="case_counts")

x

Unnamed: 0,country_iso3,yearOfRegistration,case_counts
0,AFG,2013,3
1,AFG,2014,1
2,BLR,2015,90
3,BLR,2016,112
4,BLR,2017,30
5,CHN,2017,11
6,CHN,2018,12
7,ERI,2014,1
8,HTI,2012,89
9,IDN,2014,1


In [15]:
def _value_or_nan(wide_df, iso3, year):
    """Return ``wide_df.loc[iso3, year]``, or NaN when either label is absent.

    Parameters
    ----------
    wide_df : DataFrame with country codes as the index and year labels as columns.
    iso3 : row label to look up.
    year : column label to look up (a string such as "2015").
    """
    if iso3 in wide_df.index and year in wide_df.columns:
        return wide_df.loc[iso3, year]
    return np.nan


# Look up the unemployment and GDP figure for each (country, year) group.
# The year columns of the WEO tables are labelled with strings, so the registration
# year is converted to str before the lookup. A country or year missing from a WEO
# table yields NaN instead of raising KeyError.
lookup_keys = list(zip(x["country_iso3"], x["yearOfRegistration"].astype(str)))
year_unemployment = [
    _value_or_nan(unemployment_df, iso3, year) for iso3, year in lookup_keys
]
year_gdp = [_value_or_nan(gdp_df, iso3, year) for iso3, year in lookup_keys]

In [16]:
# Attach the looked-up indicators as new columns on the grouped counts
x["Unemployment"], x["GDP"] = year_unemployment, year_gdp

# Keep only the rows where every column has a value
x_no_null = x.dropna(axis=0, how="any")
x_no_null

Unnamed: 0,country_iso3,yearOfRegistration,case_counts,Unemployment,GDP
2,BLR,2015,90,5.821,5941.24
3,BLR,2016,112,5.92,5022.47
4,BLR,2017,30,5.684,5757.29
5,CHN,2017,11,3.9,8823.46
6,CHN,2018,12,3.8,9919.81
9,IDN,2014,1,5.94,3533.61
10,IDN,2015,6,6.18,3367.69
11,IDN,2016,28,5.61,3605.72
12,IDN,2017,34,5.5,3885.47
14,KGZ,2014,71,8.046,1292.62


In [17]:
# Drop country as a feature since we're looking for an overall model
# See if dropping year helps the model accuracy
# The result is rebound instead of dropped in place: an in-place drop on the frame
# returned by dropna() triggers pandas' SettingWithCopyWarning. errors="ignore"
# keeps the cell re-runnable once the columns are already gone.
x_no_null = x_no_null.drop(
    columns=["country_iso3", "yearOfRegistration"], errors="ignore"
)
x_no_null

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,case_counts,Unemployment,GDP
2,90,5.821,5941.24
3,112,5.92,5022.47
4,30,5.684,5757.29
5,11,3.9,8823.46
6,12,3.8,9919.81
9,1,5.94,3533.61
10,6,6.18,3367.69
11,28,5.61,3605.72
12,34,5.5,3885.47
14,71,8.046,1292.62


In [18]:
# Separate the target (case counts) from the predictor columns
outcomes = x_no_null["case_counts"]
data = x_no_null.drop("case_counts", axis=1)

# Hold out a test split; the fixed random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(data, outcomes, random_state=52)

# The figures range from the teens up to tens of thousands, so standardize them.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [19]:
# Fit a linear regression model on the scaled training features.
Regular = LinearRegression()

# fit() is left as the cell's last expression, so the notebook echoes the estimator repr.
Regular.fit(X_train_scaled, y_train)

LinearRegression()

In [20]:
# Predict case counts for the held-out test rows, using the scaled test features.
y_predictions = Regular.predict(X_test_scaled)
y_predictions

array([107.10758487,  70.80455867,  57.41802588,  88.04512831,
       770.42414586,  56.83443423, 347.33690249,  89.54677203])

In [21]:
# Actual case counts for the test rows, shown for comparison with the predictions.
y_test

36     172
24      19
12      34
4       30
42    1227
11      28
21      12
6       12
Name: case_counts, dtype: int64