# Trafficking classifier

This notebook contains the code for classifying the extracted trafficking relationships between pairs of countries. Countries are classified as either senders or recievers.

Implemented by Mitchell Goist and Christopher Boylan

In [23]:
import pandas as pd
import numpy as np
import glob
import os
from collections import Counter
from bs4 import BeautifulSoup
import re
# matplotlib
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
# seaborn
import seaborn as sns
# Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_predict
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
plt.rcParams["axes.labelsize"] = 15
plt.rcParams["ytick.labelsize"] = 'large'

In [24]:
def supervisor(text):
    sentences = text.split('.')
    source = sentences[0].find('source') > -1
    destination = sentences[0].find('destination') > -1
    transit = sentences[0].find('transit') > -1
    if source and not destination and not transit:
        label = 0
    elif destination and not source and not transit:
        label = 1
    else:
        label = ' '
    return label

### Prepare the data for classifiers

In [25]:
# Clean up hand-coding
mitch = pd.read_csv('../relation_extracction/annotation/mitch_coding.csv',
                    index_col=0)
chris = pd.read_csv('../relation_extracction/annotation/christopher_coding.csv',
                   index_col=0)
hand_codes = pd.concat([mitch, chris])

In [26]:
sentences = pd.read_csv('extracted_entities_dictionary_sents_v2.csv')

In [27]:
# Remove duplicate sentences to prevent extra rows from being created in the merge
clean_training = hand_codes.drop_duplicates(subset='text')
clean_training = clean_training[['file', 'text', 'label']]
clean_training['label'] = pd.to_numeric(clean_training['label'], errors='coerce')
clean_training

Unnamed: 0,file,text,label
3901,../2011/Sudan.txt,FILIPINA MIGRANT DOMESTIC WORKERS MAY ALSO BE ...,1.0
1002,../2005/Switzerland.txt,\n\nSWITZERLAND IS PRIMARILY A DESTINATION COU...,1.0
1827,../2008/Chile.txt,"MIGRANTS FROM PERU AND BOLIVIA, INCLUDING CHIL...",1.0
4419,../2012/Morocco.txt,MOROCCAN WOMEN ARE FORCED INTO PROSTITUTION IN...,0.0
905,../2005/Kenya.txt,"ASIAN NATIONALS, MAINLY CHINESE WOMEN, ARE REP...",1.0
7678,../2016/Sudan.txt,SOME SUDANESE CITIZENS WHO MIGRATE TO EUROPE V...,0.0
379,../2003/Kyrgyz_Republic.txt,MEN AND WOMEN ARE TRAFFICKED TO KAZAKHSTAN FOR...,0.0
1739,../2007/Zimbabwe.txt,YOUNG WOMEN AND GIRLS ARE ALSO LURED TO SOUTH ...,0.0
33,../2001/Costa_Rica.txt,THERE ALSO HAVE BEEN REPORTS OF GIRLS FROM THE...,1.0
1637,../2007/Russia.txt,RUSSIA IS A SOURCE COUNTRY FOR MEN AND WOMEN T...,0.0


In [28]:
def supervisor(text):
    sentences = text.split('.')
    source = sentences[0].find('source') > -1
    destination = sentences[0].find('destination') > -1
    transit = sentences[0].find('transit') > -1
    if source and not destination and not transit:
        label = 0
    elif destination and not source and not transit:
        label = 1
    else:
        label = ' '
    return label
#add automatic labels
labels = []
fnames = []
for year in range(2001, 2016+1):
    for doc in glob.glob('../'+str(year)+'/*.txt'):
        #print(doc)
        contents=open(doc).read()
        fnames.append(doc)
        labels.append(supervisor(contents))
        

lazy_supervision = pd.DataFrame({'file' : fnames,
                              'label' : labels})     

In [29]:
temp = sentences.merge(lazy_supervision, on=['file'], how='left')
temp['label'] = pd.to_numeric(temp['label'], errors='coerce')

In [30]:
combined = temp.merge(clean_training, on=['file', 'text'], how='left')

In [31]:
combined.shape
combined['label'] = combined['label_x']
combined

Unnamed: 0,countries,country,entities,file,text,label_x,label_y,label
0,ROMANIA;NETHERLANDS;ITALY;MOLDOVA;GREECE;BELGIUM,ALBANIA,THE NETHERLANDS;ITALY;MOLDOVA;GREECE;ROMANIA;B...,../2001/Albania.txt,TRAFFICKING VICTIMS ARE MOSTLY WOMEN FROM ALBA...,,,
1,UKRAINE;ROMANIA;CZECH REPUBLIC;SLOVAKIA;HUNGAR...,AUSTRIA,UKRAINE;CZECH REPUBLIC;HUNGARY;ROMANIA;SLOVAKI...,../2001/Austria.txt,\nAUSTRIA IS A DESTINATION AND TRANSIT COUNTRY...,,,
2,RUSSIAN FEDERATION;ETHIOPIA;PHILIPPINES;BELARU...,BAHRAIN,BELARUS;THE PHILIPPINES;ETHIOPIA;RUSSIA;INDIA,../2001/Bahrain.txt,"WORKERS FROM THE PHILIPPINES, ETHIOPIA, INDIA,...",1.0,,1.0
3,PAKISTAN;INDIA,BANGLADESH,PAKISTAN;INDIA,../2001/Bangladesh.txt,SEVERAL THOUSAND WOMEN AND GIRLS ARE TRAFFICKE...,,,
4,UKRAINE;GERMANY;POLAND;LITHUANIA;RUSSIAN FEDER...,BELARUS,UKRAINE;LITHUANIA;GERMANY;RUSSIA;POLAND,../2001/Belarus.txt,YOUNG WOMEN ARE TRAFFICKED THROUGH BELARUS FRO...,,1.0,
5,NIGERIA;ALBANIA;CHINA,BELGIUM,ALBANIA;NIGERIA;CHINA,../2001/Belgium.txt,\nBELGIUM IS A TRANSIT AND DESTINATION COUNTRY...,,,
6,CHINA,BELGIUM,CHINA,../2001/Belgium.txt,VICTIMS ARE PRIMARILY YOUNG WOMEN TRAFFICKED F...,,,
7,GHANA;NIGERIA;GABON,BENIN,NIGERIA;GHANA;GABON,../2001/Benin.txt,"BENINESE CHILDREN ARE TRAFFICKED TO GHANA, NIG...",,,
8,TOGO;NIGER;BURKINA FASO,BENIN,TOGO;NIGER;BURKINA FASO,../2001/Benin.txt,"CHILDREN FROM NIGER, TOGO, AND BURKINA FASO HA...",,1.0,
9,UKRAINE;ROMANIA;BOSNIA AND HERZEGOVINA;MOLDOVA,BOSNIA-HERZEGOVINA,UKRAINE;MOLDOVA;BOSNIA;ROMANIA,../2001/Bosnia-Herzegovina.txt,\nBOSNIA IS A MAJOR DESTINATION AND TRANSIT CO...,,,


In [32]:
# Fix up the names for a couple of countries incorrectly parsed
combined = combined.loc[combined['file']!= '../2009/St._Vincent_And_The_Grenadines.txt']
combined.loc[combined['file'].str.contains('2009/Sri_Lanka.txt') & combined['countries'].str.contains('SUDAN|CHAD'), 'country'] = 'SUDAN'
combined = combined.drop(combined.loc[((combined['file']=='../2009/Sri_Lanka.txt') & (combined['countries']=='SUDAN'))].index)
combined.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(7788, 8)

In [33]:
combined = combined.drop(combined.loc[combined['text'].str.contains('AMERICAN INDIAN')].index)

In [34]:
combined.loc[combined.label_y == 1, 'label'] = 1

In [35]:
unlabelled = combined[(combined.label != 1) & (combined.label != 0)]
labelled = combined[(combined.label == 1) | (combined.label == 0)] 
labelled.shape, unlabelled.shape

((2096, 8), (5685, 8))

In [36]:
labelled
train, test = train_test_split(labelled, test_size=0.2, random_state=1253)

In [37]:
X_test = test['text']
y_test = np.array(test[['label']].values.flatten(), dtype="int")

X_train = train['text']
y_train = np.array(train[['label']].values.flatten(), dtype="int")

count_vect = CountVectorizer(decode_error='ignore', ngram_range=(1,2))
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print("There are {:,} words in the vocabulary.".format(len(count_vect.vocabulary_)))
print(X_train_counts.shape)
print(X_train_tfidf.shape)
print(len(X_train))
print(Counter(y_train))

X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

print(X_test_counts.shape)
print(X_test_tfidf.shape)
print(len(X_test))
print(Counter(y_test))


There are 17,093 words in the vocabulary.
(1676, 17093)
(1676, 17093)
1676
Counter({1: 915, 0: 761})
(420, 17093)
(420, 17093)
420
Counter({1: 249, 0: 171})


In [38]:
y_train = np.array(y_train,dtype='int')
y_train

array([1, 0, 1, ..., 1, 0, 0])

In [39]:
X_unlabelled_counts = count_vect.transform(unlabelled['text'])
print(X_unlabelled_counts.shape)

(5685, 17093)


### Estimate classification models

In [40]:
kf = KFold(n_splits=10, shuffle=True, random_state=1213)
f1_scorer = make_scorer(f1_score)
print(f1_scorer)

make_scorer(f1_score)


In [41]:
#logistic regression
grid = {
        'C': [1, .9, .8, .7, .6, .5, .25, .1],
        'penalty' : ['l1', 'l2']
    }
logit = LogisticRegression()
loggs = GridSearchCV(logit, grid, scoring=f1_scorer, cv=kf, n_jobs=-1)
loggs.fit(X_train_counts, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=1213, shuffle=True),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.25, 0.1], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [42]:
#multinomial naive bayes with counts
grid = {
        'alpha': [1, .8,.6, .5, .4, .2]
    }
mnb = MultinomialNB()
mnbgs = GridSearchCV(mnb, grid, scoring=f1_scorer, cv=kf)
mnbgs.fit(X_train_counts, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=1213, shuffle=True),
       error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1, 0.8, 0.6, 0.5, 0.4, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [43]:
#SVM with counts
grid = {
        'C': [1, .9, .8, .7, .6, .5]
    }
svc = SVC(kernel='linear', probability=True)
svcgs = GridSearchCV(svc, grid, scoring=f1_scorer, cv=kf, n_jobs=-1)
svcgs.fit(X_train_counts, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=1213, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [1, 0.9, 0.8, 0.7, 0.6, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [44]:
# Random forest
grid = { "n_estimators"      : [1000, 1500, 2000],
           "max_depth"         : [10, 25, 50] }
rf = RandomForestClassifier()
rfgs = GridSearchCV(rf, grid, n_jobs=-1, cv=kf)
rfgs.fit(X_train_counts, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=1213, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [1000, 1500, 2000], 'max_depth': [10, 25, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [53]:
logpreds = loggs.predict(X_test_counts)
svcpreds = svcgs.predict(X_test_counts)
mnbpreds = mnbgs.predict(X_test_counts)
rfpreds = rfgs.predict(X_test_counts)

In [46]:
unlabpreds = loggs.predict(X_unlabelled_counts)
unlabelled['label'] = unlabpreds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [49]:
final = pd.concat([labelled, unlabelled], axis=0)

In [50]:
year = '\d{4}' 
final['year'] = final.file.str.extract('('+year+')', expand=False)
final

Unnamed: 0,countries,country,entities,file,text,label_x,label_y,label,year
2,RUSSIAN FEDERATION;ETHIOPIA;PHILIPPINES;BELARU...,BAHRAIN,BELARUS;THE PHILIPPINES;ETHIOPIA;RUSSIA;INDIA,../2001/Bahrain.txt,"WORKERS FROM THE PHILIPPINES, ETHIOPIA, INDIA,...",1.0,,1.0,2001
4,UKRAINE;GERMANY;POLAND;LITHUANIA;RUSSIAN FEDER...,BELARUS,UKRAINE;LITHUANIA;GERMANY;RUSSIA;POLAND,../2001/Belarus.txt,YOUNG WOMEN ARE TRAFFICKED THROUGH BELARUS FRO...,,1.0,1.0,2001
8,TOGO;NIGER;BURKINA FASO,BENIN,TOGO;NIGER;BURKINA FASO,../2001/Benin.txt,"CHILDREN FROM NIGER, TOGO, AND BURKINA FASO HA...",,1.0,1.0,2001
14,JAPAN;ISRAEL;UNITED STATES OF AMERICA,BRAZIL,JAPAN;ISRAEL;UNITED STATES,../2001/Brazil.txt,THE MAJORITY OF BRAZILIAN TRAFFICKING VICTIMS ...,0.0,,0.0,2001
18,MALI,BURKINA FASO,MALIAN,../2001/Burkina_Faso.txt,TRAFFICKED MALIAN CHILDREN ARE ALSO DESTINED F...,,1.0,1.0,2001
31,NETHERLANDS;SINGAPORE;UNITED STATES OF AMERICA...,COLOMBIA,THE NETHERLANDS;SINGAPORE;JAPAN;HONG KONG;SPAI...,../2001/Colombia.txt,\nCOLOMBIA IS A SOURCE COUNTRY FOR TRAFFICKED ...,0.0,,0.0,2001
33,PHILIPPINES,COSTA RICA,THE PHILIPPINES,../2001/Costa_Rica.txt,THERE ALSO HAVE BEEN REPORTS OF GIRLS FROM THE...,,1.0,1.0,2001
40,ISRAEL;NETHERLANDS;ITALY;CURACAO;ARUBA;BELGIUM...,DOMINICAN REPUBLIC,ANTIGUA;GERMANY;ITALY;GREECE;ISRAEL;ARGENTINA;...,../2001/Dominican_Republic.txt,ACCORDING TO THE CENTER FOR INTEGRAL ORIENTATI...,0.0,,0.0,2001
44,TOGO;BENIN,GABON,TOGO;BENIN,../2001/Gabon.txt,\nGABON IS A DESTINATION COUNTRY FOR TRAFFICKE...,1.0,,1.0,2001
48,TOGO;NIGERIA;COTE D'IVOIRE,GHANA,TOGO;NIGERIA;COTE D'IVOIRE,../2001/Ghana.txt,CHILDREN ARE TRAFFICKED TO AND FROM COTE D'IVO...,,1.0,1.0,2001


### Save predicted classes for country pairs

In [61]:
final = final[['year', 'country', 'countries', 'text', 'label']]
final.to_csv('../relation_extracction/final_classifier_upload.csv', index=False)