# Final Project - Part 4 - Model Draft 3A (Demo+RR vs Means of Control)

## 3A) OneVsRestClassifier (Input: Demographics & Recruiter Relation; Output: Means Of Control)

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score,train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
import random
import matplotlib as plt
import pdb



In [3]:
# Load the cleaned global dataset from the project's assets folder and
# preview its first rows.
GD_data_final = pd.read_csv('assets/GD_data_final.csv')
GD_data_final.head()

Unnamed: 0,yearOfRegistration,gender,majorityStatus,ageBroad_mid,citizenshipRegion,citizenshipGeoCategory,exploitationRegion,exploitationGeoCategory,mocDebtBondage,mocTakesEarnings,...,tosProstitution,tosPornography,tosRemoteInteractiveServices,tosPrivateSexualServices,tosNotSpecified,rrIntimatePartner,rrFriend,rrFamily,rrOther,rrUnknown
0,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,0,0,...,0,0,0,0,1,0,0,0,0,1
1,2004,Female,Adult,19.0,Eastern_Europe,Europe,Eastern_Europe,Europe,0,0,...,0,0,0,0,1,0,0,0,0,1
2,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,1,0,...,0,0,0,0,1,0,0,0,0,1
3,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,1,0,...,0,0,0,0,1,0,0,0,0,1
4,2012,Female,Minor,13.0,North_America,North_America,North_America,North_America,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Inspect the column names available in the cleaned global dataset.
GD_data_final.columns

Index([u'yearOfRegistration', u'gender', u'majorityStatus', u'ageBroad_mid',
       u'citizenshipRegion', u'citizenshipGeoCategory', u'exploitationRegion',
       u'exploitationGeoCategory', u'mocDebtBondage', u'mocTakesEarnings',
       u'mocRestrictsFinancialAccess', u'mocThreats', u'mocPsychologicalAbuse',
       u'mocPhysicalAbuse', u'mocSexualAbuse', u'mocFalsePromises',
       u'mocPsychoactiveSubstances', u'mocRestrictsMovement',
       u'mocRestrictsMedicalCare', u'mocExcessiveWorkingHours',
       u'mocUsesChildren', u'mocThreatOfLawEnforcement',
       u'mocWithholdsNecessities', u'mocWithholdsDocuments', u'mocOther',
       u'mocNotSpecified', u'isForcedLabour', u'isSexualExploit',
       u'isOtherExploit', u'isSexAndLabour', u'isForcedMarriage',
       u'isForcedMilitary', u'isOrganRemoval', u'tolAgriculture',
       u'tolAquafarming', u'tolBegging', u'tolConstruction',
       u'tolDomesticWork', u'tolHospitality', u'tolIllicitActivities',
       u'tolManufacturing', u'tolM

In [5]:
# Load the pre-computed Demographics vs Means of Control table and preview
# its first rows.
demo_moc_calc = pd.read_csv('assets/demo_moc_calc.csv')
demo_moc_calc.head()
# Row count noted by the author: len(demo_moc_calc) = 18484

Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,g_Male,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,...,mocPsychoactiveSubstances,mocRestrictsMovement,mocRestrictsMedicalCare,mocExcessiveWorkingHours,mocUsesChildren,mocThreatOfLawEnforcement,mocWithholdsNecessities,mocWithholdsDocuments,mocOther,mocNotSpecified
0,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004,19.0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [6]:
# Keep only the five Recruiter Relation (rr*) columns of the global dataset
# and preview the first three rows.
rr_handCalc = GD_data_final[['rrIntimatePartner','rrFriend','rrFamily','rrOther','rrUnknown']]
rr_handCalc.head(3)

Unnamed: 0,rrIntimatePartner,rrFriend,rrFamily,rrOther,rrUnknown
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1


In [7]:
# Append the Recruiter Relation columns to the Demographics / Means of Control
# table. DataFrame.join with no `on` argument matches rows by index label
# (left join), so each row is paired with the rr* row of the same index.
# NOTE(review): this assumes demo_moc_calc and GD_data_final share the same
# row index and order -- confirm, otherwise rr* values land on the wrong rows.
demo_rr_moc_calc = demo_moc_calc.join(rr_handCalc)
demo_rr_moc_calc.head(3)

Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,g_Male,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,...,mocThreatOfLawEnforcement,mocWithholdsNecessities,mocWithholdsDocuments,mocOther,mocNotSpecified,rrIntimatePartner,rrFriend,rrFamily,rrOther,rrUnknown
0,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2004,19.0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
# Try OneVsRestClassifier - Multilabel classification method
def demo_rr_moc_main(feature_columns=None, train_size=0.7, random_state=42):
    """Train and score a One-vs-Rest logistic regression for Means of Control.

    Reads the notebook-level ``demo_rr_moc_calc`` frame. Demographic,
    citizenship and recruiter-relation columns are the inputs; the ``moc*``
    indicator columns are the multilabel output. Prints the train and test
    accuracy of one train/test split, then the mean 10-fold cross-validated
    accuracy computed on the training part.

    For multilabel targets ``metrics.accuracy_score`` is *subset* accuracy:
    a row only counts as correct when every one of its labels is predicted
    correctly.

    Parameters
    ----------
    feature_columns : list of str, optional
        Input columns to train on. Must be a non-empty subset of the
        demographic / recruiter-relation columns listed in the function body.
        Defaults to all of them.
    train_size : float, optional
        Fraction of rows used for training (default 0.7).
    random_state : int or None, optional
        Seed for the train/test split so that re-running the cell yields the
        same scores. Pass ``None`` for a fresh random split on every call.

    Raises
    ------
    ValueError
        If ``feature_columns`` is empty or names a column that is not one of
        the known input columns (this also keeps target columns from being
        used as inputs by accident).
    """
    # Inputs: demographics, citizenship region / geo-category dummies and
    # recruiter-relation indicators.
    demo_rr_headers = [
        'ms_Adult', 'g_Female', 'ageBroad_mid',
        'cr_Central_Asia', 'cr_East_Africa', 'cr_East_Asia',
        'cr_Eastern_Europe', 'cr_Middle_East', 'cr_North_America',
        'cr_Northeast_Africa', 'cr_South_America', 'cr_South_Asia',
        'cr_Southeast_Asia', 'cr_Southeastern_Europe', 'cr_West_Africa',
        'cg_Africa', 'cg_Asia', 'cg_Europe', 'cg_Middle_East',
        'cg_North_America', 'cg_South_America',
        'rrIntimatePartner', 'rrFriend', 'rrFamily', 'rrOther',
    ]

    # Outputs: one binary label per means of control.
    # 'mocNotSpecified' is not used as a target here.
    # NOTE(review): confirm this exclusion is intended.
    moc_headers = [
        'mocDebtBondage', 'mocTakesEarnings', 'mocRestrictsFinancialAccess',
        'mocThreats', 'mocPsychologicalAbuse', 'mocPhysicalAbuse',
        'mocSexualAbuse', 'mocFalsePromises', 'mocPsychoactiveSubstances',
        'mocRestrictsMovement', 'mocRestrictsMedicalCare',
        'mocExcessiveWorkingHours', 'mocUsesChildren',
        'mocThreatOfLawEnforcement', 'mocWithholdsNecessities',
        'mocWithholdsDocuments', 'mocOther',
    ]

    if feature_columns is None:
        feature_columns = demo_rr_headers
    else:
        feature_columns = list(feature_columns)
        unknown = [col for col in feature_columns if col not in demo_rr_headers]
        if not feature_columns or unknown:
            raise ValueError(
                "feature_columns must be a non-empty subset of the input "
                "columns; unknown: %r" % (unknown,))

    # Train / test split, seeded so the printed scores are reproducible.
    X = demo_rr_moc_calc[feature_columns]
    y = demo_rr_moc_calc[moc_headers]
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # One LogisticRegression per label column.
    ovr = OneVsRestClassifier(LogisticRegression(C=10))
    ovr.fit(train_x, train_y)

    # %-formatting keeps the output identical under both the Python 2 print
    # statement and the Python 3 print function.
    train_accuracy = metrics.accuracy_score(train_y, ovr.predict(train_x))
    test_accuracy = metrics.accuracy_score(test_y, ovr.predict(test_x))
    print("OneVsRestClassifier Train Accuracy ::  %s" % train_accuracy)
    print("OneVsRestClassifier Test Accuracy ::  %s" % test_accuracy)

    # Only accuracy is cross-validated: the author found the other scorers
    # unusable for this multilabel target, so no precision figure is
    # reported.
    print("Cross validating...")
    print("One-Vs-Rest - train accuracy: ")
    print(np.mean(cross_val_score(ovr, train_x, train_y.values,
                                  scoring='accuracy', cv=10)))


if __name__ == '__main__':
    demo_rr_moc_main()

OneVsRestClassifier Train Accuracy ::  0.628381511826
OneVsRestClassifier Test Accuracy ::  0.636314460873
Cross validating...
One-Vs-Rest - train accuracy: 
0.62737914654
One-Vs-Rest - train precision: 


## Observations: Accuracy Scores

### All Demo & RR features with Citizenship Region --> Means Of Control:
OneVsRestClassifier Train Accuracy ::  0.6301592209
OneVsRestClassifier Test Accuracy ::  0.627118644068


### ageBroad_mid --> Means Of Control:
OneVsRestClassifier Train Accuracy ::  0.610759004483
OneVsRestClassifier Test Accuracy ::  0.601153984854

### g_Female --> Means Of Control:
OneVsRestClassifier Train Accuracy ::  0.614623589426
OneVsRestClassifier Test Accuracy ::  0.617562206996

### ms_Adult --> Means Of Control:
OneVsRestClassifier Train Accuracy ::  0.614932756222
OneVsRestClassifier Test Accuracy ::  0.616840966462

### citizenship Regions --> Means Of Control:
OneVsRestClassifier Train Accuracy ::  0.615860256608
OneVsRestClassifier Test Accuracy ::  0.61305445366

### Recruiter Relation --> Means Of Control:
_TODO: no scores were recorded for this feature set._


Conclusions?  
If the input is citizenship region "Central Asia", the predicted Means of Control include: Takes Earnings, Threats, Psychological Abuse, False Promises, Restricts Movement, Restricts Medical Care, and Withholds Necessities.

If the input is Recruiter Relation "Intimate Partner", the predicted Means of Control is Psychological Abuse.

Caveat: the Means of Control data is of poor quality, so these results are not indicative.
