# FinalProject_Part4_Model Draft 2 (Demo vs Exploit Type)

## 2) Multinomial Logistic Regression (Input: Demographics; Output: Exploitation Type)

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score,train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import random
import matplotlib as plt
import pdb

In [4]:
#Read Cleaned Global Data
GD_data_final = pd.read_csv('assets/GD_data_final.csv')
GD_data_final.head()

Unnamed: 0,yearOfRegistration,gender,majorityStatus,ageBroad_mid,citizenshipRegion,citizenshipGeoCategory,exploitationRegion,exploitationGeoCategory,mocDebtBondage,mocTakesEarnings,...,tosProstitution,tosPornography,tosRemoteInteractiveServices,tosPrivateSexualServices,tosNotSpecified,rrIntimatePartner,rrFriend,rrFamily,rrOther,rrUnknown
0,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,0,0,...,0,0,0,0,1,0,0,0,0,1
1,2004,Female,Adult,19.0,Eastern_Europe,Europe,Eastern_Europe,Europe,0,0,...,0,0,0,0,1,0,0,0,0,1
2,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,1,0,...,0,0,0,0,1,0,0,0,0,1
3,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,1,0,...,0,0,0,0,1,0,0,0,0,1
4,2012,Female,Minor,13.0,North_America,North_America,North_America,North_America,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Read Demographic vs Exploitation Geo Data
demo_exptype_calc = pd.read_csv('assets/demo_exptype_calc.csv')
demo_exptype_calc.head()

Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,g_Male,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,...,cg_North_America,cg_South_America,cg_Unknown,isForcedLabour,isSexualExploit,isOtherExploit,isSexAndLabour,isForcedMarriage,isForcedMilitary,isOrganRemoval
0,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2004,19.0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,2012,13.0,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [8]:
### Since Exploitation Type are not comletely mutually exclusive (i.e. One can be both isSexualExploit and isForcedLabour, 
### and thus isSexAndLabour), change those rows to just 1 for isSexAndLabour - nothing else

def nullify_Sex(x):
    if x['isSexAndLabour'] == 1:
        return 0
    else:
        return x['isSexualExploit']

def nullify_Labor(x):
    if x['isSexAndLabour'] == 1:
        return 0
    else:
        return x['isForcedLabour']
    
demo_exptype_calc['isSexualExploit'] = demo_exptype_calc.apply(nullify_Sex,axis=1)
demo_exptype_calc['isForcedLabour'] = demo_exptype_calc.apply(nullify_Labor,axis=1)

In [9]:
GD_data_final.columns

Index([u'yearOfRegistration', u'gender', u'majorityStatus', u'ageBroad_mid',
       u'citizenshipRegion', u'citizenshipGeoCategory', u'exploitationRegion',
       u'exploitationGeoCategory', u'mocDebtBondage', u'mocTakesEarnings',
       u'mocRestrictsFinancialAccess', u'mocThreats', u'mocPsychologicalAbuse',
       u'mocPhysicalAbuse', u'mocSexualAbuse', u'mocFalsePromises',
       u'mocPsychoactiveSubstances', u'mocRestrictsMovement',
       u'mocRestrictsMedicalCare', u'mocExcessiveWorkingHours',
       u'mocUsesChildren', u'mocThreatOfLawEnforcement',
       u'mocWithholdsNecessities', u'mocWithholdsDocuments', u'mocOther',
       u'mocNotSpecified', u'isForcedLabour', u'isSexualExploit',
       u'isOtherExploit', u'isSexAndLabour', u'isForcedMarriage',
       u'isForcedMilitary', u'isOrganRemoval', u'tolAgriculture',
       u'tolAquafarming', u'tolBegging', u'tolConstruction',
       u'tolDomesticWork', u'tolHospitality', u'tolIllicitActivities',
       u'tolManufacturing', u'tolM

In [13]:
### Consolidate all types of exploitation into one column

def combine_expType(x):
    if x['isForcedLabour'] == 1:
        return 'Forced Labour'
    elif x['isSexualExploit'] == 1:
        return 'Sexual Exploit'
    elif x['isOtherExploit'] == 1:
        return 'Other Exploit'
    elif x['isSexAndLabour'] == 1:
        return 'Sex and Labour'
    elif x['isForcedMarriage'] == 1:
        return 'Forced Marriage'
    elif x['isForcedMilitary'] == 1:
        return 'Forced Military'
    elif x['isOrganRemoval'] == 1:
        return 'Organ Removal'
    else:
        return 0
        
demo_exptype_calc['ExploitType'] = demo_exptype_calc.apply(combine_expType,axis=1)
demo_exptype_calc.head()

Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,g_Male,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,...,cg_South_America,cg_Unknown,isForcedLabour,isSexualExploit,isOtherExploit,isSexAndLabour,isForcedMarriage,isForcedMilitary,isOrganRemoval,ExploitType
0,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0.0,1.0,0,0,0,0,0,Sexual Exploit
1,2004,19.0,1,0,1,0,0,0,1,0,...,0,0,0.0,1.0,0,0,0,0,0,Sexual Exploit
2,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0.0,1.0,0,0,0,0,0,Sexual Exploit
3,2010,19.0,1,0,1,1,0,0,0,0,...,0,0,0.0,1.0,0,0,0,0,0,Sexual Exploit
4,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,1.0,0.0,0,0,0,0,0,Forced Labour


In [16]:
demo_exptype_calc.columns

Index([u'yearOfRegistration', u'ageBroad_mid', u'g_Female', u'g_Male',
       u'ms_Adult', u'cr_Central_Asia', u'cr_East_Africa', u'cr_East_Asia',
       u'cr_Eastern_Europe', u'cr_Middle_East', u'cr_North_America',
       u'cr_Northeast_Africa', u'cr_South_America', u'cr_South_Asia',
       u'cr_Southeast_Asia', u'cr_Southeastern_Europe', u'cr_Unknown',
       u'cr_West_Africa', u'cg_Africa', u'cg_Asia', u'cg_Europe',
       u'cg_Middle_East', u'cg_North_America', u'cg_South_America',
       u'cg_Unknown', u'isForcedLabour', u'isSexualExploit', u'isOtherExploit',
       u'isSexAndLabour', u'isForcedMarriage', u'isForcedMilitary',
       u'isOrganRemoval', u'ExploitType'],
      dtype='object')

In [21]:
     demo_exptype_headers = ['yearOfRegistration','ageBroad_mid', 'g_Female', 'ms_Adult', 
                              'cr_Central_Asia', 'cr_East_Africa', 'cr_East_Asia', 'cr_Eastern_Europe', 'cr_Middle_East', 
                              'cr_North_America', 'cr_Northeast_Africa', 'cr_South_America', 'cr_South_Asia', 
                              'cr_Southeast_Asia', 'cr_Southeastern_Europe', 'cr_West_Africa', 
                              'cg_Africa', 'cg_Asia', 'cg_Europe', 'cg_Middle_East', 'cg_North_America', 'cg_South_America', 
                              'ExploitType']
demo_exptype_calc[demo_exptype_headers[:25]]

Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,cr_North_America,...,cr_Southeast_Asia,cr_Southeastern_Europe,cr_West_Africa,cg_Africa,cg_Asia,cg_Europe,cg_Middle_East,cg_North_America,cg_South_America,ExploitType
0,2010,19.0,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Sexual Exploit
1,2004,19.0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,Sexual Exploit
2,2010,19.0,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Sexual Exploit
3,2010,19.0,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Sexual Exploit
4,2012,13.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,Forced Labour
5,2012,13.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,Forced Labour
6,2012,13.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,Forced Labour
7,2012,13.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,Forced Labour
8,2012,13.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,Forced Labour
9,2012,13.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,Forced Labour


In [37]:
# Multinomial Logistic Regression
def demo_exptype_main():
    ### Including all demo features (Can be modified and observe score changes):
    #demo_exptype_headers = ['yearOfRegistration', 'ageBroad_mid', 'g_Female', 'g_Male', 'ms_Adult', 
                              #'cr_Central_Asia', 'cr_East_Africa', 'cr_East_Asia', 'cr_Eastern_Europe', 'cr_Middle_East', 
                              #'cr_North_America', 'cr_Northeast_Africa', 'cr_South_America', 'cr_South_Asia', 
                              #'cr_Southeast_Asia', 'cr_Southeastern_Europe', 'cr_Unknown', 'cr_West_Africa', 
                              #'cg_Africa', 'cg_Asia', 'cg_Europe', 'cg_Middle_East', 'cg_North_America', 'cg_South_America', 
                              #'cg_Unknown','ExploitType']
    demo_exptype_headers = ['g_Male',
                              'ExploitType']
    #'cg_Africa', 'cg_Asia', 'cg_Europe', 'cg_Middle_East', 'cg_North_America', 'cg_South_America',
    #'ageBroad_mid','g_Female', 'ms_Adult', 'cg_Africa', 'cg_Asia', 'cg_Europe', 'cg_Middle_East', 'cg_North_America', 'cg_South_America', 
    
    ### Train Test Split - Out put col index -1 
    X = demo_exptype_calc[demo_exptype_headers[:-1]]
    y = demo_exptype_calc[demo_exptype_headers[-1]]
    train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.7)
    
    # USE TWO DIFFERENT APPROACHES TO TRAIN 
    # Train multi-classification model with logistic regression
    lr = LogisticRegression()
    lr.fit(train_x, train_y)
 
    # Train multinomial logistic regression model
    mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    mul_lr.fit(train_x, train_y)
 
    print "Logistic regression Train Accuracy :: ", metrics.accuracy_score(train_y, lr.predict(train_x))
    print "Logistic regression Test Accuracy :: ", metrics.accuracy_score(test_y, lr.predict(test_x))
 
    print "Multinomial Logistic regression Train Accuracy :: ", metrics.accuracy_score(train_y, mul_lr.predict(train_x))
    print "Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(test_y, mul_lr.predict(test_x))
    
    print ("Cross validating...")

    print ("Multi-classification Log Reg - train accuracy: ")
    print (np.mean(cross_val_score(lr, train_x, train_y.values, scoring='accuracy', cv=10)))
    print ("Multinomial Log Reg - train accuracy: ")
    print (np.mean(cross_val_score(mul_lr, train_x, train_y.values, scoring='accuracy', cv=10)))
    #print ("precision: ")
    #print (np.mean(cross_val_score(model_demo_expgeo, train_x, train_y.values, scoring='precision', cv=10)))
    #print ("recall: ")
    #print (np.mean(cross_val_score(model_demo_expgeo, train_x, train_y.values, scoring='recall', cv=10)))
    #print ("f1: ")
    #print (np.mean(cross_val_score(model_demo_expgeo, train_x, train_y.values, scoring='f1', cv=10)))
    #print ("roc_auc: ")
    #print (np.mean(cross_val_score(model_demo_expgeo, train_x, train_y.values, scoring='roc_auc', cv=10)))
    print (mul_lr.get_params())
    print (mul_lr.predict(1))
    
if __name__ == '__main__':
  demo_exptype_main()


Logistic regression Train Accuracy ::  0.825398052249
Logistic regression Test Accuracy ::  0.831410025243
Multinomial Logistic regression Train Accuracy ::  0.825398052249
Multinomial Logistic regression Test Accuracy ::  0.831410025243
Cross validating...
Multi-classification Log Reg - train accuracy: 
0.825392985173
Multinomial Log Reg - train accuracy: 
0.825392985173
{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 0, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'multinomial', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'newton-cg', 'class_weight': None}
['Forced Labour']


### Findings:
Cross Val Score - Accuracy: (Generally between 0.87-0.88)
- Include all features = 0.879
- All except yearOfRegistration = 0.874
- All except yearOfRegistration, g_Male, cr_Unknown, cg_Unknown = 0.878 (remove possibly multicollinearity)

## Observations: Accuracy Scores

### All Demo data with Citizenship Region --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.930669346112
Multinomial Logistic regression Test Accuracy ::  0.92589253516

### All Demo data with Citizenship Geo Category --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.922553717731
Multinomial Logistic regression Test Accuracy ::  0.923007573026

### yearOfRegistration --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.60071108363
Multinomial Logistic regression Test Accuracy ::  0.602596465921

### ageBroad_mid --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.75614469006
Multinomial Logistic regression Test Accuracy ::  0.753876667869

### g_Female --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.829262637193
Multinomial Logistic regression Test Accuracy ::  0.822394518572
Male --> Forced Labour
Female --> Sexual Exploit

### ms_Adult --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.601870459113
Multinomial Logistic regression Test Accuracy ::  0.59989181392

### citizenship Regions --> Exploit Type:
Multinomial Logistic regression Train Accuracy ::  0.816818673674
Multinomial Logistic regression Test Accuracy ::  0.816444284169

Conclusion: Demographics have strong predictive power against Exploitation Type.  Most predictive feature is Citizenship Region, followed by Gender and Age.
