# Final Project - Part 4A - Model Draft (Ellein Cheng)

## 4A) Mutinomial Logistic Regression (Input: Demographics & Recruiter Relation; Output: Type of Labor)

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score,train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
import random
import matplotlib as plt
import pdb



In [3]:
#Read Cleaned Global Data
GD_data_final = pd.read_csv('assets/GD_data_final.csv')
GD_data_final.head()

Unnamed: 0,yearOfRegistration,gender,majorityStatus,ageBroad_mid,citizenshipRegion,citizenshipGeoCategory,exploitationRegion,exploitationGeoCategory,mocDebtBondage,mocTakesEarnings,...,tosProstitution,tosPornography,tosRemoteInteractiveServices,tosPrivateSexualServices,tosNotSpecified,rrIntimatePartner,rrFriend,rrFamily,rrOther,rrUnknown
0,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,0,0,...,0,0,0,0,1,0,0,0,0,1
1,2004,Female,Adult,19.0,Eastern_Europe,Europe,Eastern_Europe,Europe,0,0,...,0,0,0,0,1,0,0,0,0,1
2,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,1,0,...,0,0,0,0,1,0,0,0,0,1
3,2010,Female,Adult,19.0,Central_Asia,Asia,Central_Asia,Asia,1,0,...,0,0,0,0,1,0,0,0,0,1
4,2012,Female,Minor,13.0,North_America,North_America,North_America,North_America,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
GD_data_final.columns

Index([u'yearOfRegistration', u'gender', u'majorityStatus', u'ageBroad_mid',
       u'citizenshipRegion', u'citizenshipGeoCategory', u'exploitationRegion',
       u'exploitationGeoCategory', u'mocDebtBondage', u'mocTakesEarnings',
       u'mocRestrictsFinancialAccess', u'mocThreats', u'mocPsychologicalAbuse',
       u'mocPhysicalAbuse', u'mocSexualAbuse', u'mocFalsePromises',
       u'mocPsychoactiveSubstances', u'mocRestrictsMovement',
       u'mocRestrictsMedicalCare', u'mocExcessiveWorkingHours',
       u'mocUsesChildren', u'mocThreatOfLawEnforcement',
       u'mocWithholdsNecessities', u'mocWithholdsDocuments', u'mocOther',
       u'mocNotSpecified', u'isForcedLabour', u'isSexualExploit',
       u'isOtherExploit', u'isSexAndLabour', u'isForcedMarriage',
       u'isForcedMilitary', u'isOrganRemoval', u'tolAgriculture',
       u'tolAquafarming', u'tolBegging', u'tolConstruction',
       u'tolDomesticWork', u'tolHospitality', u'tolIllicitActivities',
       u'tolManufacturing', u'tolM

In [5]:
# Read Demographic vs Labor Type Data - Only for isForcedLabour == 1
demo_labortype_calc = pd.read_csv('assets/demo_labortype_calc.csv')
demo_labortype_calc.reset_index(drop=True)
demo_labortype_calc.head()

Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,g_Male,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,...,tolConstruction,tolDomesticWork,tolHospitality,tolIllicitActivities,tolManufacturing,tolMiningOrDrilling,tolPeddling,tolTransportation,tolOther,tolNotSpecified
0,2012,13.0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2012,13.0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2012,13.0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,2012,13.0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
demo_labortype_calc.columns

Index([u'yearOfRegistration', u'ageBroad_mid', u'g_Female', u'g_Male',
       u'ms_Adult', u'cr_Central_Asia', u'cr_East_Africa', u'cr_East_Asia',
       u'cr_Eastern_Europe', u'cr_Middle_East', u'cr_North_America',
       u'cr_Northeast_Africa', u'cr_South_America', u'cr_South_Asia',
       u'cr_Southeast_Asia', u'cr_Southeastern_Europe', u'cr_Unknown',
       u'cr_West_Africa', u'cg_Africa', u'cg_Asia', u'cg_Europe',
       u'cg_Middle_East', u'cg_North_America', u'cg_South_America',
       u'cg_Unknown', u'tolAgriculture', u'tolAquafarming', u'tolBegging',
       u'tolConstruction', u'tolDomesticWork', u'tolHospitality',
       u'tolIllicitActivities', u'tolManufacturing', u'tolMiningOrDrilling',
       u'tolPeddling', u'tolTransportation', u'tolOther', u'tolNotSpecified'],
      dtype='object')

In [7]:
# Carve out Just Recruiter Relation data
GD_data_Labor = GD_data_final[GD_data_final['isForcedLabour']==1].reset_index(drop=True)
rr_Labor_handCalc = GD_data_Labor[['rrIntimatePartner','rrFriend','rrFamily','rrOther','rrUnknown']]
rr_Labor_handCalc.reset_index(drop=True)
rr_Labor_handCalc.head()
rr_test = rr_Labor_handCalc

In [8]:
## JOIN Demographics, Means of Control, and Recruiter Relation data
#demo_rr_labortype_calc = demo_labortype_calc.join(rr_Labor_handCalc)
demo_rr_Labortype_calc = pd.concat([demo_labortype_calc,rr_test],axis=1)
demo_rr_Labortype_calc.head()
len(demo_rr_Labortype_calc)

6750

In [9]:
### Consolidate all types of labor into one column

def combine_laborType(x):
    if x['tolAgriculture'] == 1:
        return 'Agriculture'
    elif x['tolAquafarming'] == 1:
        return 'Aquafarming'
    elif x['tolBegging'] == 1:
        return 'Begging'
    elif x['tolConstruction'] == 1:
        return 'Construction'
    elif x['tolDomesticWork'] == 1:
        return 'DomesticWork'
    elif x['tolHospitality'] == 1:
        return 'Hospitality'
    elif x['tolIllicitActivities'] == 1:
        return 'IllicitActivities'
    elif x['tolManufacturing'] == 1:
        return 'Manufacturing'
    elif x['tolMiningOrDrilling'] == 1:
        return 'MiningOrDrilling'
    elif x['tolPeddling'] == 1:
        return 'Peddling'
    elif x['tolTransportation'] == 1:
        return 'Transportation'
    elif x['tolOther'] == 1:
        return 'Other'
    elif x['tolNotSpecified'] == 1:
        return 'NotSpecified'
    else:
        return 0
        
demo_rr_Labortype_calc['LaborType'] = demo_rr_Labortype_calc.apply(combine_laborType,axis=1)
demo_rr_Labortype_calc.head()


Unnamed: 0,yearOfRegistration,ageBroad_mid,g_Female,g_Male,ms_Adult,cr_Central_Asia,cr_East_Africa,cr_East_Asia,cr_Eastern_Europe,cr_Middle_East,...,tolPeddling,tolTransportation,tolOther,tolNotSpecified,rrIntimatePartner,rrFriend,rrFamily,rrOther,rrUnknown,LaborType
0,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,DomesticWork
1,2012,13.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,DomesticWork
2,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,DomesticWork
3,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,DomesticWork
4,2012,13.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,NotSpecified


In [10]:
demo_rr_Labortype_calc.iloc[4724:4727,18:21]

Unnamed: 0,cg_Africa,cg_Asia,cg_Europe
4724,0,1,0
4725,0,1,0
4726,0,1,0


In [12]:
# Try OneVsRestClassifier - Multilabel classification method
def main():
    headers = ['ageBroad_mid','g_Female', 
                              'cr_Central_Asia', 'cr_East_Africa', 'cr_East_Asia', 'cr_Eastern_Europe', 'cr_Middle_East', 
                              'cr_North_America', 'cr_Northeast_Africa', 'cr_South_America', 'cr_South_Asia', 
                              'cr_Southeast_Asia', 'cr_Southeastern_Europe', 'cr_West_Africa', 
                                'rrIntimatePartner', 'rrFriend', 'rrFamily', 'rrOther', 
                                 'LaborType']
    
    ### Train Test Split - Out put col index -1 
    X = demo_rr_Labortype_calc[headers[:-1]]
    y = demo_rr_Labortype_calc[headers[-1]]
    train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.7)
    
    # Train multinomial logistic regression model
    mul_lr = LogisticRegression(C=10,multi_class='multinomial', solver='newton-cg')
    mul_lr.fit(train_x, train_y)

 
    print "Multinomial Logistic regression Train Accuracy :: ", metrics.accuracy_score(train_y, mul_lr.predict(train_x))
    print "Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(test_y, mul_lr.predict(test_x))
    
    print ("Cross validating...")

    print ("Multinomial Log Reg - train accuracy: ")
    print (np.mean(cross_val_score(mul_lr, train_x, train_y.values, scoring='accuracy', cv=10)))
    
    #region_unit = np.identity(12)
    #TEST  AGE-GENDER COMBO:
    ag = np.array([[10,1],[10,0],[22,1],[22,0],[40,1],[40,0]])
    
    print ("Classes: ")
    print (mul_lr.classes_)
    pred_y = mul_lr.predict(test_x)

    def cm2df(cm, labels):
        df = pd.DataFrame()
        # rows
        for i, row_label in enumerate(labels):
            rowdata={}
            # columns
            for j, col_label in enumerate(labels): 
                rowdata[col_label]=cm[i,j]
            df = df.append(pd.DataFrame.from_dict({row_label:rowdata}, orient='index'))
        return df[labels]

    ## to use, first generate confusion matrix:
    print ("Confusion Matrix: ")
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(test_y, pred_y)
    target = ['Agriculture', 'Aquafarming', 'Begging', 'Construction', 'DomesticWork',
 'Hospitality', 'Manufacturing', 'NotSpecified', 'Other', 'Peddling']
    ## then convert to pandas DataFrame:
    cm_as_df3=cm2df(cm,target)

    
    
    
if __name__ == '__main__':
  main()

Multinomial Logistic regression Train Accuracy ::  0.77037037037
Multinomial Logistic regression Test Accuracy ::  0.75950617284
Cross validating...
Multinomial Log Reg - train accuracy: 




0.766793062191
Classes: 
['Agriculture' 'Aquafarming' 'Begging' 'Construction' 'DomesticWork'
 'Hospitality' 'Manufacturing' 'NotSpecified' 'Other' 'Peddling']
Confusion Matrix: 


## Observations: Accuracy Scores - Predict Labor Type

### All Demo & RR features with Citizenship Region:
Multinomial Logistic regression Train Accuracy ::  0.781164021164
Multinomial Logistic regression Test Accuracy ::  0.781234567901


### ageBroad_mid :
Multinomial Logistic regression Train Accuracy ::  0.635767195767
Multinomial Logistic regression Test Accuracy ::  0.64

### g_Female:
Multinomial Logistic regression Train Accuracy ::  0.645079365079
Multinomial Logistic regression Test Accuracy ::  0.6454320987656

### ms_Adult:
Multinomial Logistic regression Train Accuracy ::  0.646984126984
Multinomial Logistic regression Test Accuracy ::  0.640987654321

### citizenship Regions :
Multinomial Logistic regression Train Accuracy ::  0.696296296296
Multinomial Logistic regression Test Accuracy ::  0.682469135802

### recruiter relation :
Multinomial Logistic regression Train Accuracy ::  0.675132275132
Multinomial Logistic regression Test Accuracy ::  0.705185185185

Conclusions?  
Recruiter Relation - Family --> Begging
Central Asia - Contruction
East Africa - Begging
Eastern Europe - Construction
South Asia - Domestic Work
Age/Gender - not strong indicators

