In [None]:
import httplib2
import oauth2
import urllib3
import types
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from gender_detector import GenderDetector 
import psycopg2, psycopg2.extras
from causalinference import CausalModel
from causalinference.utils import random_data
import httplib
import base64
import json # For Microsoft Face API
import urllib as urllib # For Microsoft Face API
import time 
import csv
import datetime 
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Initialize accumulators for the causal analysis. They are filled further
# down in this cell, one entry per worker that survives the filters below.
user_count = 1  # not read again in this part of the notebook
bill_rate_array = []       # outcome: bill rate rounded to 2 decimals
gender_array = []          # treatment flag: 0 = male, 1 = female
all_covariates_array = []  # one row of 0/1 indicator values per worker

# Read data from csv files and reformat
all_data_file_name = './csv_files/2017_12_12_upwork_analysis_unitedstates_allskills.csv' # Filename for all data
gender_data_file_name = './csv_files/2017_12_12_upwork_face_data_unitedstates_allskills.csv' # Filename for gender data
df = pd.read_csv(all_data_file_name)
df_gender = pd.read_csv(gender_data_file_name)

# Merge the files, then remove rows with an error, no gender identified, or ambiguous education
merged = df.merge(df_gender, on='user_count')

# Count how many merged rows carry each failure marker *before* anything is
# dropped, so the counts are independent of the order of the filters below.
# NOTE(review): recent pandas releases appear to treat the literal string
# 'None' as a missing value in read_csv by default, which would make the
# education comparison match nothing -- confirm against the pandas version
# in use.
total_merged = len(merged)
bill_rate_errors = len(merged[merged.bill_rate == 'error'])
education_errors = len(merged[merged.education == 'None'])
work_experience_errors = len(merged[merged.work_experience == 'error'])
#jobs_completed_errors = len(merged[merged.jobs_completed == 'error'])
job_category_errors = len(merged[merged.job_category == 'none'])
gender_errors = len(merged[merged.gender == 'error'])

# Single-argument print(...) behaves the same under Python 2's print
# statement and Python 3's print function.
print("Total merged: {0}".format(total_merged))
print("Bill rate errors: {0}".format(bill_rate_errors))
print("Education errors: {0}".format(education_errors))
print("Work experience errors: {0}".format(work_experience_errors))
#print("Jobs completed errors: {0}".format(jobs_completed_errors))
print("Jobs category errors: {0}".format(job_category_errors))
print("Gender errors: {0}".format(gender_errors))

# Drop the flagged rows one criterion at a time, reporting the remaining
# row count after each step.
merged = merged[merged.bill_rate != 'error']
print("Total after bill rate errors: {0}".format(len(merged)))
merged = merged[merged.gender != 'error']
print("Total after gender errors: {0}".format(len(merged)))
merged = merged[merged.education != 'None']
print("Total after education: {0}".format(len(merged)))
merged = merged[merged.work_experience != 'error']
print("Total after work experience: {0}".format(len(merged)))
#merged = merged[merged.jobs_completed != 'error']
#print("Total after jobs_completed: {0}".format(len(merged)))
# Take an explicit copy after the last filter: the column assignments below
# would otherwise write into a slice of the original frame, which is the
# chained-assignment pattern pandas warns about.
merged = merged[merged.job_category != 'none'].copy()
print("Total after job_category: {0}".format(len(merged)))

# Calculate mean and standard deviation
merged['bill_rate'] = merged.bill_rate.astype('float')
sd = np.std(merged['bill_rate'])
mean = np.mean(merged['bill_rate'])
print("Bill rate average: {0}".format(mean))
print("Bill rate standard deviation: {0}".format(sd))

# Remove outliers in the dataset (currently disabled)
# merged = merged[merged['bill_rate'] > mean - 2 * sd]
# merged = merged[merged['bill_rate'] < mean + 2 * sd]

# Show the outliers IDs (currently disabled)
# def showOutliers(data):
#     outliers = data[data['bill_rate'] > mean + 2 * sd]
#     all_ids = outliers.worker_id_x
#     all_billrates = outliers.bill_rate
#     for person in all_ids:
#         print(person)
#
# showOutliers(merged)

# Calculate mean and standard deviation again so the figures reflect the
# outlier removal whenever it is enabled; while it stays commented out these
# two lines repeat the values printed above.
sd = np.std(merged['bill_rate'])
mean = np.mean(merged['bill_rate'])
print("Bill rate average: {0}".format(mean))
print("Bill rate standard deviation: {0}".format(sd))

# Column views used by the data-formatting code below. bill_rate was already
# converted to float above, so only work_experience still needs the cast.
all_bill_rates = merged.bill_rate.astype('float')
merged['work_experience'] = merged.work_experience.astype('float')
all_work_experience = merged.work_experience
all_education_id = merged.education_id
all_age_range_id = merged.age_range_id
all_job_category_id = merged.job_category_id
all_genders = merged.gender
female_count = 0
male_count = 0

# Converting covariates to a matrix on a dichotomous scale
def make_dichotomous_matrix(id_value, covariate, final_matrix):
    """Append a 0/1 indicator for every distinct value found in ``covariate``.

    Each distinct value contributes exactly one entry: 1 when it equals
    ``id_value`` and 0 otherwise. Entries follow the iteration order of
    ``set(covariate)``.

    ``final_matrix`` is extended in place and the same list is returned.
    """
    distinct_values = set(covariate)
    final_matrix.extend(
        1 if id_value == candidate else 0 for candidate in distinct_values
    )
    return final_matrix
        
# Data formatting
#
# Build the outcome (bill rate), treatment (gender) and covariate lists in a
# single pass over the rows so that position i in all three lists always
# refers to the same worker. A row whose gender is neither "male" nor
# "female" is left out of all three lists instead of only the gender list,
# which would silently shift every later entry.

# Distinct category values, collected once up front. The indicator columns
# follow the iteration order of each set, which is the same order
# make_dichotomous_matrix derives from the full column on every call.
job_category_options = list(set(all_job_category_id))
education_options = list(set(all_education_id))
age_range_options = list(set(all_age_range_id))

skipped_gender_rows = 0

for row in merged.itertuples():
    if row.gender == "male":
        gender_code = 0
        male_count += 1
    elif row.gender == "female": # Female as the treatment group
        gender_code = 1
        female_count += 1
    else:
        skipped_gender_rows += 1
        continue

    # One 0/1 indicator per distinct category value, concatenated in the
    # order job category, education, age range.
    individual_covariate_matrix = []
    individual_covariate_matrix.extend(
        1 if row.job_category_id == option else 0 for option in job_category_options)
    individual_covariate_matrix.extend(
        1 if row.education_id == option else 0 for option in education_options)
    individual_covariate_matrix.extend(
        1 if row.age_range_id == option else 0 for option in age_range_options)
    #individual_covariate_matrix.append(row.work_experience)

    gender_array.append(gender_code)
    bill_rate_array.append(round(float(row.bill_rate), 2))
    all_covariates_array.append(individual_covariate_matrix)

# Optional sub-sampling (currently disabled)
# bill_rate_array = bill_rate_array[0:10000]
# gender_array = gender_array[0:10000]
# all_covariates_array = all_covariates_array[0:10000]

# Check that arrays contain complete data
if skipped_gender_rows:
    print("Rows skipped (gender not male/female): {0}".format(skipped_gender_rows))
print("Bill rate array length: {0}".format(len(bill_rate_array)))
print("Gender array length: {0}".format(len(gender_array)))
print("Covariate array length: {0}".format(len(all_covariates_array)))
print("Female count: {0}, Male count: {1}".format(female_count, male_count))


Total merged: 55518
Bill rate errors: 9588
Education errors: 12083
Work experience errors: 9631
Jobs completed errors: 9588
Jobs category errors: 446
Gender errors: 3457
Total after bill rate errors: 45930
Total after gender errors: 43013
Total after education: 31875
Total after work experience: 31853
Total after job_category: 31717
Bill rate average: 39.8293404168
Bill rate standard deviation: 37.9426751272
Bill rate average: 39.8293404168
Bill rate standard deviation: 37.9426751272


In [37]:
# Turn the three Python lists into NumPy arrays:
#   Y - outcome (bill rates), D - treatment flag (gender), X - covariates.
Y, D, X = map(np.array, (bill_rate_array, gender_array, all_covariates_array))

#np.seterr(divide='ignore', invalid='ignore')

# Fit-free model object; the summary compares the two groups on Y and on
# each covariate column.
causal = CausalModel(Y, D, X)

print(causal.summary_stats)


Summary Statistics

                     Controls (N_c=16638)       Treated (N_t=15079)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y       41.605       37.358       37.870       38.485       -3.735

                     Controls (N_c=16638)       Treated (N_t=15079)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0        0.030        0.170        0.038        0.191        0.044
             X1        0.074        0.262        0.076        0.265        0.006
             X2        0.030        0.172        0.022        0.148       -0.051
             X3        0.196        0.397        0.232        0.422        0.088
             X4        0.007        0.082        0.007        0.084        0.004
      

In [38]:
# Matching each person in the dataset with a comparable person in the dataset
# The estimates table printed below reports ATE, ATC and ATT.
# NOTE(review): presumably matching is done on the covariate columns of X
# with the library's default settings -- confirm in the causalinference docs.

causal.est_via_matching()
print(causal.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.573      0.438     -3.589      0.000     -2.432     -0.714
           ATC     -1.961      0.454     -4.325      0.000     -2.850     -1.073
           ATT     -1.145      0.459     -2.492      0.013     -2.045     -0.244



In [39]:
# Estimate the propensity score (probability of being in the treated group
# given the covariates) and print the fitted coefficients.
# NOTE(review): the printed table lists only some X columns and not in index
# order, so the "_s" variant presumably selects which covariates enter the
# model -- confirm in the causalinference docs.
causal.est_propensity_s()
print (causal.propensity)


Estimated Parameters of Propensity Score

                    Coef.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
     Intercept     -0.588      0.026    -22.452      0.000     -0.639     -0.536
           X18      1.372      0.046     29.711      0.000      1.281      1.462
           X21      0.503      0.026     19.623      0.000      0.452      0.553
           X19      1.395      0.092     15.243      0.000      1.216      1.575
            X8     -0.203      0.039     -5.254      0.000     -0.279     -0.127
           X10      0.301      0.039      7.694      0.000      0.224      0.378
            X3      0.234      0.032      7.276      0.000      0.171      0.297
            X0      0.165      0.070      2.357      0.018      0.028      0.303
           X22     -0.431      0.105     -4.111      0.000     -0.637     -0.226
           X11      0.218      0.081      2.701      0.007      0.

In [40]:
# Trimming

causal.trim_s()
causal.cutoff
print (causal.summary_stats)


Summary Statistics

                     Controls (N_c=16617)       Treated (N_t=14921)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y       41.608       37.338       38.012       38.642       -3.597

                     Controls (N_c=16617)       Treated (N_t=14921)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0        0.029        0.167        0.029        0.168        0.002
             X1        0.074        0.262        0.077        0.266        0.008
             X2        0.030        0.172        0.023        0.148       -0.049
             X3        0.197        0.397        0.235        0.424        0.093
             X4        0.007        0.083        0.007        0.085        0.005
      

In [41]:
# Run three estimators on the same model object and print all of their
# results together: OLS, propensity weighting, and matching with bias
# adjustment enabled. Assuming the cells run in order, these operate on the
# sample left after the trimming step above.
causal.est_via_ols()
causal.est_via_weighting()
causal.est_via_matching(bias_adj=True)
print(causal.estimates)


Treatment Effect Estimates: Weighting

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.524      0.418     -3.646      0.000     -2.344     -0.705

Treatment Effect Estimates: OLS

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.584      0.419     -3.784      0.000     -2.405     -0.764
           ATC     -1.920      0.441     -4.350      0.000     -2.785     -1.055
           ATT     -1.217      0.413     -2.949      0.003     -2.026     -0.408

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.533      0.438     -3.502      0.000     -2.391     -0.