In [3]:
import httplib2
import oauth2
import urllib3
import types
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from gender_detector import GenderDetector 
import psycopg2, psycopg2.extras
from causalinference import CausalModel
from causalinference.utils import random_data
import httplib
import base64
import json # For Microsoft Face API
import urllib as urllib # For Microsoft Face API
import time 
import csv
import datetime 
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Initialize arrays for Causal Analysis 
user_count = 1
bill_rate_array = []
gender_array = []
all_covariates_array = []

# Read data from csv files and reformat
all_data_file_name = './csv_files/2017_12_12_upwork_analysis_unitedstates_allskills.csv' # Filename for all data
gender_data_file_name = './csv_files/2017_12_12_upwork_face_data_unitedstates_allskills.csv' # Filename for gender data
df = pd.read_csv(all_data_file_name)
df_gender = pd.read_csv(gender_data_file_name)

# Merge the files, then remove rows with an error, no gender identified, or ambiguous education
merged = df.merge(df_gender, on='user_count')

total_merged = len(merged)
bill_rate_errors = len(merged[merged.bill_rate == 'error'])
education_errors = len(merged[merged.education == 'None'])
work_experience_errors = len(merged[merged.work_experience == 'error'])
jobs_completed_errors = len(merged[merged.jobs_completed == 'error'])
job_category_errors = len(merged[merged.job_category == 'none'])
gender_errors = len(merged[merged.gender == 'error'])

print "Total merged: {0}".format(total_merged)
print "Bill rate errors: {0}".format(bill_rate_errors)
print "Education errors: {0}".format(education_errors)
print "Work experience errors: {0}".format(work_experience_errors)
print "Jobs completed errors: {0}".format(jobs_completed_errors)
print "Jobs category errors: {0}".format(job_category_errors)
print "Gender errors: {0}".format(gender_errors)

merged = merged[merged.bill_rate != 'error']
print "Total after bill rate errors: {0}".format(len(merged))
merged = merged[merged.work_experience != 'error']
print "Total after work experience: {0}".format(len(merged))
merged = merged[merged.gender != 'error']
print "Total after gender errors: {0}".format(len(merged))
merged = merged[merged.education != 'None']
print "Total after education: {0}".format(len(merged))
merged = merged[merged.jobs_completed != 'error']
print "Total after jobs_completed: {0}".format(len(merged))
merged = merged[merged.job_category != 'none']
print "Total after job_category: {0}".format(len(merged))

final_total = len(merged)

print "Percentage used for analysis: {0}%".format(float(final_total/55518) * 100)

# Calculate mean and standard deviation
merged['bill_rate'] = merged.bill_rate.astype('float')
sd = np.std(merged['bill_rate'])
mean = np.mean(merged['bill_rate'])
print("Bill rate average: {0}".format(mean))
print("Bill rate standard deviation: {0}".format(sd))

# Remove outliers in the dataset
merged = merged[merged['bill_rate'] > mean - 2 * sd]
merged = merged[merged['bill_rate'] < mean + 2 * sd]

print "Total after removing outliers: {0}".format(len(merged))

# Show the outliers IDs
'''
def showOutliers(data):
    outliers = data[data['bill_rate'] > mean + 2 * sd]
    all_ids = outliers.worker_id_x
    all_billrates = outliers.bill_rate
    for person in all_ids:
        print person
        
showOutliers(merged)
'''

# Calculate mean and standard deviation
sd = np.std(merged['bill_rate'])
mean = np.mean(merged['bill_rate'])
print("Bill rate average: {0}".format(mean))
print("Bill rate standard deviation: {0}".format(sd))

all_bill_rates = merged.bill_rate.astype('float')
merged['bill_rate'] = merged.bill_rate.astype('float')
merged['work_experience'] = merged.work_experience.astype('float')
all_work_experience = merged.work_experience
all_education_id = merged.education_id
all_age_range_id = merged.age_range_id
all_job_category_id = merged.job_category_id
all_genders = merged.gender
female_count = 0
male_count = 0

# Converting covariates to a matrix on a dichotomous scale
def make_dichotomous_matrix(id_value, covariate, final_matrix):
    for option in list(set(covariate)):
        if (id_value == option):
            final_matrix.append(1)
        else:
            final_matrix.append(0)
    return final_matrix
        
# Data formatting 
for gender in all_genders:
    if (gender == "male"):
        gender_array.append(0)
        male_count += 1
    elif (gender == "female"): # Female as the treatment group
        gender_array.append(1)
        female_count += 1

for rate in all_bill_rates:
    rate = round(float(rate), 2)
    bill_rate_array.append(rate)

for row in merged.itertuples():    
    job_category_matrix = []
    education_matrix = []
    age_range_id_matrix = []
    
    individual_covariate_matrix = []
    
    job_category_matrix = make_dichotomous_matrix(row.job_category_id, all_job_category_id, job_category_matrix)
    education_matrix = make_dichotomous_matrix(row.education_id, all_education_id, education_matrix)
    age_range_id_matrix = make_dichotomous_matrix(row.age_range_id, all_age_range_id, age_range_id_matrix)
    
    individual_covariate_matrix.extend(job_category_matrix)
    individual_covariate_matrix.extend(education_matrix)
    individual_covariate_matrix.extend(age_range_id_matrix)
    #individual_covariate_matrix.append(row.work_experience)
    all_covariates_array.append(individual_covariate_matrix)

'''
bill_rate_array = bill_rate_array[0:10000]
gender_array = gender_array[0:10000]
all_covariates_array = all_covariates_array[0:10000]
'''

# Check that arrays contain complete data
print "Bill rate array length: {0}".format(len(bill_rate_array))
print "Gender array length: {0}".format(len(gender_array))
print "Covariate array length: {0}".format(len(all_covariates_array))
print "Female count: {0}, Male count: {1}".format(female_count, male_count)


Total merged: 55518
Bill rate errors: 9588
Education errors: 12083
Work experience errors: 9631
Jobs completed errors: 9588
Jobs category errors: 446
Gender errors: 3457
Total after bill rate errors: 45930
Total after work experience: 45887
Total after gender errors: 42972
Total after education: 31853
Total after jobs_completed: 31853
Total after job_category: 31717
Percentage used for analysis: 0.0%
Bill rate average: 39.8293404168
Bill rate standard deviation: 37.9426751272
Total after removing outliers: 30662
Bill rate average: 35.2472865436
Bill rate standard deviation: 21.5708051154
Bill rate array length: 30662
Gender array length: 30662
Covariate array length: 30662
Female count: 14637, Male count: 16025


In [4]:
Y = np.array(bill_rate_array)
D = np.array(gender_array)
X = np.array(all_covariates_array)

#np.seterr(divide='ignore', invalid='ignore')

causal = CausalModel(Y, D, X)

print(causal.summary_stats)


Summary Statistics

                     Controls (N_c=16025)       Treated (N_t=14637)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y       36.669       21.996       33.690       20.987       -2.979

                     Controls (N_c=16025)       Treated (N_t=14637)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0        0.031        0.172        0.039        0.192        0.044
             X1        0.072        0.259        0.073        0.260        0.003
             X2        0.029        0.167        0.021        0.144       -0.048
             X3        0.200        0.400        0.236        0.425        0.088
             X4        0.006        0.075        0.006        0.080        0.010
      

In [5]:
# Matching each person in the dataset with a comparable person in the dataset

causal.est_via_matching()
print(causal.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.340      0.246     -5.449      0.000     -1.822     -0.858
           ATC     -1.558      0.253     -6.160      0.000     -2.054     -1.062
           ATT     -1.102      0.260     -4.242      0.000     -1.611     -0.593



In [6]:
causal.est_propensity_s()
print (causal.propensity)


Estimated Parameters of Propensity Score

                    Coef.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
     Intercept     -0.576      0.027    -21.405      0.000     -0.629     -0.523
           X18      1.331      0.049     27.424      0.000      1.236      1.426
           X21      0.494      0.026     18.958      0.000      0.443      0.545
           X19      1.381      0.092     14.948      0.000      1.200      1.562
            X8     -0.207      0.040     -5.236      0.000     -0.285     -0.130
           X10      0.299      0.040      7.569      0.000      0.222      0.376
            X3      0.236      0.033      7.219      0.000      0.172      0.300
            X0      0.163      0.071      2.302      0.021      0.024      0.302
           X22     -0.449      0.107     -4.200      0.000     -0.658     -0.239
            X9     -0.197      0.062     -3.149      0.002     -0.

In [7]:
# Trimming

causal.trim_s()
causal.cutoff
print (causal.summary_stats)


Summary Statistics

                     Controls (N_c=16005)       Treated (N_t=14479)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y       36.680       21.997       33.790       21.038       -2.889

                     Controls (N_c=16005)       Treated (N_t=14479)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0        0.029        0.169        0.030        0.170        0.002
             X1        0.072        0.259        0.074        0.262        0.006
             X2        0.029        0.167        0.021        0.145       -0.046
             X3        0.200        0.400        0.239        0.426        0.093
             X4        0.006        0.075        0.006        0.080        0.011
      

In [8]:
causal.est_via_ols()
causal.est_via_weighting()
causal.est_via_matching(bias_adj=True)
print(causal.estimates)


Treatment Effect Estimates: Weighting

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.438      0.239     -6.004      0.000     -1.907     -0.968

Treatment Effect Estimates: OLS

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.344      0.239     -5.633      0.000     -1.811     -0.876
           ATC     -1.548      0.280     -5.533      0.000     -2.097     -1.000
           ATT     -1.117      0.250     -4.478      0.000     -1.606     -0.628

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE     -1.332      0.246     -5.421      0.000     -1.813     -0.