In [78]:
import httplib2
import oauth2
import urllib3
import types
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gender_detector import GenderDetector 
import psycopg2, psycopg2.extras
from causalinference import CausalModel
from causalinference.utils import random_data
import httplib
import base64
import json # For Microsoft Face API
import urllib as urllib # For Microsoft Face API
import time 
import csv

class UpworkAnalyzer:
    
    def __init__(self):
        # Settings
        self.present_date = "10/2017" # This is the month in which the data was collected
        
        # Write a log
        self.log = open('log_upwork_data_analysis_2017_11_08_worldwide_allskills_trial.txt', 'a')
        self.log.write("We have started analyzing data!" + "\n")

        # Connect to the database 
        self.conn = psycopg2.connect("dbname=eureka01")
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
        
        # Get detailed_info from workers in our database
        self.cur.execute("SELECT detailed_info FROM upwork_worldwide_allskills_2017_10_21 LIMIT 100;")
        
        # Initialize arrays for Causal Analysis 
        self.user_count = 1
        self.bill_rate_array = []
        self.gender_array = []
        self.all_covariates_array = []
    
    def save_to_csv(self):
        with open('11_9_2017_upwork_analysis_trial.csv', 'w') as csvfile:
            fieldnames = ['user_count','worker_id', 'first_name', 'gender', 'bill_rate', 
                          'country', 'education', 'work_experience', 'job_category']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            
            for user in self.cur:
                print "We're on user " + str(self.user_count)
                user_count = self.user_count
                worker_id = user[0]["ciphertext"]
                first_name = user[0]["dev_first_name"]
                gender = "temporary"
                bill_rate = user[0]["dev_bill_rate"]
                country = user[0]["dev_country"]
                education = self.calculate_education(user)
                work_experience = self.calculate_work_experience(user)
                job_category = self.identify_job_category(user)
            
                print "Writing data for worker " + str(self.user_count)
                writer.writerow({'worker_id': worker_id, 'first_name': first_name, 'gender': gender, 
                             'bill_rate': bill_rate, 'country': country, 'education': education, 
                             'work_experience': work_experience, 'job_category': job_category})
                self.user_count += 1
        
    def identify_gender(self): # Returns gender as a string
        if (self.user_count%20 == 0): # Set timeout for rate limiting 
            time.sleep(65)

        "Recognizing Face Number " + str(self.user_count)
        raw_face_data = self.recognize_faces()

        if (raw_face_data == "error"):
            return raw_face_data

        elif (len(raw_face_data) == 0): 
            return "none"

        else:
            gender = raw_face_data[0]["faceAttributes"]["gender"]
            return str(gender)
        
    def recognize_faces(self):
        # Replace the subscription_key string value with your valid subscription key.
        subscription_key = '1fad9b94ae3d44fcaa17a6848f06d086' # This is key 1

        # Replace or verify the region.
        uri_base = 'westcentralus.api.cognitive.microsoft.com'

        # Request headers.
        headers = {
            'Content-Type': 'application/json',
            'Ocp-Apim-Subscription-Key': subscription_key,
        }

        # Request parameters.
        params = urllib.urlencode({
            'returnFaceId': 'true',
            'returnFaceLandmarks': 'false',
            'returnFaceAttributes': 'age,gender,headPose,smile,facialHair,glasses,emotion,hair,makeup,occlusion,accessories,blur,exposure,noise',
        })

        # The URL of a JPEG image to analyze.
        beg_body = "{'url':"
        end_body = "}"
        path = "https://raw.githubusercontent.com/efoongch/upwork-pay-by-gender/master/resized2_images_trial/" + str(self.user_count) + ".jpg"
        body = beg_body + '"' + path + '"' + end_body
        
        try:
            # Execute the REST API call and get the response.
            conn = httplib.HTTPSConnection('westcentralus.api.cognitive.microsoft.com')
            conn.request("POST", "/face/v1.0/detect?%s" % params, body, headers)
            response = conn.getresponse()
            data = response.read()
            parsed = json.loads(data)
            return parsed
            
        except Exception as e:
            print("[Errno {0}] {1}".format(e.errno, e.strerror))
            return "error"
            

        '''
        # 'data' contains the JSON data. The following formats the JSON data for display.
        parsed = json.loads(data)
        print ("Response:")
        print (json.dumps(parsed, sort_keys=True, indent=2))
        conn.close()
        '''
    
    def calculate_work_experience(self, user):
        total_experience = 0
        try: 
            work_experience_list = user[0]["experiences"]["experience"]
            
        except:
            return "none"
        
        if (type(work_experience_list) is list):
            for experience in work_experience_list:
                start_date = experience["exp_from"]
                end_date = experience["exp_to"]
                total_experience += self.calculate_months(start_date, end_date)
            return total_experience 
            
        elif (type(work_experience_list) is dict):
            start_date = work_experience_list["exp_from"]
            end_date = work_experience_list["exp_to"]
            total_experience = self.calculate_months(start_date, end_date)
            return total_experience 
        
        else: 
            return "none"
    
    def calculate_months(self, start_date, end_date):
        number_of_months = 0
        present_month = int(self.present_date[0:2])
        present_year = int(self.present_date[5:7])
        start_month = int(start_date[0:2])
        start_year = int(start_date[5:7])
        
        if (end_date == "Present"):
            if (start_year == present_year):
                number_of_months = present_month - start_month # Don't count the first month so users who changed jobs immediately don't get exp. advantage
                return number_of_months
                
            else:
                if(start_year < present_year):
                    number_of_months += 12 - start_month # Don't count the first month so users who changed jobs immediately don't get exp. advantage
                    start_year += 1
                    start_month = 1
                    while (start_year < present_year):
                        number_of_months += 13 - start_month 
                        start_year += 1
                        start_month = 1
                number_of_months += present_month
                return number_of_months
                
        else:
            end_month = int(end_date[0:2])
            end_year = int(end_date[5:7])
            
            if(start_year < end_year):
                number_of_months += 12 - start_month # Don't count the first month so users who changed jobs immediately don't get exp. advantage
                start_year += 1
                start_month = 1
                while (start_year < end_year):
                    number_of_months += 13 - start_month 
                    start_year += 1
                    start_month = 1
            number_of_months += end_month
            return number_of_months
    
    def calculate_education(self, user):
        try: 
            ed_history = user[0]["education"]["institution"]
            if(ed_history == ""):
                print "none"
                return "none"
            elif(type(ed_history) == dict):
                print "This is the degree {0}".format(ed_history["ed_degree"])
                self.check_highest_education(ed_history["ed_degree"])
                return "default"
            elif(type(ed_history) == list):
                for ed_experience in user[0]["education"]["institution"]:
                    if (ed_experience["ed_degree"] == ""):
                        print "This is the degree: none"
                        self.check_highest_education("none")
                        return "none"
                    print "This is the degree {0}".format(ed_experience["ed_degree"])
                    self.check_highest_education(ed_experience["ed_degree"])
                return "default"
            
        except:
            return "none"
    
    def check_highest_education(self, degree):
        # Checks for the highest level of education attained by user
        
        # Bachelors
        if (re.search('bachelor', degree.lower())):
            print "BACHELOR'S DEGREE"
            return "BACHELOR'S DEGREE"
        else:
            print "OTHER"
            return "OTHER"
    
    def identify_job_category(self, user):
        all_job_categories = user[0]["dev_job_categories_v2"]["dev_job_categories_v"]
        try:
            job_category = all_job_categories[0]["groups"]["group"]["name"] # Returns general job category
        except:
            job_category = all_job_categories["groups"]["group"]["name"]
        return job_category
    
    def transform_country_id(self):
        return 
    
    def transform_job_category(self):
        return 
    
    def make_covariate_matrix(self):
        return 
    
    def causal_analysis(self):
        return 

In [79]:
myObject = UpworkAnalyzer()
myObject.save_to_csv()

We're on user 1
This is the degree Bachelor of Arts (B.A.)
BACHELOR'S DEGREE
Writing data for worker 1
We're on user 2
This is the degree Engineer's degree
OTHER
This is the degree: none
OTHER
Writing data for worker 2
We're on user 3
This is the degree Bachelor of Engineering (B.Eng.)
BACHELOR'S DEGREE
This is the degree High School
OTHER
Writing data for worker 3
We're on user 4
Writing data for worker 4
We're on user 5
This is the degree Bachelor of Fine Arts (B.F.A.)
BACHELOR'S DEGREE
Writing data for worker 5
We're on user 6
This is the degree Bachelors
BACHELOR'S DEGREE
Writing data for worker 6
We're on user 7
This is the degree Bachelor of Engineering (B.Eng.)
BACHELOR'S DEGREE
Writing data for worker 7
We're on user 8
This is the degree: none
OTHER
Writing data for worker 8
We're on user 9
This is the degree Bachelor of Business Administration (B.B.A.)
BACHELOR'S DEGREE
Writing data for worker 9
We're on user 10
This is the degree 
OTHER
Writing data for worker 10
We're on use

KeyError: 'dev_job_categories_v2'

In [7]:
print "Hello world"

Hello world


In [29]:
for user in cur:
    print "Now we're on user number {0}".format(user_count)
    try:
        # Outcome - Bill Rate (Y)

        # Treatment - Gender (D)
        
        # Matrix of Covariates - Work Experience, Education, Country, Job Category (X)
        
        worker_id = user[0]["ciphertext"]
        print worker_id
        first_name = user[0]["dev_first_name"]
        print first_name
        
        gender_by_detector = "none"
        user_count += 1
        

    except:
        print "This one has no detailed info: {0}".format(user_count)
        no_detailed_info_count += 1

print bill_rate_array 

Now we're on user number 0
~019ef3758cf73d289c
Valentina
Now we're on user number 1
~014c24de20d2446d42
Marwen
Now we're on user number 2
~01f5a19ca9880b867f
Bruno
Now we're on user number 3
~0191a24ec4aadf154b
Matt
Now we're on user number 4
~019309dfcb2b02ed10
Orkun
Now we're on user number 5
~01dc5d676329574590
Ganang
Now we're on user number 6
~016e70f671e01d08f5
Yash
Now we're on user number 7
~0193675f0ca4d4c14a
Alex
Now we're on user number 8
~0125bee9e4f17ef78e
Rayhan
Now we're on user number 9
~01326a7a95a824e410
Minas


In [2]:
# Testing CausalModel 

Y = np.array([13.00, 15.00, 16.00, 14.00, 14.50, 16.50, 18.50])
D = np.array([0, 1, 0, 1, 1, 1, 0])
X = np.array([[1, 1], [2, 2], 
              [2, 1], [1, 1], 
              [2, 1], [1, 2],[2, 2]])
causal = CausalModel(Y, D, X)
causal.est_propensity()

print (causal.propensity)

ValueError: Too few control units: N_c < K+1