# In [None]:  (Jupyter notebook cell marker left over from the export)
import httplib2
import oauth2
import urllib3
import types
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gender_detector import GenderDetector 
import psycopg2, psycopg2.extras
from causalinference import CausalModel
from causalinference.utils import random_data
import httplib
import base64
import json # For Microsoft Face API
import urllib as urllib # For Microsoft Face API
import time 
import csv
import datetime 


class UpworkDataFormatter:
    """Classify the education history of Upwork workers stored in Postgres.

    Each database row is expected to expose the worker's profile as
    ``row[0]`` with the education entries under
    ``row[0]["education"]["institution"]`` (an empty string, a single dict,
    or a list of dicts carrying ``ed_degree``, ``ed_school`` and
    ``ed_comment``).

    NOTE: ``show_education`` and ``save_to_csv`` both iterate ``self.cur``.
    The query is executed once in ``__init__``, so whichever method runs
    first consumes the result set.
    """

    # Degree patterns, compiled once.  Abbreviations are matched
    # case-sensitively against the raw degree text; spelled-out forms are
    # matched against the lower-cased text.
    _PROFESSIONAL_LOWER = re.compile(
        r'd\.c|d\.c\.m|d\.d\.s|d\.m\.d|ll\.b|ll\.m|l\.l\.m|l\.l\.b|j\.d\.|m\.d\.|o\.d\.|d\.o\.|pharm\.d'
        r'|d\.p\.m|d\.p|pod\.d|m\.div|m\.h\.l|b\.d|ordination|d\.v\.m| law')
    _PROFESSIONAL_CASED = re.compile(
        r'DC|DCM|DDS|DMD|LLB|LLM|LLB|JD|MD|OD|DO|PharmD|Ed\.D|EdD|CFA'
        r'|DPM|DP|PodD|MDiv|MHL|BD|DVM')
    _DOCTORATE_CASED = re.compile(r'Ph\.D|PhD')
    _DOCTORATE_LOWER = re.compile(r'doctor|licentiate')
    _MASTER_CASED = re.compile(
        r'M\.B\.A|MBA|M\.A|M\.S|M\.Sc|M\.E|MA|MS|MSc|ME|M\.A\.Ed|MAEd|MST|M\.S\.T|MEd|M\.Ed|MPA|M\.P\.A'
        r'|MFA|M\.F\.A|MSC|MCS')
    _MASTER_LOWER = re.compile(r"master|engineer's degree")
    _BACHELOR_CASED = re.compile(r'B\.A|B\.S|B\.Sc|BA|BS|BSc|BFA|AB|A\.B|BPharm|B\.Pharm')
    _BACHELOR_LOWER = re.compile(r'bachelor|bpharm|bachlor|batchelor')
    _ASSOCIATE_CASED = re.compile(r'AA|AS|A\.A|A\.S|A\.A\.S|AAS')
    _ASSOCIATE_LOWER = re.compile(r'associate|foundation')
    _CERTIFICATE_LOWER = re.compile(r'certificate')

    # Maps the fine-grained result of check_education() onto the coarse
    # category stored per education entry; anything else becomes "Other".
    _LEVEL_TO_CATEGORY = {
        "High School": "High School",
        "Less Than High School": "Less Than High School",
        "Professional": "Bachelor or Higher",
        "Doctorate": "Bachelor or Higher",
        "Master": "Bachelor or Higher",
        "Bachelor": "Bachelor or Higher",
        "Associate": "Some College or Associate",
        "Certificate": "Some College or Associate",
        "Some College": "Some College or Associate",
    }

    # Coarse categories ordered from lowest to highest attainment; used by
    # calculate_education() to pick a worker's single best category.
    _EDUCATION_RANK = (
        "Other",
        "Less Than High School",
        "High School",
        "Some College or Associate",
        "Bachelor or Higher",
    )

    def __init__(self):
        """Open the log file, connect to the database and run the query."""
        # Settings
        self.present_date = "12/2017" # This is the month in which the data was collected
        self.edu_data_file_name = './csv_files/debug_2017_12_12_upwork_analysis_unitedstates_allskills.csv' # Filename for all data
        self.edu_data_log_file_name = './log_files/debug_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'

        # Write a log
        self.log = open(self.edu_data_log_file_name, 'a')
        self.log.write("We have started analyzing data!" + "\n")
        self.log.flush()

        # Connect to the database
        self.conn = psycopg2.connect("dbname=eureka01")
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

        # Get detailed_info from workers in our database
        self.cur.execute("SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;")

        # Initialize arrays for Causal Analysis
        self.user_count = 1
        self.error_education_id = 0
        self.error_job_category_id = 0
        self.no_edu_history_count = 0
        self.schools_no_degree = []
        self.ambiguous_edu_count = 0
        self.list_of_ambiguous_degrees = []
        self.other_info = []
        self.diploma_info = []
        self.ambiguous_info = []
        self.none_specified_info = []
        self.empty_info = []
        self.high_school_degrees = []

        # Category -> integer id lookup used by save_to_csv(); it was read
        # there but never created, which made every row fail.
        self.education_id_list = []

        # (csv path, column) -> set of normalized school names, filled lazily.
        self._school_name_cache = {}

    def close(self):
        """Release the log file, the cursor and the database connection."""
        for resource_name in ("log", "cur", "conn"):
            resource = getattr(self, resource_name, None)
            if resource is not None:
                resource.close()

    def show_education(self):
        """Print the education classification of every remaining row."""
        for user in self.cur:
            try:
                print(self.return_edu_experience(user))
            except Exception as error:
                print("We ran into an error")
                print(error)
                continue

    def save_to_csv(self):
        """Write one CSV row per remaining database row.

        Columns are ``user_count``, ``degree`` (raw degree text or list of
        texts), ``education`` (single best category) and ``education_id``
        (index of that category in ``self.education_id_list``).  A row that
        cannot be processed is written with ``"error"`` placeholders so the
        numbering stays aligned with the database rows.
        """
        with open(self.edu_data_file_name, 'w') as csvfile:
            fieldnames = ['user_count', 'degree', 'education', 'education_id']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for user in self.cur:
                try:
                    degree = self.show_degree(user)
                    education = self.calculate_education(user)
                    if education == "No history":
                        self.no_edu_history_count += 1
                    education_id = self.encode_category(education, self.education_id_list)

                    writer.writerow({'user_count': self.user_count, 'degree': degree,
                                     'education': education, 'education_id': education_id})
                except Exception as error:
                    print("Ran into some error at user {0}".format(self.user_count))
                    print(error)

                    writer.writerow({'user_count': self.user_count, 'degree': "error",
                                     'education': "error",
                                     'education_id': self.error_education_id})

                print("Finished writing data for {0}".format(self.user_count))
                self.user_count += 1

    def calculate_education(self, user):
        """Return the single highest education category of ``user``.

        This method was called by save_to_csv() but was missing from the
        class.  NOTE(review): the ordering in ``_EDUCATION_RANK`` is an
        assumption about the intended reduction - confirm it against the
        analysis this CSV feeds.

        Returns ``"No history"`` when the worker lists no education,
        ``"Ambiguous"`` when the history could not be parsed, otherwise the
        highest-ranked category among the worker's entries.
        """
        history = self.return_edu_experience(user)
        if history == "No History":
            return "No history"
        if not isinstance(history, list) or not history:
            return "Ambiguous"
        return max(history, key=self._EDUCATION_RANK.index)

    def return_edu_experience(self, user):
        """Classify every education entry of ``user``.

        Returns ``"No History"`` when the worker lists no education, a list
        with one category per entry (``"High School"``,
        ``"Less Than High School"``, ``"Bachelor or Higher"``,
        ``"Some College or Associate"`` or ``"Other"``) otherwise, and
        ``"Ambiguous"`` when the record does not have the expected shape.
        """
        try:
            ed_history = user[0]["education"]["institution"]

            if not ed_history:
                return "No History"

            # A single entry arrives as a bare dict; treat it as a 1-item list.
            if isinstance(ed_history, dict):
                ed_history = [ed_history]
            elif not isinstance(ed_history, list):
                return "Ambiguous"

            return [self._classify_entry(item) for item in ed_history]

        except Exception as error:
            print("We ran into an error")
            print(error)
            return "Ambiguous"

    def _classify_entry(self, item):
        """Map one education entry (dict) onto its coarse category."""
        # check_education() is comparatively expensive (it may consult the
        # school lists), so it is evaluated exactly once per entry.
        level = self.check_education(item["ed_degree"], item["ed_school"], item["ed_comment"])
        return self._LEVEL_TO_CATEGORY.get(level, "Other")

    def check_education(self, degree, school, comment): # Do a quick check that the user has a high school degree
        """Return the fine-grained education level of a single entry.

        High-school evidence is checked first; everything else is delegated
        to check_specific_degree().  Returns ``"Error"`` if anything raises.
        """
        try:
            degree = degree or ""
            school = school or ""
            comment = comment or ""

            degree_lower = degree.lower()
            hs_degree_mentioned = "high school" in degree_lower
            hs_word_mentioned = "high school" in school.lower()
            diploma_degree_mentioned = "diploma" in degree_lower
            graduated_mentioned = "graduated" in comment.lower()

            if hs_degree_mentioned or (hs_word_mentioned and (graduated_mentioned or diploma_degree_mentioned)):
                return "High School"
            if hs_word_mentioned and degree == "":
                return "Less Than High School"

            if self.check_school_by_name("high_school", school): # Check if the school name is in the database
                print("We found the high school!")
                if diploma_degree_mentioned or graduated_mentioned:
                    return "High School"
                if degree == "":
                    return "Less Than High School"
                # A listed high school with some other degree text: fall
                # through and let the degree text decide.
            else:
                print("We did not find the high school")

            print("We should be checking the specific degree now!! ")
            return self.check_specific_degree(degree, school, comment)
        except Exception as error:
            print("Something went wrong")
            print(error)
            return "Error"

    def check_specific_degree(self, degree, school, comment): # Do a quick check that the user has a bachelor or associate degree
        """Return the degree level named by ``degree``.

        Levels are tested from highest to lowest: ``"Professional"``,
        ``"Doctorate"``, ``"Master"``, ``"Bachelor"``, ``"Associate"``,
        ``"Certificate"``.  When the degree text is inconclusive the school
        name is looked up in the college list (``"Some College"``), else
        ``"Ambiguous"``.  Returns ``"Error"`` if anything raises.
        ``comment`` is accepted for signature parity and is not used.
        """
        print("We are checking specific degrees")
        try:
            degree_lower = degree.lower()

            # Professional degree
            if self._PROFESSIONAL_LOWER.search(degree_lower) or self._PROFESSIONAL_CASED.search(degree):
                return "Professional"

            # PhD (after professional degrees, in case MDs are also called doctors)
            if self._DOCTORATE_CASED.search(degree) or self._DOCTORATE_LOWER.search(degree_lower):
                return "Doctorate"

            # Masters
            if self._MASTER_CASED.search(degree) or self._MASTER_LOWER.search(degree_lower):
                return "Master"

            # Bachelors
            if self._BACHELOR_CASED.search(degree) or self._BACHELOR_LOWER.search(degree_lower):
                return "Bachelor"

            # Associate's Degree
            if self._ASSOCIATE_CASED.search(degree) or self._ASSOCIATE_LOWER.search(degree_lower):
                return "Associate"

            # Certificate
            if self._CERTIFICATE_LOWER.search(degree_lower):
                return "Certificate"

            print("We should be checking colleges now")

            # Check for a specific college name
            if self.check_school_by_name("college", school):
                return "Some College"

            # Everything else
            return "Ambiguous"

        except Exception as error:
            print("We ran into an error specifying the degree")
            print(error)
            return "Error"

    def check_school_by_name(self, dataset, name): # Checks if school name is in a dataset of schools
        """Return True if ``name`` appears in the school list for ``dataset``.

        ``dataset`` is ``"high_school"`` or ``"college"``; any other value
        yields False.  Names are compared lower-cased with punctuation and
        spaces removed.  The CSV files are resolved relative to the current
        working directory; errors opening them propagate to the caller.
        """
        name = self.remove_punctuation(name.lower())
        private_schools = 'high_schools_in_us.csv'
        public_schools = 'high_schools_in_us.csv' # Haven't been able to find a suitable dataset
        colleges = 'colleges_in_us.csv'

        if dataset == "high_school": # Check against list of private and public high schools
            return (name in self._load_school_names(private_schools, "pinst")
                    or name in self._load_school_names(public_schools, "pinst"))

        if dataset == "college": # Check against list of colleges
            print("We are checking the college name now!")
            if name in self._load_school_names(colleges, "INSTNM"):
                print("Found a matching college!")
                print(name)
                return True
            return False

        return False # Return false if all else fails

    def _load_school_names(self, path, column):
        """Return the cached set of normalized names in ``column`` of ``path``."""
        # getattr() keeps this usable on instances built without __init__.
        cache = getattr(self, "_school_name_cache", None)
        if cache is None:
            cache = self._school_name_cache = {}

        key = (path, column)
        if key not in cache:
            names = set()
            with open(path, 'r') as f:
                for row in csv.DictReader(f):
                    value = row[column]
                    if value is not None: # short rows yield None for missing cells
                        names.add(self.remove_punctuation(value.lower()))
            cache[key] = names
        return cache[key]

    def remove_punctuation(self, word): # Removes all punctuation and blank spaces from a string
        """Return ``word`` without punctuation and without space characters."""
        without_punctuation = re.sub(r'[^\w\s]', '', word)
        return without_punctuation.replace(' ', '')

    def show_degree(self, user):
        """Return the raw degree text(s) of ``user``.

        Returns ``"No edu listed"`` for an empty history, the degree string
        for a single entry, a list of degree strings (``"none"`` standing in
        for empty ones) for several entries, and an ``"Error: ..."`` string
        when the record cannot be read.
        """
        try:
            ed_history = user[0]["education"]["institution"]
            if ed_history == "":
                return "No edu listed"
            if isinstance(ed_history, dict):
                return ed_history["ed_degree"]
            if isinstance(ed_history, list):
                return [entry["ed_degree"] if entry["ed_degree"] != "" else "none"
                        for entry in ed_history]
            return "Error: Some other error"

        except Exception:
            return "Error: No education"

    def encode_category(self, item, id_list):
        """Return the index of ``item`` in ``id_list``, appending it if new."""
        if item not in id_list:
            id_list.append(item)
        return id_list.index(item)
                    
                    
# Script entry point.  Constructing the formatter opens the log file,
# connects to the database and runs the SELECT; show_education() then walks
# every fetched row through the education classifier.
myObject = UpworkDataFormatter()
myObject.show_education()