# Data Analyses: Outliers

In [23]:
import httplib2
import urllib3
import types
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import base64
import time 
import csv
import datetime 

# Read data from csv files and reformat
all_data_file_name = './csv_files/10_21_2017_upwork_analysis_worldwide_allskills.csv' # Filename for all data
gender_data_file_name = './csv_files/11_20_2017_upwork_gender_analysis_worldwide_allskills.csv' # Filename for gender data
df = pd.read_csv(all_data_file_name)
df_gender = pd.read_csv(gender_data_file_name)

# Join the two files on user_count, then keep only rows that:
#   - have a bill rate other than the 'error' marker,
#   - have a gender other than 'unidentified' / 'error',
#   - have an education value other than 'None',
#   - have a job category other than 'none',
#   - are located in the United States (the largest dataset).
merged = df.merge(df_gender, on='user_count')
keep_row = (
    (merged.bill_rate != 'error')
    & ~merged.gender.isin(['unidentified', 'error'])
    & (merged.education != 'None')
    & (merged.job_category != 'none')
    & (merged.country == 'United States')
)
# Take an explicit copy so the column assignments below write to an
# independent frame instead of a slice of the merge result
# (avoids pandas' SettingWithCopyWarning).
merged = merged[keep_row].copy()

# Format bill rate and work experience properly
merged['bill_rate'] = merged.bill_rate.astype('float')
merged['work_experience'] = merged.work_experience.astype('float')

# np.std uses ddof=0 by default, i.e. the population standard deviation
mean = np.mean(merged['bill_rate'])
sd = np.std(merged['bill_rate'])

print("Mean: {0}".format(mean))
print("Standard deviation: {0}".format(sd))

# Flag bill rates at or above two standard deviations over the mean.
# NOTE(review): showOutliers/returnOutliers use a strict `>` for the same
# threshold; the two only differ for a rate exactly equal to mean + 2*sd.
merged['outlier'] = merged['bill_rate'] >= mean + 2*sd

# Last expression of the cell: the notebook displays the counts per
# (outlier, education) group.
merged.groupby(['outlier', 'education']).size()

Mean: 40.0665808824
Standard deviation: 40.2265996757


outlier  education   
False    Associate       100
         Bachelor        800
         Doctorate        29
         High School      31
         Master          321
         Professional     33
True     Associate         1
         Bachelor         25
         Doctorate         2
         High School       1
         Master           14
         Professional      3
dtype: int64

In [None]:
def showOutliers(data, num_sd=2):
    """Print bill-rate summary statistics and the number of high outliers.

    Prints the mean, the standard deviation (np.std, so ddof=0) and the
    coefficient of variation (sd / mean) of ``data['bill_rate']``, followed
    by the number of rows whose bill rate is strictly greater than
    ``mean + num_sd * sd``.

    Args:
        data: DataFrame with a ``bill_rate`` column convertible to float.
            The frame itself is not modified.
        num_sd: Number of standard deviations above the mean a bill rate
            must exceed to count as an outlier. Defaults to 2.

    Returns:
        None. All results are printed.
    """
    # Convert on a local Series rather than assigning back into `data`, so
    # callers that pass a filtered slice neither see their frame change nor
    # trigger SettingWithCopyWarning.
    bill_rate = data['bill_rate'].astype('float')
    sd = np.std(bill_rate)
    mean = np.mean(bill_rate)
    coeff_of_variation = float(sd/mean)

    print("Bill rate average: {0}".format(mean))
    print("Bill rate standard deviation: {0}".format(sd))
    print("Coefficient of variation: {0}".format(coeff_of_variation))

    # Count rows above the threshold directly from the boolean mask
    totalOutliers = int((bill_rate > mean + num_sd * sd).sum())
    print(totalOutliers)

def returnOutliers(data, num_sd=2):
    """Return the rows of ``data`` whose bill rate is a high outlier.

    A row is an outlier when its bill rate is strictly greater than
    ``mean + num_sd * sd``, where the mean and the standard deviation
    (np.std, so ddof=0) are computed over ``data['bill_rate']``.

    Args:
        data: DataFrame with a ``bill_rate`` column convertible to float.
            The frame itself is not modified.
        num_sd: Number of standard deviations above the mean a bill rate
            must exceed to count as an outlier. Defaults to 2.

    Returns:
        A new DataFrame holding the outlier rows, with ``bill_rate`` as float.
    """
    # assign() returns a new frame, so the float conversion does not leak
    # back into the caller's DataFrame.
    converted = data.assign(bill_rate=data['bill_rate'].astype('float'))
    rates = converted['bill_rate']
    threshold = np.mean(rates) + num_sd * np.std(rates)
    return converted[rates > threshold]

# Summary statistics and outlier count over the whole (filtered) dataset
print("Total outliers (overall average): ")
showOutliers(merged)

# Find outliers in each of the job categories
categories = ['Accounting & Consulting', 'Admin Support', 'Customer Service', 'Data Science & Analytics',
                 'Design & Creative', 'Engineering & Architecture', 'IT & Networking', 'Legal', 
                 'Sales & Marketing', 'Translation', 'Web, Mobile & Software Dev', 'Writing']

def showJobOutliers(data, category):
    """Print bill-rate statistics and the outlier count for one job category.

    Args:
        data: DataFrame with ``job_category`` and ``bill_rate`` columns.
        category: Job category value to select rows by.

    Returns:
        None. Results are printed by showOutliers.
    """
    # Copy the filtered rows: showOutliers is handed an independent frame,
    # so any column assignment it performs cannot hit a slice of `data`
    # (which would raise pandas' SettingWithCopyWarning).
    outliers_data = data[data.job_category == category].copy()
    showOutliers(outliers_data)
    
# Report outliers per job category and keep a running total of them.
# Each category is judged against its own mean and standard deviation.
total_job_outliers = 0

for category in categories:
    print("Total outliers for {0}".format(category))
    showJobOutliers(merged, category)
    # showJobOutliers only prints, so count this category's outliers
    # separately to accumulate the total (previously the total stayed 0).
    category_rows = merged[merged.job_category == category].copy()
    total_job_outliers += len(returnOutliers(category_rows))

print(total_job_outliers)

# Calculate mean and standard deviation (np.std uses ddof=0)
sd = np.std(merged['bill_rate'])
mean = np.mean(merged['bill_rate'])
print("Bill rate average: {0}".format(mean))
print("Bill rate standard deviation: {0}".format(sd))

# Rows more than two standard deviations above the mean
outliers = returnOutliers(merged)
# Everything else: the complement of the rule used by returnOutliers
# (strictly greater than mean + 2 * sd). Previously this was just an alias
# for `merged`, so it still contained the outliers.
non_outliers = merged[~(merged['bill_rate'] > mean + 2 * sd)]

# Make some new dataframes

def calculate_percentage(data, category_of_data, value):
    """Return the percentage of rows in ``data`` whose column equals ``value``.

    Args:
        data: DataFrame to inspect.
        category_of_data: Name of the column to compare.
        value: Value a row must have in that column to be counted.

    Returns:
        Float between 0 and 100. Returns 0.0 when ``data`` has no rows, so
        an empty selection does not raise ZeroDivisionError.
    """
    # len() counts rows; iterating a DataFrame directly yields column labels
    denominator = len(data)
    if denominator == 0:
        return 0.0

    numerator = (data[category_of_data] == value).sum()
    # Convert before dividing so the result is not truncated under
    # Python 2 integer division.
    return float(numerator) / denominator * 100

# One empty list per job category, to be filled in later
d_by_jobs = {job: [] for job in categories}

print(d_by_jobs)

# Make some crosstabs tables
# non_outliers.plot(kind="bar", figsize=(8,8))