# Data Analyses: Outliers

In [6]:
import httplib2
import urllib3
import types
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import base64
import time 
import csv
import datetime 

# Read data from csv files and reformat
all_data_file_name = './csv_files/11_21_2017_upwork_analysis_worldwide_allskills.csv' # Filename for all data
gender_data_file_name = './csv_files/11_20_2017_upwork_gender_analysis_worldwide_allskills.csv' # Filename for gender data
df = pd.read_csv(all_data_file_name)
df_gender = pd.read_csv(gender_data_file_name)

# Merge the files, then remove rows with an error, no gender identified, or ambiguous education
merged = df.merge(df_gender, on='user_count')
merged = merged[merged.bill_rate != 'error']
merged = merged[merged.gender != 'unidentified']
merged = merged[merged.gender != 'error']
merged = merged[merged.education != 'None']
merged = merged[merged.job_category != 'none']
merged = merged[merged.country == 'United States'] # Look only at data in the United States, which is the largest dataset

all_bill_rates = merged.bill_rate.astype('float')
merged['bill_rate'] = merged.bill_rate.astype('float')
merged['work_experience'] = merged.work_experience.astype('float')
all_work_experience = merged.work_experience
all_education_id = merged.education_id
all_job_category_id = merged.job_category_id
all_country_id = merged.country_id
all_genders = merged.gender
female_count = 0
male_count = 0

def showOutliers(data):
    data['bill_rate'] = data.bill_rate.astype('float')
    sd = np.std(data['bill_rate'])
    mean = np.mean(data['bill_rate'])
    coeff_of_variation = float(sd/mean)
    
    print("Bill rate average: {0}".format(mean))
    print("Bill rate standard deviation: {0}".format(sd))
    print("Coefficient of variation: {0}".format(coeff_of_variation))

    totalOutliers = 0
    outliers = data[data['bill_rate'] > mean + 2 * sd]
    all_ids = outliers.worker_id_x # To show all worker ids, use this variable
    all_billrates = outliers.bill_rate
    for person in all_ids:
        totalOutliers += 1
    print totalOutliers

def returnOutliers(data):
    data['bill_rate'] = data.bill_rate.astype('float')
    sd = np.std(data['bill_rate'])
    mean = np.mean(data['bill_rate'])
    outliers = data[data['bill_rate'] > mean + 2 * sd]
    return outliers

print "Total outliers (overall average): "
showOutliers(merged)
    
# Find outliers in each of the job categories
categories = ['Accounting & Consulting', 'Admin Support', 'Customer Service', 'Data Science & Analytics',
                 'Design & Creative', 'Engineering & Architecture', 'IT & Networking', 'Legal', 
                 'Sales & Marketing', 'Translation', 'Web, Mobile & Software Dev', 'Writing']

def showJobOutliers(data, category):
    outliers_data = data[data.job_category == category]
    showOutliers(outliers_data)
    
total_job_outliers = 0

for category in categories:
    print "Total outliers for {0}".format(category)
    showJobOutliers(merged, category)

print total_job_outliers

# Calculate mean and standard deviation
sd = np.std(merged['bill_rate'])
mean = np.mean(merged['bill_rate'])
print("Bill rate average: {0}".format(mean))
print("Bill rate standard deviation: {0}".format(sd))

outliers = returnOutliers(merged)
non_outliers = merged

# Make some new dataframes

def calculate_percentage(data, category_of_data, value):
    numerator = 0
    denominator = 0
    # Something wrong with the row reference here 
    for row in data:
        denominator += 1
    
    return float(numerator/denominator) * 100

d_by_jobs = {}
for job in categories:
    d_by_jobs[job] = []
    
print d_by_jobs

# Make some crosstabs tables
# non_outliers.plot(kind="bar", figsize=(8,8))


Total outliers (overall average): 
Bill rate average: 40.0665808824
Bill rate standard deviation: 40.2265996757
Coefficient of variation: 1.00399382203
46
Total outliers for Accounting & Consulting
Bill rate average: 56.2361842105
Bill rate standard deviation: 39.0737499031
Coefficient of variation: 0.694815099062
4
Total outliers for Admin Support
Bill rate average: 25.5606293706
Bill rate standard deviation: 17.6162442207
Coefficient of variation: 0.689194462517
5
Total outliers for Customer Service

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Bill rate average: 25.42
Bill rate standard deviation: 15.6134477294
Coefficient of variation: 0.614219029482
1
Total outliers for Data Science & Analytics
Bill rate average: 42.81
Bill rate standard deviation: 25.0377780745
Coefficient of variation: 0.584858165721
3
Total outliers for Design & Creative
Bill rate average: 37.8717270195
Bill rate standard deviation: 27.0938912278
Coefficient of variation: 0.71541208601
16
Total outliers for Engineering & Architecture
Bill rate average: 49.5532
Bill rate standard deviation: 30.2437749919
Coefficient of variation: 0.610329403386
1
Total outliers for IT & Networking
Bill rate average: 45.5238095238
Bill rate standard deviation: 33.440993232
Coefficient of variation: 0.734582487314
2
Total outliers for Legal
Bill rate average: 104.0
Bill rate standard deviation: 71.1563824912
Coefficient of variation: 0.684195985492
1
Total outliers for Sales & Marketing
Bill rate average: 54.9195744681
Bill rate standard deviation: 38.294015307
Coefficien