In [1]:
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw payroll CSV. The tokens listed here are parsed as NaN in
# addition to the missing-value markers pandas recognizes by default.
missing_values = [
    "n/a",
    "na",
    "--",
    "-",
    "NA",
]
payroll = pd.read_csv(
    filepath_or_buffer="city_payroll.csv",
    na_values=missing_values,
)

In [None]:
# Preview the first five rows of the raw data
payroll.head(n=5)

In [None]:
# (rows, columns) of the original dataset; the row count is large, which is
# why the following cells work on a sample instead of the full frame
payroll.shape

In [None]:
# Work on a 1-in-20 random sample to keep the data manageable.
# A fixed random_state makes the sample (and everything derived from it,
# including the CSV saved below) identical on every Restart & Run All.
payroll_sample = payroll.sample(n=len(payroll) // 20, random_state=42)

In [None]:
# Report the row counts before and after sampling
print(f"Original Dataset Observation Count: {len(payroll)}")
print(f"Sampled Dataset Observation Count: {len(payroll_sample)}")

In [None]:
# Persist the sampled rows to disk. With to_csv's defaults the row index is
# written as well, as the first (unnamed) column.
payroll_sample.to_csv(path_or_buf="city_payroll_sample.csv")

In [None]:
# How many columns (features) does the sample have?
len(payroll_sample.columns)

In [None]:
# List every column name, one per line, under a header
print("Feature Names: \n\n{}".format("\n".join(payroll_sample.columns)))

In [None]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be passed to print(): doing so emits the report *before* the
# header and then prints a stray "None". Print the header first, then call it.
print("Datatypes: \n")
payroll_sample.info()

In [None]:
# Parse "Agency Start Date" as datetime64 (unparseable entries become NaT)
# and make sure "Agency Name" holds plain Python strings.
payroll_sample["Agency Start Date"] = pd.to_datetime(
    payroll_sample["Agency Start Date"], errors="coerce"
)

# na_action="ignore" leaves missing names as NaN. A blanket .astype(str) would
# turn them into the literal string "nan" and hide them from the
# missing-value checks in the following cells.
payroll_sample["Agency Name"] = payroll_sample["Agency Name"].map(str, na_action="ignore")

# Show the resulting dtypes
payroll_sample.dtypes

In [None]:
# Is there at least one missing cell anywhere in the sample?
payroll_sample.isna().to_numpy().any()

In [None]:
# Grand total of missing cells: per-column counts, then summed across columns
payroll_sample.isna().sum(axis=0).sum()

In [None]:
# Missing-cell count for each feature (column)
payroll_sample.isna().sum(axis=0)

In [None]:
# Missing-value summary per feature: absolute count ("Values") and share of
# rows in percent ("Percent"), both ordered from most to least missing.
values = payroll_sample.isna().sum().sort_values(ascending=False)
percent = payroll_sample.isna().mean().sort_values(ascending=False) * 100
missing_data = pd.concat(
    [values, percent],
    axis=1,
    keys=["Values", "Percent"],
)
missing_data.head(n=20)

### Work Location Borough is important for the analysis, so we keep it. Title Description is missing in only 1 observation, so we drop that row and keep the variable.

In [None]:
# Drop every feature with more than 20 percent of its values missing.
# The labels are passed via the `columns=` keyword: the positional form
# drop(labels, 1) was deprecated in pandas 1.3 and raises TypeError on 2.x.
payroll_sample = payroll_sample.drop(
    columns=missing_data.loc[missing_data["Percent"] > 20].index
)

# Drop the few rows whose "Title Description" is missing
payroll_sample = payroll_sample.drop(
    index=payroll_sample.loc[payroll_sample["Title Description"].isnull()].index
)

# Largest per-column count of missing values that remains
payroll_sample.isnull().sum().max()

### First and last names are not needed for this project, so we drop them.

In [None]:
# Remove the employee name columns; they are not used in the analysis
payroll_sample = payroll_sample.drop(columns=["First Name", "Last Name"])

In [None]:
# Rebuild the missing-value summary after the cleaning steps above:
# absolute count ("Values") and share of rows in percent ("Percent").
values = payroll_sample.isna().sum().sort_values(ascending=False)
percent = payroll_sample.isna().mean().sort_values(ascending=False) * 100
missing_data = pd.concat(
    [values, percent],
    axis=1,
    keys=["Values", "Percent"],
)
missing_data.head(n=5)

In [None]:
# Re-check: is any cell still missing after cleaning?
payroll_sample.isna().to_numpy().any()

In [None]:
# Correlation matrix of the numeric features.
# Only numeric/boolean columns are passed to corr(): on pandas >= 2.0 corr()
# no longer silently skips string or datetime columns and raises instead.
corrmap = payroll_sample.select_dtypes(include=["number", "bool"]).corr()

# Draw onto the axes created here rather than relying on the implicit
# "current axes" state.
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corrmap, vmax=1, square=True, ax=ax)

In [None]:
# Pearson correlation between the numeric features.
# Non-numeric columns are excluded up front because corr() raises on string
# or datetime columns on pandas >= 2.0 instead of silently skipping them.
pearsoncorr = payroll_sample.select_dtypes(include=["number", "bool"]).corr(method="pearson")
pearsoncorr

In [None]:
# Annotated heatmap of the Pearson correlation matrix
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    pearsoncorr,
    xticklabels=pearsoncorr.columns,
    yticklabels=pearsoncorr.columns,
    cmap="RdBu_r",
    # Anchor the diverging colormap to the full correlation range so that
    # white corresponds to zero correlation.
    vmin=-1,
    vmax=1,
    annot=True,
    # `linewidths` is seaborn's documented parameter for the cell borders
    linewidths=0.5,
    # Draw onto the axes created above instead of the implicit current axes
    ax=ax,
)

In [None]:
# to be continued...

In [None]:
# Add a "Total Pay" column: regular gross pay plus overtime (OT) pay
payroll_sample["Total Pay"] = payroll_sample["Regular Gross Paid"].add(
    payroll_sample["Total OT Paid"]
)
payroll_sample.head(n=5)

In [None]:
# counts of municipal employees
    # year-to-year
    # agency-to-agency
    # borough-to-borough

In [None]:
# distributions of base pay
    # whole
    # by borough 

In [None]:
# distributions of actual pay
    # whole
    # by borough 

In [None]:
# hypothesis testing for actual pay

In [None]:
# distributions of the difference between base pay and actual pay

In [None]:
# distributions of overtime

In [None]:
# bar graph of the pay of active and ceased employees

In [None]:
# bar graph of pay basis

In [None]:
# regular hours vs. base salary scatterplot

In [None]:
# Pearson's R for regular hours vs base salary

In [None]:
# agency start date vs. actual pay

In [None]:
# double bar graph of base salary and regular gross paid 
    # by borough
    # by agency 

In [None]:
# swarm plot of borough vs. regular gross paid 

In [None]:
# swarm plot of fiscal year vs. regular gross paid

In [None]:
# total hours (regular hours + OT Hours) vs regular gross paid

In [None]:
# K-Nearest Neighbor Regressor with k = 5 to predict regular gross paid
    # manually compute for one random observation
    # create an 80/20 train-test split and fit a k-nearest neighbors regressor
        # compute mse and rmse
        # regular gross paid vs residuals 