In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import time
import datetime

from shared.comparison import time_comparison

# Filtering DataFrames

In [None]:
# Read the raw employee CSV with no dtype hints, so pandas infers every column type.
employees = pd.read_csv("data_files/employees.csv")
employees.info()
# Dates (and other dtypes) can also be requested at read time, for example:
# employees = pd.read_csv("data_files/employees.csv", parse_dates=["Start Date", "Last Login Time"], dtype={"Senior Management": "bool"})
# NOTE: the call above fails because the "Senior Management" column has some NaN entries, which the plain
# "bool" dtype cannot represent. Additional logic is needed (presumably the nullable "boolean" dtype would work - TODO confirm).

In [None]:
# Preview the first ten rows of the freshly loaded frame.
employees.head(n=10)

In [None]:
# Print the column names, non-null counts and dtypes of the frame as loaded.
employees.info()

## In order to filter, it can be helpful to ensure data types are as expected

Before filtering, let's first make sure the columns we care about have the right data types. This is what we are doing:

1. Set `Start Date` column to datetime
2. Set `Senior Management` column to boolean
3. Set the `Gender` column to a category

In [None]:
# Convert the columns used for filtering to explicit dtypes.
employees["Start Date"] = pd.to_datetime(employees["Start Date"])
# "Senior Management" contains NaN entries. A plain astype("bool") would silently
# turn every missing value into True (NaN is truthy), so use pandas' nullable
# "boolean" dtype, which keeps missing values as <NA>.
employees["Senior Management"] = employees["Senior Management"].astype("boolean")
# Gender is converted to a categorical column.
employees["Gender"] = employees["Gender"].astype("category")

In [None]:
# Re-check the dtypes to confirm the conversions in the previous cell took effect.
employees.info()

# Notes on Filtering and Masks

Square-bracket indexing on a DataFrame (its `__getitem__` method) accepts what is known as a `mask`.

A mask is a sequence of `True` and `False` values, one per row. When a `mask` is passed inside the square brackets, the DataFrame keeps the rows where the mask is `True` and drops the rows where it is `False`.

## People Hired in 2000 or Later

In [None]:
# Cut-off for the filter: midnight on 1 January 2000 (inclusive).
date_2000 = datetime.datetime(2000, 1, 1)
# Boolean-mask indexing and sort_values both return new frames, so chaining them
# gives an independent, sorted result without inplace mutation or an explicit copy.
# Most recent hires come first.
after_2000 = (
    employees[employees["Start Date"] >= date_2000]
    .sort_values("Start Date", ascending=False)
)

In [None]:
# Show the five most recent hires (the frame is sorted newest-first).
after_2000.head(n=5)

## Get All The Males

In [None]:
# Keep only the rows whose Gender equals "Male".
employees.loc[employees["Gender"].eq("Male")]

## All People in the Finance Team

In [None]:
# Head-count per team, largest first; rows with a missing Team are not counted.
employees["Team"].value_counts(sort=True, ascending=False, dropna=True)

In [None]:
# Keep only the rows whose Team equals "Finance".
employees.loc[employees["Team"].eq("Finance")]

## All People Making More Than 100,000

In [None]:
# Average salary across all employees (missing salaries are skipped).
employees["Salary"].mean(skipna=True)

In [None]:
# Employees earning strictly more than 100,000.
highly_paid = employees.loc[employees["Salary"].gt(100000)]

In [None]:
# Top five earners among the highly paid, highest salary first.
highly_paid.sort_values(by="Salary", ascending=False).head(n=5)