# City of Houston
## 2014

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

# Data directory

In [3]:
# Run configuration: which city and which year's raw export this notebook cleans.
location, year = 'houston', '2014'

In [4]:

# Input (raw) and output (clean) folders for the configured city/year.
# Both strings end with a slash; the save cell appends the file name directly.
data_directory = os.path.join('..', 'data', 'raw_data/%s/%s/' % (location, year))
data_directory_saves = os.path.join('..', 'data', 'clean_data', '%s/%s/' % (location, year))

In [5]:
# Create the output directory together with any missing parents.
# exist_ok=True keeps the cell safe to re-run and removes the gap between a
# separate os.path.exists() check and the actual creation.
os.makedirs(data_directory_saves, exist_ok=True)

In [30]:
# Load the raw employee listing. Only ONE workbook is read: the first .xlsx in
# sorted order. Sorting makes the choice deterministic, since glob returns
# matches in arbitrary order.
excel_files = sorted(glob.glob(os.path.join(data_directory, "*.xlsx")))
if not excel_files:
    # Fail with a message that names the folder instead of a bare IndexError.
    raise FileNotFoundError('No .xlsx files found in {}'.format(data_directory))
all_files = excel_files[0]  # path of the single workbook being loaded (name kept for compatibility)
df = pd.read_excel(all_files)

In [31]:
df.head()

Unnamed: 0,CITY OF HOUSTON,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,"EMPLOYEE LISTING AS OF JULY 17, 2014",,,,,,,,
1,,,,,,,,,
2,LAST NAME,FIRST NAME,MIDDLE NAME,CLASSIFICATION,DEPARTMENT,RACE,GENDER,HIRE DATE,ANNUALIZED BASE PAY
3,RUSSELL,ANNA,A,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28 00:00:00,109468
4,JONES,JOHN,L,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30 00:00:00,22734


## Organize columns
The real column names are in row 2 of the raw sheet (rows 0–1 hold a title line and a blank line).
- use row 2 as the column names
- drop rows 0–2 and reset the index

In [32]:
# Row 2 holds the real header: use it as the column labels, then discard the
# three leading rows (title, blank, header) and renumber from zero.
df.columns = df.iloc[2].tolist()
df = df.iloc[3:].reset_index(drop=True)

In [33]:
df.head()

Unnamed: 0,LAST NAME,FIRST NAME,MIDDLE NAME,CLASSIFICATION,DEPARTMENT,RACE,GENDER,HIRE DATE,ANNUALIZED BASE PAY
0,RUSSELL,ANNA,A,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28 00:00:00,109468
1,JONES,JOHN,L,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30 00:00:00,22734
2,NORMAN,BENJAMIN,W,POLICE SERGEANT,POLICE,White,Male,1958-12-29 00:00:00,78114
3,HENRY,WILEY,E,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,1979-03-19 00:00:00,80142
4,HERNANDEZ,SYLVIA,T,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,1998-03-09 00:00:00,24585


## Normalize column names

In [34]:
# https://medium.com/@chaimgluck1/working-with-pandas-fixing-messy-column-names-42a54a6659cd
# Normalise headers to snake_case: trim, lower-case, spaces -> underscores,
# and remove parentheses. regex=False is passed explicitly because '(' and ')'
# are regex metacharacters and the default of `regex` differs between pandas
# versions; every replacement here is a literal one.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [35]:
df.head()

Unnamed: 0,last_name,first_name,middle_name,classification,department,race,gender,hire_date,annualized_base_pay
0,RUSSELL,ANNA,A,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28 00:00:00,109468
1,JONES,JOHN,L,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30 00:00:00,22734
2,NORMAN,BENJAMIN,W,POLICE SERGEANT,POLICE,White,Male,1958-12-29 00:00:00,78114
3,HENRY,WILEY,E,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,1979-03-19 00:00:00,80142
4,HERNANDEZ,SYLVIA,T,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,1998-03-09 00:00:00,24585


# Check for missing values

In [36]:
# Number of missing values in each column.
df.isnull().sum()

last_name                 1
first_name                1
middle_name            5430
classification            0
department                0
race                      0
gender                    0
hire_date                 0
annualized_base_pay       0
dtype: int64

Drop `middle_name`: it is missing for 5,430 rows and is not used in the cleaned output.

In [37]:
# Remove the mostly-empty middle name column.
df = df.drop(columns=['middle_name'])

In [38]:
df.head()

Unnamed: 0,last_name,first_name,classification,department,race,gender,hire_date,annualized_base_pay
0,RUSSELL,ANNA,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28 00:00:00,109468
1,JONES,JOHN,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30 00:00:00,22734
2,NORMAN,BENJAMIN,POLICE SERGEANT,POLICE,White,Male,1958-12-29 00:00:00,78114
3,HENRY,WILEY,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,1979-03-19 00:00:00,80142
4,HERNANDEZ,SYLVIA,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,1998-03-09 00:00:00,24585


In [39]:
# Re-check the per-column count of missing values after the drop.
df.isnull().sum()

last_name              1
first_name             1
classification         0
department             0
race                   0
gender                 0
hire_date              0
annualized_base_pay    0
dtype: int64

# show missing values

In [40]:
# Display up to ten rows that contain at least one missing value.
has_missing = df.isnull().any(axis='columns')
df.loc[has_missing].head(10)

Unnamed: 0,last_name,first_name,classification,department,race,gender,hire_date,annualized_base_pay
7123,,BELINDA,POLICE LIEUTENANT,POLICE,White,Female,1996-10-28 00:00:00,87674
10812,YAO,,SUPERVISING ENGINEER,PUBLIC WORKS & ENGINEERING,Asian/Pacific Islander,Female,2003-07-21 00:00:00,96077


# replace NaN with 'Unknown'

In [41]:
# Replace missing text values with the placeholder 'Unknown'. Only the text
# columns are filled, so a missing hire date or pay value can never be
# overwritten with a string.
text_cols = ['last_name', 'first_name', 'classification', 'department', 'race', 'gender']
df[text_cols] = df[text_cols].fillna('Unknown')

## combine names
- output: first initial followed by the full last name, e.g. `A.Russell`


In [42]:
def clean_name(data):
    """Build a short display name of the form ``<first initial>.<Last name>``.

    Parameters
    ----------
    data : row-like
        Object exposing ``first_name`` and ``last_name`` attributes, such as
        a DataFrame row passed in by ``df.apply(clean_name, axis=1)``.

    Returns
    -------
    str
        The first character of the first name, a period, and the title-cased
        last name, e.g. ``'A.Russell'``.

    Notes
    -----
    Surrounding whitespace is stripped from both parts. A part that is blank
    or not a string (for example NaN or None) is treated as ``'Unknown'``, so
    such rows produce ``'U.<Last>'`` or ``'<F>.Unknown'`` instead of raising.
    """
    def _text(value):
        # Non-strings (NaN/None) and blank strings both count as missing.
        value = value.strip() if isinstance(value, str) else ''
        return value or 'Unknown'

    first = _text(data.first_name)
    last = _text(data.last_name)
    return '{}.{}'.format(first[0], last.title())

In [43]:
df.head()

Unnamed: 0,last_name,first_name,classification,department,race,gender,hire_date,annualized_base_pay
0,RUSSELL,ANNA,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28,109468
1,JONES,JOHN,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30,22734
2,NORMAN,BENJAMIN,POLICE SERGEANT,POLICE,White,Male,1958-12-29,78114
3,HENRY,WILEY,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,1979-03-19,80142
4,HERNANDEZ,SYLVIA,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,1998-03-09,24585


In [44]:
# Derive the display name for every row, then discard the raw name parts.
df['name'] = df.apply(clean_name, axis='columns')
df = df.drop(columns=['last_name', 'first_name'])

In [45]:
df.head()

Unnamed: 0,classification,department,race,gender,hire_date,annualized_base_pay,name
0,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28,109468,A.Russell
1,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30,22734,J.Jones
2,POLICE SERGEANT,POLICE,White,Male,1958-12-29,78114,B.Norman
3,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,1979-03-19,80142,W.Henry
4,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,1998-03-09,24585,S.Hernandez


In [46]:
# Arrange the columns with `name` first, followed by the remaining fields.
col_order = [
    'name',
    'classification',
    'department',
    'race',
    'gender',
    'hire_date',
    'annualized_base_pay',
]
df = df.reindex(col_order, axis='columns')

In [47]:
df.head()

Unnamed: 0,name,classification,department,race,gender,hire_date,annualized_base_pay
0,A.Russell,CITY SECRETARY,CITY SECRETARY,White,Female,1952-04-28,109468
1,J.Jones,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,1998-03-30,22734
2,B.Norman,POLICE SERGEANT,POLICE,White,Male,1958-12-29,78114
3,W.Henry,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,1979-03-19,80142
4,S.Hernandez,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,1998-03-09,24585


# get years of service

In [52]:
# Year in which each employee was hired. The column is converted explicitly:
# it is read from Excel together with text header rows, so it is not
# guaranteed to have a datetime dtype, which the .dt accessor requires.
# Values that cannot be parsed become NaT instead of raising.
hired = pd.to_datetime(df.hire_date, errors='coerce').dt.year
# Reference year for the tenure calculation, taken from the notebook config
# rather than repeated as a literal.
current = int(year)

In [56]:
# Tenure is the difference between the reference year and the hire year;
# the raw hire date is dropped once the tenure column exists.
df = df.assign(tenure=current - hired).drop(columns=['hire_date'])

In [57]:
df.head()

Unnamed: 0,name,classification,department,race,gender,annualized_base_pay,tenure
0,A.Russell,CITY SECRETARY,CITY SECRETARY,White,Female,109468,62
1,J.Jones,RECREATION ASSISTANT,PARKS & RECREATION,Black or African American,Male,22734,16
2,B.Norman,POLICE SERGEANT,POLICE,White,Male,78114,56
3,W.Henry,CENTER ADMINISTRATOR,HEALTH & HUMAN SERVICES,Black or African American,Male,80142,35
4,S.Hernandez,SENIOR CLERK,HEALTH & HUMAN SERVICES,Hispanic/Latino,Female,24585,16


# save dataframe

In [62]:
# Write the cleaned table to <output dir>/<location>_<year>.csv. The file name
# is derived from the config values instead of being repeated by hand, and
# os.path.join removes the reliance on the directory string ending in a slash.
# Note: the row index is written as the first (unnamed) CSV column.
df.to_csv(os.path.join(data_directory_saves, '{}_{}.csv'.format(location, year)))

In [63]:
data_directory_saves

'../data/clean_data/houston/2014/'