# State
## 2014

Public information act: Texas Public Information Act.
https://www.texasattorneygeneral.gov/sites/default/files/2018-06/PIA_handbook_2018_0.pdf
    
    https://en.wikipedia.org/wiki/Texas_Public_Information_Act
    
    

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation/FutureWarnings;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [6]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

# Data directory

In [7]:
# Dataset selectors: they determine the raw/clean sub-folders built by the path cell.
location, year = 'state', '2014'

In [8]:
# Raw inputs are read from ../data/raw_data/<location>/<year>/ and cleaned
# outputs are written to ../data/clean_data/<location>/<year>/.
raw_subpath = 'raw_data/{}/{}/'.format(location, year)
clean_subpath = '{}/{}/'.format(location, year)
data_directory = os.path.join('..', 'data', raw_subpath)
# Keep the trailing slash: the save cell concatenates a file name onto this string.
data_directory_saves = os.path.join('..', 'data', 'clean_data', clean_subpath)

In [9]:
# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
os.makedirs(data_directory_saves, exist_ok=True)

## Create dataframe

In [10]:
# Load the raw workbook. Only ONE .xlsx file is read (the first in sorted
# order, so the choice is deterministic); nothing is concatenated.
excel_paths = sorted(glob.glob(os.path.join(data_directory, "*.xlsx")))
if not excel_paths:
    # Fail with a clear message instead of a bare IndexError on an empty folder.
    raise FileNotFoundError(
        'No .xlsx files found in {!r}'.format(data_directory))
# `all_files` keeps its original meaning: the path of the single file that is read.
all_files = excel_paths[0]
df = pd.read_excel(all_files)

In [11]:
# Overview of the loaded frame: column names, dtypes, non-null counts and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148403 entries, 0 to 148402
Data columns (total 11 columns):
AGENCY           148403 non-null int64
 AGENCY NAME     148403 non-null object
LAST NAME        148403 non-null object
FIRST NAME       148403 non-null object
CLASS TITLE      148403 non-null object
ETHNICITY        148403 non-null object
GENDER           148403 non-null object
EMPLOYEE TYPE    148403 non-null object
HIRE DATE        148403 non-null object
ANNUAL SALARY    148403 non-null float64
STATE NUMBER     148403 non-null int64
dtypes: float64(1), int64(2), object(8)
memory usage: 12.5+ MB


In [12]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,AGENCY,AGENCY NAME,LAST NAME,FIRST NAME,CLASS TITLE,ETHNICITY,GENDER,EMPLOYEE TYPE,HIRE DATE,ANNUAL SALARY,STATE NUMBER
0,101,SENATE ...,ACOSTA,SARAH,LEG. OFFICIAL/ADMINISTRATOR ...,HISPANIC,FEMALE,URP,02/11/2004,51000.0,59341
1,101,SENATE ...,AHLHORN,KURT,LEGISLATIVE PROFESSIONAL ...,WHITE,MALE,URF,01/02/2013,34200.0,341347
2,101,SENATE ...,AKPAN,ERIKA,LEG. OFFICIAL/ADMINISTRATOR ...,BLACK,FEMALE,URF,08/01/2005,51600.0,6093
3,101,SENATE ...,ALBRIGHT,STEVEN,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP,01/09/2007,85800.0,57927
4,101,SENATE ...,ALEXANDER,LEAH,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URF,01/08/2013,66000.0,11289


## Normalize column names

In [13]:
# https://medium.com/@chaimgluck1/working-with-pandas-fixing-messy-column-names-42a54a6659cd
# Normalize headers: trim padding, lowercase, spaces -> underscores, drop parentheses.
# regex=False makes every replacement a literal match; '(' and ')' are regex
# metacharacters, and the default of `regex` differs between pandas versions.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [14]:
# Preview again to confirm the normalized column names.
df.head()

Unnamed: 0,agency,agency_name,last_name,first_name,class_title,ethnicity,gender,employee_type,hire_date,annual_salary,state_number
0,101,SENATE ...,ACOSTA,SARAH,LEG. OFFICIAL/ADMINISTRATOR ...,HISPANIC,FEMALE,URP,02/11/2004,51000.0,59341
1,101,SENATE ...,AHLHORN,KURT,LEGISLATIVE PROFESSIONAL ...,WHITE,MALE,URF,01/02/2013,34200.0,341347
2,101,SENATE ...,AKPAN,ERIKA,LEG. OFFICIAL/ADMINISTRATOR ...,BLACK,FEMALE,URF,08/01/2005,51600.0,6093
3,101,SENATE ...,ALBRIGHT,STEVEN,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP,01/09/2007,85800.0,57927
4,101,SENATE ...,ALEXANDER,LEAH,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URF,01/08/2013,66000.0,11289


# Check for missing values

In [15]:
# Count missing values per column (vectorized; same result as summing
# each column's null mask in a Python-level loop).
df.isnull().sum()

agency           0
agency_name      0
last_name        0
first_name       0
class_title      0
ethnicity        0
gender           0
employee_type    0
hire_date        0
annual_salary    0
state_number     0
dtype: int64

## Combine names
- Output: first initial followed by the full last name (e.g. `S.Acosta`)


In [16]:
def clean_name(data):
    """Build a display name of the form ``<first initial>.<Last Name>``.

    Parameters
    ----------
    data : row-like object
        Anything exposing string attributes ``first_name`` and ``last_name``
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        For example ``'S.Acosta'``. If the first name is blank, only the
        title-cased last name is returned instead of raising ``IndexError``.
    """
    # Other text columns in this dataset carry space padding, so trim
    # both parts defensively before using them.
    first = data.first_name.strip()
    last = data.last_name.strip().title()
    if not first:
        return last
    return '{}.{}'.format(first[0], last)

In [17]:
# Preview the frame while first_name/last_name are still separate columns.
df.head()

Unnamed: 0,agency,agency_name,last_name,first_name,class_title,ethnicity,gender,employee_type,hire_date,annual_salary,state_number
0,101,SENATE ...,ACOSTA,SARAH,LEG. OFFICIAL/ADMINISTRATOR ...,HISPANIC,FEMALE,URP,02/11/2004,51000.0,59341
1,101,SENATE ...,AHLHORN,KURT,LEGISLATIVE PROFESSIONAL ...,WHITE,MALE,URF,01/02/2013,34200.0,341347
2,101,SENATE ...,AKPAN,ERIKA,LEG. OFFICIAL/ADMINISTRATOR ...,BLACK,FEMALE,URF,08/01/2005,51600.0,6093
3,101,SENATE ...,ALBRIGHT,STEVEN,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP,01/09/2007,85800.0,57927
4,101,SENATE ...,ALEXANDER,LEAH,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URF,01/08/2013,66000.0,11289


In [18]:
# Build the combined `name` column row by row, then remove the two source columns.
name_source_columns = ['last_name', 'first_name']
df['name'] = df.apply(clean_name, axis=1)
df.drop(columns=name_source_columns, inplace=True)

In [19]:
# Confirm `name` was added and the separate name columns were dropped.
df.head()

Unnamed: 0,agency,agency_name,class_title,ethnicity,gender,employee_type,hire_date,annual_salary,state_number,name
0,101,SENATE ...,LEG. OFFICIAL/ADMINISTRATOR ...,HISPANIC,FEMALE,URP,02/11/2004,51000.0,59341,S.Acosta
1,101,SENATE ...,LEGISLATIVE PROFESSIONAL ...,WHITE,MALE,URF,01/02/2013,34200.0,341347,K.Ahlhorn
2,101,SENATE ...,LEG. OFFICIAL/ADMINISTRATOR ...,BLACK,FEMALE,URF,08/01/2005,51600.0,6093,E.Akpan
3,101,SENATE ...,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP,01/09/2007,85800.0,57927,S.Albright
4,101,SENATE ...,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URF,01/08/2013,66000.0,11289,L.Alexander


## Let's normalize the casing of some of these values

In [21]:
# Ten most frequent agency names (raw values).
df['agency_name'].value_counts().head(10)

TEXAS DEPARTMENT OF CRIMINAL JUSTICE                  38185
DEPARTMENT OF AGING AND DISABILITY SERVICES           16039
HEALTH AND HUMAN SERVICES COMMISSION                  12071
DEPARTMENT OF STATE HEALTH SERVICES                   11993
TEXAS DEPARTMENT OF TRANSPORTATION                    11637
DEPARTMENT OF FAMILY AND PROTECTIVE SERVICES          11306
DEPARTMENT OF PUBLIC SAFETY                            9140
OFFICE OF THE ATTORNEY GENERAL                         4067
TEXAS WORKFORCE COMMISSION                             2947
PARKS AND WILDLIFE DEPARTMENT                          2919
Name: agency_name, dtype: int64

In [26]:
# Trim the padding and title-case agency names.
agency_stripped = df['agency_name'].str.strip()
df['agency_name'] = agency_stripped.str.title()

In [27]:
# Same top-10 check after trimming and title-casing.
df['agency_name'].value_counts().head(10)

Texas Department Of Criminal Justice            38185
Department Of Aging And Disability Services     16039
Health And Human Services Commission            12071
Department Of State Health Services             11993
Texas Department Of Transportation              11637
Department Of Family And Protective Services    11306
Department Of Public Safety                      9140
Office Of The Attorney General                   4067
Texas Workforce Commission                       2947
Parks And Wildlife Department                    2919
Name: agency_name, dtype: int64

In [29]:
# Ten most frequent job class titles (raw values).
df['class_title'].value_counts().head(10)

CORREC  OFFICER V                                     8557
CORREC  OFFICER IV                                    6532
CORREC OFFCR III                                      5904
DIRECT SUPPORT PROFESSIONAL I                         4314
TEXAS WORKS ADVISOR II                                3910
ADMINISTRATIVE ASST III                               2307
ADMINISTRATIVE ASST II                                2188
CORREC OFFCR II                                       2040
CLERK II                                              1943
CLERK III                                             1891
Name: class_title, dtype: int64

In [30]:
# Title-case and trim job class titles.
# NOTE(review): str.title() also lowercases Roman-numeral suffixes
# (e.g. 'IV' becomes 'Iv') — confirm that is acceptable downstream.
class_title_cased = df['class_title'].str.title()
df['class_title'] = class_title_cased.str.strip()

In [31]:
# Top-10 class titles after title-casing and trimming.
df['class_title'].value_counts().head(10)

Correc  Officer V                8557
Correc  Officer Iv               6532
Correc Offcr Iii                 5904
Direct Support Professional I    4314
Texas Works Advisor Ii           3910
Administrative Asst Iii          2307
Administrative Asst Ii           2188
Correc Offcr Ii                  2040
Clerk Ii                         1943
Clerk Iii                        1891
Name: class_title, dtype: int64

In [32]:
# List the distinct class titles.
df.class_title.unique()

array(['Leg. Official/Administrator', 'Legislative Professional',
       'Legislative Admin. Support', ..., 'Database Admin V',
       'Program Spec Iii', 'Chief Financial Officer Ii'], dtype=object)

In [34]:
# Distinct ethnicity labels; the raw values carry trailing spaces.
df.ethnicity.unique()

array(['HISPANIC       ', 'WHITE          ', 'BLACK          ',
       'ASIAN          ', 'AM INDIAN      ', 'OTHER          '],
      dtype=object)

In [35]:
# Row count per ethnicity label (raw values).
df.ethnicity.value_counts()

WHITE              73796
HISPANIC           37139
BLACK              33585
ASIAN               3133
AM INDIAN            746
OTHER                  4
Name: ethnicity, dtype: int64

In [36]:
# Lowercase and trim the ethnicity labels.
ethnicity_lower = df['ethnicity'].str.lower()
df['ethnicity'] = ethnicity_lower.str.strip()

In [37]:
# Confirm the labels are now lowercase with no padding.
df.ethnicity.unique()

array(['hispanic', 'white', 'black', 'asian', 'am indian', 'other'],
      dtype=object)

In [38]:
# Row count per ethnicity label after normalization.
df.ethnicity.value_counts()

white        73796
hispanic     37139
black        33585
asian         3133
am indian      746
other            4
Name: ethnicity, dtype: int64

In [39]:
# Row count per gender label (raw values).
df.gender.value_counts()

FEMALE             83776
MALE               64627
Name: gender, dtype: int64

In [40]:
# Distinct gender labels; the raw values carry trailing spaces.
df.gender.unique()

array(['FEMALE         ', 'MALE           '], dtype=object)

In [41]:
# Lowercase and trim the gender labels.
gender_lower = df['gender'].str.lower()
df['gender'] = gender_lower.str.strip()

In [42]:
# Confirm the labels are now lowercase with no padding.
df.gender.unique()

array(['female', 'male'], dtype=object)

In [43]:
# Row count per gender label after normalization.
df.gender.value_counts()

female    83776
male      64627
Name: gender, dtype: int64

In [45]:
# Most frequent state numbers; a count above 1 means the number appears on
# more than one row, so state_number is not a unique row key.
df['state_number'].value_counts().head(10)

85639     2
84524     2
147903    2
152770    2
102292    2
47114     2
190648    2
7101      2
97263     2
181543    2
Name: state_number, dtype: int64

In [46]:
# Distinct state_number values.
df.state_number.unique()

array([ 59341, 341347,   6093, ...,  57405,  70518, 192939])

In [58]:
# Distinct employee type codes; the raw values carry trailing spaces.
df.employee_type.unique()

array(['URP      ', 'URF      ', 'UTP      ', 'UTF      ', 'CRF      ',
       'ERF      ', 'CRP      ', 'CTF      ', 'CTP      '], dtype=object)

In [60]:
# Trim the padding from the employee type codes (case is left as-is).
df['employee_type'] = df['employee_type'].str.strip()

In [61]:
# Confirm the codes no longer carry padding.
df.employee_type.unique()

array(['URP', 'URF', 'UTP', 'UTF', 'CRF', 'ERF', 'CRP', 'CTF', 'CTP'],
      dtype=object)

# get tenure

In [47]:
# hire_date dtype before conversion (object, i.e. not yet datetime).
df.hire_date.dtype

dtype('O')

In [48]:
# Parse the hire_date strings into datetime values.
# NOTE(review): no explicit format is passed; the previewed rows look like
# MM/DD/YYYY — confirm every row follows that layout.
df['hire_date'] = pd.to_datetime(df['hire_date'])

In [49]:
# Confirm the column is now a datetime dtype.
df.hire_date.dtype

dtype('<M8[ns]')

## Tenure
- `tenure` = current year (2014) minus the `hire_date` year

In [50]:
# Hire year for every row.
hired = df.hire_date.dt.year
# Reference year for tenure, taken from the notebook's `year` setting
# instead of a second hard-coded 2014.
current = int(year)

In [51]:
# Tenure is the difference in calendar years only (month and day are ignored).
years_of_service = current - hired
df['tenure'] = years_of_service
# hire_date is no longer needed once tenure is stored.
df.drop(columns=['hire_date'], inplace=True)

In [52]:
# Preview the frame with the new tenure column.
df.head()

Unnamed: 0,agency,agency_name,class_title,ethnicity,gender,employee_type,annual_salary,state_number,name,tenure
0,101,Senate,Leg. Official/Administrator,hispanic,female,URP,51000.0,59341,S.Acosta,10
1,101,Senate,Legislative Professional,white,male,URF,34200.0,341347,K.Ahlhorn,1
2,101,Senate,Leg. Official/Administrator,black,female,URF,51600.0,6093,E.Akpan,9
3,101,Senate,Leg. Official/Administrator,white,male,URP,85800.0,57927,S.Albright,7
4,101,Senate,Leg. Official/Administrator,white,female,URF,66000.0,11289,L.Alexander,1


# order columns

In [53]:
# Final column layout, with the employee name first.
col_order = [
    'name',
    'agency',
    'agency_name',
    'class_title',
    'ethnicity',
    'gender',
    'employee_type',
    'annual_salary',
    'state_number',
    'tenure',
]
# reindex does not raise on a misspelled label; it would add an all-NaN column.
df = df.reindex(columns=col_order)

In [62]:
# Preview the final column order.
df.head()

Unnamed: 0,name,agency,agency_name,class_title,ethnicity,gender,employee_type,annual_salary,state_number,tenure
0,S.Acosta,101,Senate,Leg. Official/Administrator,hispanic,female,URP,51000.0,59341,10
1,K.Ahlhorn,101,Senate,Legislative Professional,white,male,URF,34200.0,341347,1
2,E.Akpan,101,Senate,Leg. Official/Administrator,black,female,URF,51600.0,6093,9
3,S.Albright,101,Senate,Leg. Official/Administrator,white,male,URP,85800.0,57927,7
4,L.Alexander,101,Senate,Leg. Official/Administrator,white,female,URF,66000.0,11289,1


# Save dataframe

In [63]:
# Write the cleaned frame to <clean dir>/<location>_<year>.csv without the index.
# The file name is derived from the config values rather than hard-coded, and
# os.path.join is used so the path does not depend on a trailing slash.
output_path = os.path.join(data_directory_saves, '{}_{}.csv'.format(location, year))
df.to_csv(output_path, index=False)

In [64]:
# Display the directory the cleaned CSV was written to.
data_directory_saves

'../data/clean_data/state/2014/'