In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Lift pandas' display limits: a value of None removes the cap on how many
# columns are rendered and on how wide the text in a single cell may be.
for display_option in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

In [2]:
# Relative paths of the raw HR CSV files read by this notebook.

# Core employee attributes.
general_data = "HR_Data/general_data.csv"

# Survey responses (employee-side and manager-side).
employee_survey_data = "HR_Data/employee_survey_data.csv"
manager_survey_data = "HR_Data/manager_survey_data.csv"

# Per-day clock-in / clock-out logs.
in_time = "HR_Data/in_time.csv"
out_time = "HR_Data/out_time.csv"

In [3]:
# Load every raw CSV into its own DataFrame (same read order as the path list:
# surveys first, then the time logs, then the general employee data).
employee_survey_df, manager_survey_df = (
    pd.read_csv(csv_path) for csv_path in (employee_survey_data, manager_survey_data)
)
in_time_df, out_time_df = (
    pd.read_csv(csv_path) for csv_path in (in_time, out_time)
)
general_data_df = pd.read_csv(general_data)

In [4]:
# Preview the first rows of the employee survey data (satisfaction / work-life scores per EmployeeID).
employee_survey_df.head()

Unnamed: 0,EmployeeID,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance
0,1,3.0,4.0,2.0
1,2,3.0,2.0,4.0
2,3,2.0,2.0,1.0
3,4,4.0,4.0,3.0
4,5,4.0,1.0,3.0


In [5]:
# Preview the first rows of the manager survey data (JobInvolvement / PerformanceRating per EmployeeID).
manager_survey_df.head()

Unnamed: 0,EmployeeID,JobInvolvement,PerformanceRating
0,1,3,3
1,2,2,4
2,3,3,3
3,4,2,3
4,5,3,3


In [6]:
# Summarise the clock-in log: row count, column count, dtype mix and memory footprint.
in_time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Columns: 262 entries, Unnamed: 0 to 2015-12-31
dtypes: float64(12), int64(1), object(249)
memory usage: 8.8+ MB


In [7]:
# Summarise the clock-out log: row count, column count, dtype mix and memory footprint.
out_time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Columns: 262 entries, Unnamed: 0 to 2015-12-31
dtypes: float64(12), int64(1), object(249)
memory usage: 8.8+ MB


In [None]:
# Combine the employee and manager survey DataFrames into a single DataFrame,
# matching rows on EmployeeID (pd.merge performs an inner join by default).
#
# Both survey files carry an unlabelled first column that pandas names
# "Unnamed: 0"; in the previews it simply mirrors the row number. It is dropped
# before merging so the result does not gain meaningless "Unnamed: 0_x" /
# "Unnamed: 0_y" columns. errors="ignore" keeps the cell working if a file
# does not contain that column.
survey_df = pd.merge(
    employee_survey_df.drop(columns="Unnamed: 0", errors="ignore"),
    manager_survey_df.drop(columns="Unnamed: 0", errors="ignore"),
    on="EmployeeID",
)

survey_df.head()

In [None]:
# Check column dtypes and non-null counts of the merged survey frame.
survey_df.info()

In [None]:
# Attach the general employee attributes to the combined survey data,
# matching rows on EmployeeID (inner join by default).
survey_general_df = pd.merge(survey_df, general_data_df, on="EmployeeID")

# Discard unlabelled CSV index columns: "Unnamed: 0" and the "_x"/"_y"
# variants pandas creates when both merge inputs carry one. They hold row
# numbers, not employee attributes, and must not reach the cleaned dataset.
leftover_index_cols = [
    col for col in survey_general_df.columns if str(col).startswith("Unnamed: 0")
]
survey_general_df = survey_general_df.drop(columns=leftover_index_cols)

survey_general_df.head()

In [None]:
# Remove the EmployeeCount column from the merged frame (modifies it in place).
survey_general_df.drop(columns=["EmployeeCount"], inplace=True)

In [None]:
# Inspect non-null counts per column before rows with missing values are removed.
survey_general_df.info()

In [None]:
# Drop, in place, every row that has at least one missing value.
# Surviving rows keep their original index labels (the index is not reset here).
survey_general_df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-check row and non-null counts after the rows with missing values were dropped.
survey_general_df.info()

In [None]:
# List the dtype of every column in the clock-in log.
in_time_df.dtypes

In [None]:
# Rescale MonthlyIncome by a constant factor of 0.013
# (presumably a currency conversion -- TODO confirm the intended rate and units).
# NOTE: re-running this cell rescales the already-converted values again.
survey_general_df["MonthlyIncome"] = survey_general_df["MonthlyIncome"].mul(0.013)
survey_general_df.head()

In [None]:
# Divide DistanceFromHome by 1.6
# (presumably kilometres -> miles -- TODO confirm the source units).
# NOTE: re-running this cell divides the already-converted values again.
survey_general_df["DistanceFromHome"] = survey_general_df["DistanceFromHome"].div(1.6)
survey_general_df.info()

In [None]:
# Persist the cleaned frame to disk.
# The DataFrame index is written as the first (unlabelled) CSV column.
survey_general_df.to_csv("Clean_HRData.csv", index=True)

In [None]:
# Collect the names of all object-dtype (categorical/text) columns; these are
# the ones that will be one-hot encoded.
attrition_cat = [
    column_name
    for column_name, column_dtype in survey_general_df.dtypes.items()
    if column_dtype == 'object'
]

# Number of distinct values per categorical column.
survey_general_df[attrition_cat].nunique()

In [None]:
# One-hot encode the categorical columns.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=False` keyword was deprecated in scikit-learn 1.2 in
# favour of `sparse_output` and is rejected by current releases, whereas this
# form works on both old and new versions.
enc = OneHotEncoder()

# Fit and transform the encoder using the categorical column list.
encoded_values = enc.fit_transform(survey_general_df[attrition_cat]).toarray()

# Build the encoded frame with readable column names and -- importantly -- the
# SAME index as the source frame. survey_general_df keeps gapped index labels
# after dropna(), so a default 0..n-1 index here would pair encoded rows with
# the wrong employees (and lose rows) in any later index-based join.
encode_df = pd.DataFrame(
    encoded_values,
    columns=enc.get_feature_names_out(attrition_cat),
    index=survey_general_df.index,
)
encode_df.info()

In [None]:
# Merge the encoded DataFrame with the original DataFrame, then drop the
# categorical columns that the encoded columns replace.
#
# encode_df was produced row-for-row from survey_general_df, so its rows are
# given survey_general_df's index labels before the index join. Without this,
# a 0..n-1 index on encode_df would not line up with the gapped labels left by
# dropna(), silently mismatching and discarding rows. set_axis raises if the
# two frames do not have the same number of rows.
aligned_encode_df = encode_df.set_axis(survey_general_df.index, axis=0)

hr_df = survey_general_df.merge(
    aligned_encode_df, how="inner", left_index=True, right_index=True
)
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.x no longer accepts.
hr_df = hr_df.drop(columns=attrition_cat)

# Every cleaned row must survive the join.
assert len(hr_df) == len(survey_general_df), "row count changed while merging encoded columns"
hr_df.info()

In [None]:
# Show the column summary of the final, fully numeric frame (repeats the summary from the previous cell).
hr_df.info()

In [None]:
# Persist the ML-ready frame; the DataFrame index is written as the first (unlabelled) CSV column.
hr_df.to_csv("Clean_HRData_ML.csv", index=True)