# Preriquisites


We will work with Supervised Machine learning. We will be training our dataframe against the 'is_promoted' column.
We will drop the employee_id column is this is not be of importance at this point.

In [1]:
# Import pandas for data manipulation
import pandas as pd
import numpy as np


In [2]:
# load datafile and preview first few records
hr_df = pd.read_csv('https://bit.ly/2ODZvLCHRDataset')
hr_df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [None]:
# load glossary dataframe and view the records
glossary_df = pd.read_csv('https://bit.ly/2Wz3sWcGlossary')
glossary_df

Unnamed: 0,employee_id - Unique ID for employee
0,department - Department of employee
1,region - Region of employment (unordered)
2,education - Education Level
3,gender - Gender of Employee
4,recruitment_channel - Channel of recruitment f...
5,nooftrainings - no of other trainings complete...
6,age - Age of Employee
7,previousyearrating - Employee Rating for the p...
8,lengthofservice - Length of service in years
9,KPIs_met >80% - if Percent of KPIs(Key perform...


In [12]:
# Select and preview unique departments/verticals
hr_df.department.unique().tolist()

['Sales & Marketing',
 'Operations',
 'Technology',
 'Analytics',
 'R&D',
 'Procurement',
 'Finance',
 'HR',
 'Legal']

In [13]:
# Select and preview unique recommeded for promotion
hr_df.is_promoted.unique().tolist()

[0, 1]

In [14]:
# check datatypes for the various columns
hr_df.dtypes

employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [15]:
# check if there are null observations in the dataset
hr_df.isnull().any()

employee_id             False
department              False
region                  False
education                True
gender                  False
recruitment_channel     False
no_of_trainings         False
age                     False
previous_year_rating     True
length_of_service       False
KPIs_met >80%           False
awards_won?             False
avg_training_score      False
is_promoted             False
dtype: bool

In [17]:
hr_df[hr_df.duplicated(['employee_id'])]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted


In [19]:
hr_df.shape

(54808, 14)

In [20]:
unique_df = hr_df.drop_duplicates(['employee_id'])

In [21]:
unique_df.shape

(54808, 14)

# Non-numeric data conversion to numeric data

In [5]:
# Iterate through the columns in the dataframe and find unique elements for non numeric columns. We will take a set  of the column values and 
# thus the set within the index within the set will be the new numerical value or id of that non numerical observation.

# create a function that gets the columns and interate through them

def handle_non_numerical_data(hr_df):
    columns = hr_df.columns.values
    for column in columns:

# Embed a function that converts the parameter value to the any value of that item (as Key) from the text_digit_val dictionary

        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

# During iteration through the columns, check and pick columns which are not int64 or float64 and then convert the column to list of its values
        if hr_df[column].dtype != np.int64 and hr_df[column].dtype != np.float64:
            column_contents = hr_df[column].values.tolist()

# Take a set of the columns and extract the unique values only.            
            unique_elements = set(column_contents)

# Create a new dictionary key for each of the unique values found with avalye of a new number.
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

# Use the map function to perform mapping of the new values into the columns
            hr_df[column] = list(map(convert_to_int, hr_df[column]))

    return hr_df


# Call our handle_non_numerical_data function and preview the newly converted data frame

hr_df = handle_non_numerical_data(hr_df)
print(hr_df.head())

   employee_id  department  ...  avg_training_score  is_promoted
0        65438           4  ...                  49            0
1        65141           3  ...                  60            0
2         7513           4  ...                  50            0
3         2542           4  ...                  50            0
4        48945           0  ...                  73            0

[5 rows x 14 columns]
