In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json

In [4]:
# Load data/train.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [5]:
# Size of the loaded frame as a (rows, columns) tuple.
df.shape

(1628, 29)

In [6]:
# Column labels of the loaded frame.
df.columns

Index(['Id', 'Age', 'Attrition', 'BusinessTravel', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

In [7]:
# Labels of the columns that are removed from the frame before encoding.
cols_to_drop = [
    'Id',
    'Gender',
    'Behaviour',
    'MaritalStatus',
    'Education',
    'Department',
    'PerformanceRating',
    'TrainingTimesLastYear',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

In [8]:
# Build a new frame without the discarded columns (df itself is not modified),
# then show how many columns remain.
df_cleaned = df.drop(columns=cols_to_drop)
df_cleaned.shape[1]

18

In [9]:
# Labels of the object-dtype columns in the cleaned frame.
object_type = df_cleaned.select_dtypes(include=['object']).columns

In [10]:
# Map every object-dtype column name to the list of distinct values it contains.
object_type_vals = {col: df_cleaned[col].unique().tolist() for col in object_type}

# Save the mapping as JSON.
with open('data/object_col_unique_vals.json', 'w') as json_out:
    json.dump(object_type_vals, json_out)

In [11]:
# Read the saved per-column value lists back from disk and display them.
with open('data/object_col_unique_vals.json') as fp:
    object_type_vals = json.loads(fp.read())
object_type_vals

{'BusinessTravel': ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently'],
 'EducationField': ['Medical',
  'Life Sciences',
  'Other',
  'Marketing',
  'Technical Degree',
  'Human Resources'],
 'JobRole': ['Laboratory Technician',
  'Manufacturing Director',
  'Sales Executive',
  'Research Scientist',
  'Sales Representative',
  'Healthcare Representative',
  'Research Director',
  'Human Resources',
  'Manager'],
 'OverTime': ['No', 'Yes']}

In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder that returns a dense array, drops the first category of each
# feature, and is configured to raise on categories it has not been fitted on.
enc = OneHotEncoder(drop='first', sparse=False, handle_unknown='error')

In [13]:
# Fit the encoder on the first object-dtype column only. The double brackets
# select a one-column DataFrame (2-D) rather than a Series (1-D).
enc.fit(df_cleaned[[object_type[0]]])

OneHotEncoder(categories='auto', drop='first', dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

In [15]:
# Labels for the one-hot columns of BusinessTravel.
# sorted() returns a new list, so the value list stored in object_type_vals is
# left untouched; sorting and prefixing that list in place would corrupt the
# dict and make a re-run of this cell produce doubly prefixed labels
# ('BusinessTravel_BusinessTravel_...').
# The first value in sorted order is skipped because the encoder is created with
# drop='first' and therefore emits no column for it.
ncols = ['BusinessTravel_' + value for value in sorted(object_type_vals['BusinessTravel'])][1:]

In [16]:
# Encoded columns for the first object-dtype column, labelled with `ncols`.
# The frame reuses df_cleaned's index instead of a fresh 0..n-1 RangeIndex, so
# each encoded row stays attached to its source row when the two frames are
# combined by index.
x = pd.DataFrame(
    enc.transform(df_cleaned[[object_type[0]]]),
    columns=ncols,
    index=df_cleaned.index,
)
x

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely
0,0.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
1623,1.0,0.0
1624,0.0,1.0
1625,0.0,1.0
1626,0.0,1.0


In [17]:
# Append the one-hot columns to the cleaned frame. join() without `on` matches
# rows index-to-index, which is how x lines up with df_cleaned. Passing an index
# object as `on` makes pandas treat it as an ad-hoc key array, which can leave an
# extra key column (e.g. 'key_0') in the result.
df_new = df_cleaned.join(x)

In [18]:
# Show the original column next to the one-hot columns derived from it.
df_new.loc[:, ['BusinessTravel',
               'BusinessTravel_Travel_Frequently',
               'BusinessTravel_Travel_Rarely']]

Unnamed: 0,BusinessTravel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely
0,Non-Travel,0.0,0.0
1,Travel_Rarely,0.0,1.0
2,Travel_Rarely,0.0,1.0
3,Travel_Rarely,0.0,1.0
4,Travel_Rarely,0.0,1.0
...,...,...,...
1623,Travel_Frequently,1.0,0.0
1624,Travel_Rarely,0.0,1.0
1625,Travel_Rarely,0.0,1.0
1626,Travel_Rarely,0.0,1.0
