In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
from sklearn.utils import resample
#from sklearn.model_selection import train_test_split

In [2]:
# Load the raw HR attrition table; later cells read `df` to build labels and features.
df = pd.read_csv('hr_attrition.csv')

In [3]:
# Pay-related columns. Not referenced by any other visible cell.
wages = ['DailyRate', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate']

# Columns that get_dummies() one-hot encodes (7 columns).
to_dummy = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]

# Rating / level columns (9 columns); presumably integer-coded - confirm
# against the CSV. They are appended to `to_continous` below so that they
# are scaled together with the numeric columns.
categorical = [
    'Education',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'WorkLifeBalance',
]

# Columns handed to get_scaled(): 11 numeric columns followed by the 9
# `categorical` columns, 20 in total.
to_continous = [
    'Age',
    'DistanceFromHome',
    'HourlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
] + categorical

# Columns removed from the feature frame via drop_from_df(); includes the
# 'Attrition' column that get_labels() reads.
to_drop = [
    'DailyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'Attrition',
    'EmployeeCount',
    'Over18',
    'StandardHours',
    'EmployeeNumber',
]

In [4]:
###Get labels
def get_labels(df):
    """Encode the 'Attrition' column as a list of 0/1 integers.

    Args:
        df: DataFrame that contains an 'Attrition' column.

    Returns:
        list[int]: one entry per row, in row order; 1 where the value
        equals 'Yes' and 0 for any other value.
    """
    return [int(answer == 'Yes') for answer in df['Attrition']]
#labels = get_labels(df)

In [5]:
###Drop labels and others
def drop_from_df(df, to_drop):
    """Return a new frame equal to ``df`` without the ``to_drop`` columns.

    Args:
        df: source DataFrame; it is not modified.
        to_drop: column names to remove.

    Returns:
        DataFrame containing every column of ``df`` except ``to_drop``.
    """
    return df.drop(columns=to_drop)
#df = drop_from_df(df, to_drop)

In [6]:
###Get Dummies and drop dummy features
def get_dummies(df, columns=None):
    """Replace selected columns of ``df`` with their one-hot encoding.

    Args:
        df: source DataFrame; it is not modified.
        columns: names of the columns to encode. When omitted, the
            module-level ``to_dummy`` list is used, which is what the
            original single-argument call did.

    Returns:
        A new DataFrame holding the non-encoded columns of ``df`` followed
        by the indicator columns produced by ``pd.get_dummies``.
    """
    if columns is None:
        # Backward-compatible default: fall back to the notebook-level list.
        columns = to_dummy
    columns = list(columns)

    dummies = pd.get_dummies(df[columns])
    remaining = df.drop(columns=columns)

    return pd.concat([remaining, dummies], axis=1)

# One-hot encode the raw frame; get_dummies returns a new frame, so `df` keeps its original columns.
df_with_dummies = get_dummies(df)

In [7]:
# Row-count check after encoding (recorded output: 1470).
len(df_with_dummies)

1470

In [8]:
###Scale continous variables
def get_scaled(df, continous):
    """Min-max scale the ``continous`` columns into the range [0.05, 0.95].

    The scaler is fitted on the rows of ``df`` itself. The work is done on
    a copy, so the caller's frame is left unchanged; previously the scaled
    values were written straight into the frame that was passed in.

    Args:
        df: source DataFrame containing every column in ``continous``.
        continous: list of column names to scale.

    Returns:
        A new DataFrame with the ``continous`` columns replaced by their
        scaled values and all other columns carried over as they were.
    """
    scaled = df.copy()
    scaler = MinMaxScaler(feature_range=(0.05, 0.95))
    scaled[continous] = scaler.fit_transform(scaled[continous])

    return scaled

# Array of the encoded frame's column names; not read by any other visible cell.
all_columns = df_with_dummies.columns.values
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split made in a later cell, so test rows influence the scaling range.
dummy_scaled_df = get_scaled(df_with_dummies, to_continous)

In [10]:
###Drop labels and others
# Read the labels from dummy_scaled_df rather than from `df`: `df` is rebound
# on the next line to a frame without 'Attrition', so get_labels(df) would
# raise KeyError if this cell were executed a second time. dummy_scaled_df
# still carries the 'Attrition' column (it is only dropped below) with the
# same rows in the same order.
labels = get_labels(dummy_scaled_df)
# NOTE: this replaces the raw frame held in `df` with the feature frame.
df = drop_from_df(dummy_scaled_df, to_drop)

In [11]:
# Fraction of rows used for training. Rows are taken from the top of the
# table in file order; nothing is shuffled.
TRAIN_FRACTION = .8

# `labels` is overwritten below, so executing this cell twice would split the
# already-truncated training labels again. Fail loudly instead.
assert len(labels) == len(df), (
    'labels and features differ in length - rebuild `labels` and `df` before splitting'
)

split_num = int(len(labels) * TRAIN_FRACTION)
labels, test_labels = labels[:split_num], labels[split_num:]
# .iloc makes the positional row slicing explicit.
features, test_features = df.iloc[:split_num], df.iloc[split_num:]

In [12]:
# Size of the held-out split (recorded output: 294).
len(test_features)

294

In [14]:
# Each split must pair exactly one label with every feature row.
assert len(features) == len(labels)
assert len(test_features) == len(test_labels)

In [19]:
# Wrap the label lists in single-column frames so they can be written with to_csv.
labels = pd.DataFrame(labels)
test_labels = pd.DataFrame(test_labels)

# Write the train and test splits; index=False keeps the row index out of the files.
for split_frame, csv_name in (
    (labels, 'hr_labels.csv'),
    (features, 'hr_features.csv'),
    (test_labels, 'hr_test_labels.csv'),
    (test_features, 'hr_test_features.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [29]:
def main():
    """Run the preprocessing pipeline on the full dataset and write CSVs.

    Reads 'hr_attrition.csv', extracts the 0/1 attrition labels, drops the
    ``to_drop`` columns, one-hot encodes with get_dummies() (which uses the
    module-level ``to_dummy`` list), scales the ``to_continous`` columns
    with get_scaled(), and writes the result to 'hr_labels1.csv' and
    'hr_features1.csv'. No train/test split is made here.

    Raises:
        AssertionError: if the label and feature row counts differ.
    """
    # Columns passed to get_scaled().
    to_continous = ['Age', 'DistanceFromHome', 'HourlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
                    'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
                    'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Education', 'EnvironmentSatisfaction',
                    'JobInvolvement', 'JobLevel', 'JobSatisfaction',
                    'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'WorkLifeBalance']

    # Columns removed before encoding; includes the 'Attrition' label column.
    to_drop = ['DailyRate', 'MonthlyIncome', 'MonthlyRate', 'Attrition',
               'EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']

    df = pd.read_csv('hr_attrition.csv')

    # Labels must be read before 'Attrition' is dropped from the frame.
    labels = pd.DataFrame(get_labels(df))

    df = drop_from_df(df, to_drop)
    df_with_dummies = get_dummies(df)
    features = get_scaled(df_with_dummies, to_continous)

    assert len(labels) == len(features)
    labels.to_csv('hr_labels1.csv', index=False)
    features.to_csv('hr_features1.csv', index=False)

In [30]:
# Run the full pipeline when this code is executed as the top-level module
# (which is also the case for a cell executed in a notebook kernel).
if __name__ == '__main__':
    main()