##  Data Processing
### Import Packages 

In [10]:
# data analysis and wrangling
import pandas as pd
import numpy as np


# Silence every warning for the whole kernel session so that library
# warnings do not clutter the cell outputs.
# NOTE(review): the filter has no category/module restriction, so it also
# hides deprecation and dtype warnings - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

### Acquire data

In [11]:
# Load the employee-performance workbook into a DataFrame.
# The path has no directory component, so the .xls file must be in the
# notebook's working directory.
file = r'INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls'
original_data = pd.read_excel(file)

# Preview only the first rows instead of rendering the entire frame.
original_data.head(10)

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3
5,E1001011,27,Male,Life Sciences,Divorced,Development,Developer,Travel_Frequently,10,2,...,3,9,4,2,9,7,1,7,No,4
6,E1001016,50,Male,Marketing,Married,Sales,Sales Representative,Travel_Rarely,8,4,...,4,4,2,3,2,2,2,2,No,3
7,E1001019,28,Female,Life Sciences,Single,Development,Developer,Travel_Rarely,1,2,...,4,10,4,3,7,7,3,7,Yes,3
8,E1001020,36,Female,Life Sciences,Married,Development,Developer,Non-Travel,8,3,...,1,10,2,3,8,7,0,5,No,3
9,E1001021,38,Female,Life Sciences,Single,Development,Developer,Travel_Rarely,1,3,...,4,10,4,4,1,0,0,0,No,3


### Analyze Features by describing data

In [12]:
# Print the column labels (as a NumPy array) to list the features present in the data set.
print(original_data.columns.values)

['EmpNumber' 'Age' 'Gender' 'EducationBackground' 'MaritalStatus'
 'EmpDepartment' 'EmpJobRole' 'BusinessTravelFrequency' 'DistanceFromHome'
 'EmpEducationLevel' 'EmpEnvironmentSatisfaction' 'EmpHourlyRate'
 'EmpJobInvolvement' 'EmpJobLevel' 'EmpJobSatisfaction'
 'NumCompaniesWorked' 'OverTime' 'EmpLastSalaryHikePercent'
 'EmpRelationshipSatisfaction' 'TotalWorkExperienceInYears'
 'TrainingTimesLastYear' 'EmpWorkLifeBalance'
 'ExperienceYearsAtThisCompany' 'ExperienceYearsInCurrentRole'
 'YearsSinceLastPromotion' 'YearsWithCurrManager' 'Attrition'
 'PerformanceRating']


### Shape of the Data 

The data set has 28 features (columns) and 1,200 employee records (rows).

In [13]:
original_data.shape  # (number of rows, number of columns)

(1200, 28)

### Data types of the features

In [14]:
# Print a per-column summary: the non-null count and dtype of every feature,
# together with the index range and the total number of columns.
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 28 columns):
EmpNumber                       1200 non-null object
Age                             1200 non-null int64
Gender                          1200 non-null object
EducationBackground             1200 non-null object
MaritalStatus                   1200 non-null object
EmpDepartment                   1200 non-null object
EmpJobRole                      1200 non-null object
BusinessTravelFrequency         1200 non-null object
DistanceFromHome                1200 non-null int64
EmpEducationLevel               1200 non-null int64
EmpEnvironmentSatisfaction      1200 non-null int64
EmpHourlyRate                   1200 non-null int64
EmpJobInvolvement               1200 non-null int64
EmpJobLevel                     1200 non-null int64
EmpJobSatisfaction              1200 non-null int64
NumCompaniesWorked              1200 non-null int64
OverTime                        1200 non-null object
E

### Distribution of Categorical features

In [15]:
# Summarise the object-dtype (categorical) columns only: count, number of
# unique values, most frequent value (top) and its frequency (freq).
original_data.describe(include=['O'])

Unnamed: 0,EmpNumber,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,OverTime,Attrition
count,1200,1200,1200,1200,1200,1200,1200,1200,1200
unique,1200,2,6,3,6,19,3,2,2
top,E1002156,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Rarely,No,No
freq,1,725,492,548,373,270,846,847,1022


###  Distribution of Numerical features


In [16]:
# Summarise the numeric columns: count, mean, standard deviation, minimum,
# quartiles (25%, 50%, 75%) and maximum.
original_data.describe()

Unnamed: 0,Age,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,EmpJobLevel,EmpJobSatisfaction,NumCompaniesWorked,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,PerformanceRating
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,36.918333,9.165833,2.8925,2.715833,65.981667,2.731667,2.0675,2.7325,2.665,15.2225,2.725,11.33,2.785833,2.744167,7.0775,4.291667,2.194167,4.105,2.948333
std,9.087289,8.176636,1.04412,1.090599,20.211302,0.707164,1.107836,1.100888,2.469384,3.625918,1.075642,7.797228,1.263446,0.699374,6.236899,3.613744,3.22156,3.541576,0.518866
min,18.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,0.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
25%,30.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,1.0,12.0,2.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0,3.0
50%,36.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,2.0,14.0,3.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0,3.0
75%,43.0,14.0,4.0,4.0,83.0,3.0,3.0,4.0,4.0,18.0,4.0,15.0,3.0,3.0,10.0,7.0,3.0,7.0,3.0
max,60.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,9.0,25.0,4.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0,4.0


### Data Cleaning 

In [17]:
# isna() flags each missing cell; .values.any() collapses that boolean frame
# into a single flag that is True if at least one value is missing.
original_data.isna().values.any()

False

In [18]:
# DataFrame.isnull() is an alias of DataFrame.isna(), so this cell repeats
# the same missing-value check and is expected to give the same result.
original_data.isnull().values.any()

False

There are no NaN or null values present in the data set.