In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

df = pd.read_csv('Data Final_Employee Attrition.csv')

In [2]:
#Data Description
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         1470 non-null   int64 
 1   Attrition                   1470 non-null   object
 2   BusinessTravel              1470 non-null   object
 3   Daily Rate                  1470 non-null   int64 
 4   Department                  1470 non-null   object
 5   Distance From Home          1470 non-null   int64 
 6   Education                   1470 non-null   int64 
 7   Education Field             1470 non-null   object
 8   Employee Number             1470 non-null   int64 
 9   Environment Satisfaction    1470 non-null   int64 
 10  Gender                      1470 non-null   object
 11  Hourly Rate                 1470 non-null   int64 
 12  Job Involvement             1470 non-null   int64 
 13  Job Level                   1470 non-null   int6

In [3]:
#Renaming columns - Lowercase and underscore column names
df = df.rename(columns={'BusinessTravel':'Business Travel'})
import re
df = df.rename(columns=lambda col: re.sub(r"\s+", "_", col.lower()))
df.head()

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,...,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_current_manager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,...,4,80,1,6,3,3,2,2,2,2


In [None]:
def show_all_variable_values(df):
    variable_values = {}
    for column_name in df.columns:
        unique_values = df[column_name].unique()
        variable_values[column_name] = unique_values
    return variable_values

all_variable_values = show_all_variable_values(df)
for column_name, unique_values in all_variable_values.items():
    print(f"Values of variable '{column_name}':")
    print(unique_values)
    print()

Values of variable 'age':
[41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]

Values of variable 'attrition':
['Yes' 'No']

Values of variable 'business_travel':
['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']

Values of variable 'daily_rate':
[1102  279 1373 1392  591 1005 1324 1358  216 1299  809  153  670 1346
  103 1389  334 1123 1219  371  673 1218  419  391  699 1282 1125  691
  477  705  924 1459  125  895  813 1273  869  890  852 1141  464 1240
 1357  994  721 1360 1065  408 1211 1229  626 1434 1488 1097 1443  515
  853 1142  655 1115  427  653  989 1435 1223  836 1195 1339  664  318
 1225 1328 1082  548  132  746  776  193  397  945 1214  111  573 1153
 1400  541  432  288  669  530  632 1334  638 1093 1217 1353  120  682
  489  807  827  871  665 1040 1420  240 1280  534 1456  658  142 1127
 1031 1189 1354 1467  922  394 1312  750  441  684  249  841  147  528
  594  470  957  542  802 1355 11

In [None]:
#Find irrelevant data
def find_irrelevant_data(df):
    filtered_data = pd.DataFrame()
    
    for column_name in df.columns:
        if column_name == 'age':
            # Check for age values outside a reasonable range
            filtered_column = df[(df[column_name] <= 18) | (df[column_name] > 60)]
            
        elif column_name in ['attrition', 'over_time']:
            # Check for attrition and over_time values that are not in the expected categories
            expected_categories = ['Yes', 'No']
            filtered_column = df[~df[column_name].isin(expected_categories)]
            
        elif column_name in ['environment_satisfaction', 'job_involvement', 'job_satisfaction', 'relationship_satisfaction', 'work_life_balance']:
            expected_categories = [1, 2, 3, 4]
            filtered_column = df[~df[column_name].isin(expected_categories)]

        elif column_name in ['education', 'job_level']:
            expected_categories = [1, 2, 3, 4, 5]
            filtered_column = df[~df[column_name].isin(expected_categories)]
            
        elif column_name == 'gender':
            expected_categories = ['Male', 'Female']
            filtered_column = df[~df[column_name].isin(expected_categories)]
        
        elif column_name == 'stock_option_level':
            expected_categories = [0, 1, 2, 3]
            filtered_column = df[~df[column_name].isin(expected_categories)]
        
        else:
          return filtered_data

irrelevant_data = find_irrelevant_data(df)
print(irrelevant_data)

Empty DataFrame
Columns: []
Index: []


In [None]:
#Find dupplicates
def find_duplicates(df):
    duplicate_rows = df[df.duplicated()]
    return duplicate_rows

duplicate_rows = find_duplicates(df)
print(duplicate_rows)

Empty DataFrame
Columns: [age, attrition, business_travel, daily_rate, department, distance_from_home, education, education_field, employee_number, environment_satisfaction, gender, hourly_rate, job_involvement, job_level, job_role, job_satisfaction, marital_status, monthly_income, monthly_rate, num_companies_worked, over_18, over_time, percent_salary_hike, performance_rating, relationship_satisfaction, standard_hours, stock_option_level, total_working_years, training_times_last_year, work_life_balance, years_at_company, years_in_current_role, years_since_last_promotion, years_with_current_manager]
Index: []

[0 rows x 34 columns]


In [None]:
#Find missing values
df.isnull().sum()

age                           0
attrition                     0
business_travel               0
daily_rate                    0
department                    0
distance_from_home            0
education                     0
education_field               0
employee_number               0
environment_satisfaction      0
gender                        0
hourly_rate                   0
job_involvement               0
job_level                     0
job_role                      0
job_satisfaction              0
marital_status                0
monthly_income                0
monthly_rate                  0
num_companies_worked          0
over_18                       0
over_time                     0
percent_salary_hike           0
performance_rating            0
relationship_satisfaction     0
standard_hours                0
stock_option_level            0
total_working_years           0
training_times_last_year      0
work_life_balance             0
years_at_company              0
years_in

In [None]:
from scipy.stats import skew, kurtosis
from scipy.stats import sem

def standard_error_skewness(df):
    n = len(df)
    return np.sqrt(6 * n * (n - 1) / ((n - 2) * (n + 1) * (n + 3)))

def standard_error_kurtosis(df):
    n = len(df)
    return np.sqrt((24 * n * (n - 2) * (n - 3)) / ((n + 1) * (n + 3) * (n + 5) * (n + 7)))

numeric_columns = df.select_dtypes(include=[np.number])

statistics = pd.DataFrame()

statistics['count'] = numeric_columns.count()
statistics['mean'] = numeric_columns.mean()
statistics['median'] = numeric_columns.median()
statistics['mode'] = numeric_columns.mode().iloc[0]
statistics['skewness'] = numeric_columns.skew()
statistics['std_err_skewness'] = sem(numeric_columns, axis=0)
statistics['kurtosis'] = numeric_columns.kurtosis()
statistics['std_err_kurtosis'] = sem(numeric_columns**4, axis=0)

statistics

Unnamed: 0,count,mean,median,mode,skewness,std_err_skewness,kurtosis,std_err_kurtosis
age,1470,36.92381,36.0,35.0,0.413286,0.238269,-0.404145,67963.0
daily_rate,1470,802.485714,802.0,691.0,-0.003519,10.524335,-1.203823,35615940000.0
distance_from_home,1470,9.192517,7.0,2.0,0.958118,0.211443,-0.224833,4072.966
education,1470,2.912925,3.0,3.0,-0.289681,0.026712,-0.559115,3.463312
employee_number,1470,1024.865306,1020.5,1.0,0.016574,15.702015,-1.223179,127253900000.0
environment_satisfaction,1470,2.721769,3.0,3.0,-0.321654,0.02851,-1.202521,2.702715
hourly_rate,1470,65.891156,66.0,66.0,-0.032311,0.530233,-1.196398,748165.8
job_involvement,1470,2.729932,3.0,3.0,-0.498419,0.018559,0.270999,1.735294
job_level,1470,2.063946,2.0,1.0,1.025401,0.028871,0.399152,3.668793
job_satisfaction,1470,2.728571,3.0,4.0,-0.329672,0.028764,-1.222193,2.726648
