### Imports

In [3]:
import pandas as pd
from scipy.stats import wilcoxon
from scipy.stats import friedmanchisquare
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
from scipy.stats import chi2_contingency
from scipy.stats import ttest_1samp
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind

In [9]:
# remove null and duplicates
df = pd.read_csv('general_data.csv')
df = df.head().dropna().drop_duplicates()
df

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [10]:
df.describe()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,36.8,9.0,2.6,1.0,3.0,2.0,94592.0,1.8,14.4,8.0,1.8,6.8,3.6,5.0,1.6,3.2
std,8.408329,5.567764,1.81659,0.0,1.581139,1.414214,69009.137583,1.643168,5.07937,0.0,1.30384,4.494441,1.81659,2.54951,3.04959,1.923538
min,31.0,2.0,1.0,1.0,1.0,1.0,23420.0,0.0,11.0,8.0,0.0,1.0,2.0,1.0,0.0,0.0
25%,32.0,6.0,1.0,1.0,2.0,1.0,41890.0,1.0,11.0,8.0,1.0,5.0,2.0,5.0,0.0,3.0
50%,32.0,10.0,2.0,1.0,3.0,1.0,83210.0,1.0,12.0,8.0,2.0,6.0,3.0,5.0,0.0,4.0
75%,38.0,10.0,4.0,1.0,4.0,3.0,131160.0,3.0,15.0,8.0,3.0,9.0,5.0,6.0,1.0,4.0
max,51.0,17.0,5.0,1.0,5.0,4.0,193280.0,4.0,23.0,8.0,3.0,13.0,6.0,8.0,7.0,5.0


In [11]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [12]:
df_copy = df.copy()

In [15]:
df_copy['Attrition'] = df_copy['Attrition'].map({'Yes': 1, 'No': 1})
df_copy['BusinessTravel'] = df_copy['BusinessTravel'].map({'Non-Travel':1, 'Travel_Rarely':2, 'Travel_Frequently':3})

df_copy['MaritalStatus'] = df_copy['MaritalStatus'].map({'Single':1, 'Married':2, 'Divorced':3})

df_copy['Department'] = df_copy['Department'].map({'Sales':1, 'Research & Development':2, 'Human Resources':3})

df_copy['EducationField'] = df_copy['EducationField'].map({'Life Sciences':1, 'Medical':2, 
                                                             'Marketing':3, 'Technical Degree':4, 
                                                             'Human Resources':5, 'Other':6})

df_copy['Gender'] = df_copy['Gender'].map({'Male':1, 'Female':2})

df_copy['JobRole'] = df_copy['JobRole'].map({'Healthcare Representative':1, 
                                              'Research Scientist':2, 'Sales Executive':3, 
                                              'Human Resources':4, 'Research Director':5, 
                                              'Manufacturing Director':6 ,'Manager':7,
                                              'Sales Representative':8 ,'Laboratory Technician':9})

In [17]:
df_copy[['Attrition','BusinessTravel','MaritalStatus','Department','EducationField','Gender','JobRole']].head(10)

Unnamed: 0,Attrition,BusinessTravel,MaritalStatus,Department,EducationField,Gender,JobRole
0,1,2,2,1,1,2,1
1,1,3,1,2,1,2,2
2,1,3,2,2,6,1,3
3,1,1,2,2,1,1,4
4,1,2,1,2,2,1,3


## Non-Parametric Tests

### 1. wilcoxon - sign test

In [24]:
distanceFromHome1 = df_copy['DistanceFromHome'][:df_copy.shape[0]//2]
distanceFromHome2 = df_copy['DistanceFromHome'][df_copy.shape[0]//2 + 1:]

stat,p = wilcoxon(distanceFromHome1,distanceFromHome2)
print(stat,p)

0.0 0.31731050786291415


### 2. Friedman test

In [30]:
distanceFromHome1 = df_copy['DistanceFromHome'][:df_copy.shape[0]//3]
distanceFromHome2 = df_copy['DistanceFromHome'][df_copy.shape[0]//3*2+2:]
distanceFromHome3 = df_copy['DistanceFromHome'][df_copy.shape[0]//3*2+2:]

stat,p = friedmanchisquare(distanceFromHome1,distanceFromHome2,distanceFromHome3)
print(stat,p)

2.0 0.36787944117144245


### 3. Mann whitney test

In [37]:
yearsWithCurrManager = df_copy['YearsWithCurrManager']
education = df_copy['Education']

stat,p = mannwhitneyu(yearsWithCurrManager,education)
print(stat,p)

10.5 0.37478852569204263


### 4. kruskal wallis test

In [34]:
yearsWithCurrManager = df_copy['YearsWithCurrManager']
education = df_copy['Education']
numCompaniesWorked = df_copy['NumCompaniesWorked']

stat,p = kruskal(yearsWithCurrManager,education,numCompaniesWorked)
print(stat,p)

1.5694599627560473 0.45624287922832485


### 5. chi-sqaure Test

In [40]:
department = df_copy['Department']
businessTravel = df_copy['BusinessTravel']

chitable = pd.crosstab(department,businessTravel)
stat,p,dof,exepected = chi2_contingency(chitable)

print(stat,p)

1.875 0.39160562667679893


## Parametric Test

In [41]:
### 1. one sample t test

In [46]:
stats,p = ttest_1samp(df_copy.MonthlyIncome,8000) ##Assumed pop mean value: 80000 against monthly income
print(stats,p)

2.8057965233046493 0.0485241335645729


In [42]:
### 2. two sample t test

In [50]:
stats,p = ttest_rel(df_copy.YearsAtCompany, df_copy.YearsSinceLastPromotion) ##Considering YearAtCompany vs YearsSinceLastPromotion
print(stats,p)

3.3023719320146983 0.02986696837235546


In [47]:
### 3. two sample independent t test

In [51]:
# Not possible with given dataset 