In [6]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import ttest_1samp , ttest_ind , mannwhitneyu, levene , shapiro , bartlett
from scipy.stats import chisquare,chi2_contingency , f_oneway
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df1 = pd.read_csv("HR.csv")

In [7]:
df1.shape

(1470, 35)

In [8]:
df1.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [9]:
df1.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [10]:
#Attrition rate means employee leaving the organization
df1["Attrition"].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [11]:
df1["Gender"].value_counts()

Male      882
Female    588
Name: Gender, dtype: int64

In [13]:
#Q1) check whether there is gender impact on attrition or not

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
No,501,732
Yes,87,150


In [None]:
#step 1: generate cross tab
ct = pd.crosstab(df1["Attrition"],df1["Gender"])
ct

In [14]:
#step 2: decide whther its a test of mean or test of proportion
#Categorical(attrition) vs categorical(gender) ==> test of proportion
from statsmodels.stats.proportion import proportions_ztest

In [15]:
p1 = 87/588
p2 = 150/882
p1,p2
#p1 is 41% females left the organization
#p2 is 17% males left the organization

(0.14795918367346939, 0.17006802721088435)

In [None]:
#step 3: Hypothesis
#H0:  p1  = p2
#Ha:  p1 != P2

In [17]:
#step 4: define x-array and n-array  to pass them as parameter to proportions_ztest()
#X-array = numerator of p1 and p2
#y-array = denominator of p1 and p2
x  = np.array([87,150])
n  = np.array([588,882])

In [18]:
#step 5: perform the test and check the pvalue
proportions_ztest(x,n)

(-1.1292547809155016, 0.2587903704911598)

In [None]:
#Pval > 0.25 , we fail to reject H0 , Gender is not influencing the atrrition

In [None]:
#Q2) check whether atrrition is influenced by department

In [19]:
#step 1: create crosstab
ct = pd.crosstab(df1["Attrition"],df1["Department"])
ct

Department,Human Resources,Research & Development,Sales
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,51,828,354
Yes,12,133,92


In [21]:
#step 2 : decide the test 
df1["Department"].value_counts()

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

In [None]:
#As department has more than 2 categories we will go for chi square

In [None]:
#step 3: hypothesis
#H0  : proprotion_attr_hr(yes) = proprotion_attr_r&d(yes) = proprotion_attr_sales(yes)
#Ha  : proprotion_attr_r&d != proprotion_attr_sales != proprotion_attr_hr

In [23]:
#step 4: perform the test
p1 = 12/63
p2 = 133/961
p3 = 92/446
print(p1,p2,p3)
x = np.array([12,133,92])
n = np.array([63,961,446])
chisquare(x,n)

0.19047619047619047 0.1383975026014568 0.2062780269058296


Power_divergenceResult(statistic=1035.6701606070444, pvalue=1.2796231495103994e-225)

In [None]:
#PVal < 0.05 , we reject H0 , attrition rate is different in different departments

In [None]:
#SUBQUESTION) Identify which department more employees are leaving

In [24]:
print(p1,p2,p3)

0.19047619047619047 0.1383975026014568 0.2062780269058296


In [None]:
#Department Research and development manager is very good because attrition rate is very low
#as compared to other 2 departments

In [None]:
#Q3)IS ther any disperency in monthly income avg with respect to gender? 

In [None]:
#Whenever you encounter atleast 1 numerical column  2 samp proportiontest and chisquare is ruled out
#step 1: Income is continous so 2 samp proportiontest and chisquare is ruled out

In [35]:
g1 = df1["MonthlyIncome"][df1["Gender"] == "Male"]
g2 = df1["MonthlyIncome"][df1["Gender"] == "Female"]

In [36]:
g1.mean(),g2.mean()

(6380.507936507936, 6686.566326530612)

In [37]:
#step 2: Apply shapiro test
#H0 :  g1,g2  = Normal distribution
#Ha :  g1,g2  != Normal distribution
shapiro(g1) , shapiro(g2)

(ShapiroResult(statistic=0.8169718384742737, pvalue=1.709933327621381e-30),
 ShapiroResult(statistic=0.8416616916656494, pvalue=9.378564564919968e-24))

In [None]:
#Pval of g1,g2  < 0.05 , we reject H0, g1 ,g1 has skewness

In [38]:
#step 3: Decide the tes
#From above inference we need to apply non parametric independent test mannwhtineyu
mannwhitneyu(g1,g2)

MannwhitneyuResult(statistic=245722.5, pvalue=0.044208341633010594)

In [None]:
#pval < 0.05 , reject Ho , means there is difference in the salary of male and female

In [None]:
#Q4) whteher particular dept getting more salary

In [40]:
g1 = df1["MonthlyIncome"][df1["Department"] == "Research & Development"]
g2 = df1["MonthlyIncome"][df1["Department"] == "Sales"]
g3 = df1["MonthlyIncome"][df1["Department"] == "Human Resources"]

In [43]:
f_oneway(g1,g2,g3)

F_onewayResult(statistic=3.2017829294201707, pvalue=0.04097409724987449)

In [None]:
#pbal < 0.05 , reject H0 , means Each departments has different incomes

In [44]:
#which department is getting more salary
g1.mean(),g2.mean(),g3.mean()

(6281.252861602497, 6959.17264573991, 6654.507936507936)

In [None]:
# g1 that is R&D has highest salary

In [None]:
#Q5)Whether monthlyincome is affected by the education

In [46]:
df1["Education"].value_counts()

3    572
4    398
2    282
1    170
5     48
Name: Education, dtype: int64

In [48]:
g1 = df1["MonthlyIncome"][df1["Education"]== 1]
g2 = df1["MonthlyIncome"][df1["Education"]== 2]
g3 = df1["MonthlyIncome"][df1["Education"]== 3]
g4 = df1["MonthlyIncome"][df1["Education"]== 4]
g5 = df1["MonthlyIncome"][df1["Education"]== 5]

In [None]:
#there is a significant difference in avg sal wrt to education

# Ridge , Lasso , ElasticNet

In [7]:
df2 = pd.read_csv("Boston.csv",index_col = 0)

In [8]:
df2.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [9]:
df2.shape

(506, 14)

In [11]:
x = df2.drop("medv" , axis = 1)
y = df2["medv"]

In [4]:
from sklearn.linear_model import Ridge , Lasso , ElasticNet
from sklearn.linear_model import LinearRegression

In [5]:
m1 = LinearRegression()
m2 = Ridge(alpha = 0.5 , normalize = True) 
#normalize means scaling the data
#Why we need normalize , to convert all the features in one scale for the comparision



In [12]:
x.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [13]:
x.shape

(506, 13)

In [None]:
#refer day 2

5