In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("conspan_clean.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1172 entries, 0 to 1171
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Employee ID               1172 non-null   int64  
 1   Age                       1172 non-null   float64
 2   Department                1172 non-null   object 
 3   DistanceFromHome          1172 non-null   int64  
 4   Education                 1172 non-null   float64
 5   EnvironmentSatisfaction   1172 non-null   float64
 6   Gender                    1172 non-null   object 
 7   JobInvolvement            1172 non-null   int64  
 8   JobLevel                  1172 non-null   int64  
 9   JobRotation               1172 non-null   int64  
 10  FeedbackFromManager       1172 non-null   object 
 11  Mentoring                 1172 non-null   object 
 12  Autonomy                  1172 non-null   object 
 13  JobRole                   1172 non-null   object 
 14  MonthlyI

In [4]:
unimp_cols = ['WorkLifeBalance', 'JobInvolvement', 'RelationshipSatisfaction', 'Age', 'Gender', 'Education', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'NumCompaniesWorked', 'PercentSalaryHike', 'DistanceFromHome', 'Employee ID']

In [5]:
df = df.drop(columns = unimp_cols)

In [7]:
X = df.drop(columns=['Performance'])
X = pd.get_dummies(X, drop_first=True)

In [8]:
corr = X.corr(numeric_only=True)
pairs = (
    corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        .stack()                                # Series with MultiIndex (feat1, feat2)
        .rename_axis(['feat1', 'feat2'])        # give index level names
        .reset_index(name='corr')               # move index to columns; name the Series as 'corr'
        .assign(abs_corr=lambda d: d['corr'].abs())
        .query('abs_corr >= 0.8')
        .sort_values('abs_corr', ascending=False)
)
print(pairs.head(10))

                          feat1                    feat2      corr  abs_corr
204       Department_Production       JobRole_Machinists  0.881837  0.881837
29                     JobLevel            MonthlyIncome  0.880890  0.880890
161  Department_Human Resources  JobRole_Human Resources  0.878168  0.878168


In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_num = X.select_dtypes(include=[np.number]).dropna(axis=0)
vif = pd.DataFrame({
    'feature': X_num.columns,
    'VIF': [variance_inflation_factor(X_num.values, i) for i in range(X_num.shape[1])]
}).sort_values('VIF', ascending=False)
print(vif.head(10))

                   feature        VIF
1                 JobLevel  21.299089
3            MonthlyIncome  18.164580
4        TotalWorkingYears   6.997190
2              JobRotation   4.334379
0  EnvironmentSatisfaction   4.065041
5     YearsWithCurrManager   3.266625
