# **The Problem & Business Importance**





# **Data Identification & Analysis**

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import joblib

from KNN_D import KNN_D

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV,train_test_split , KFold
from sklearn.linear_model import LogisticRegression , RidgeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Read the raw export, then convert every date-like column to datetime so
# date arithmetic can be performed on it.
df = pd.read_csv('i4talent_dataset.csv')
for date_column in ('datum', 'geboortedatum', 'indiensttreding_datum'):
    df[date_column] = pd.to_datetime(df[date_column])


In [None]:
# Preview the first rows to sanity-check the load and the parsed date columns.
df.head()

In [None]:
# Summary statistics for the columns that describe() selects by default.
df.describe()

In [None]:
# Tally the missing values in each column before any imputation is applied.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Impute missing values.
# Age and tenure are derived from the snapshot date: whole years between
# `datum` and the birth / hire date (365-day years, floor division).
df['leeftijd'] = df['leeftijd'].fillna((df['datum'] - df['geboortedatum']).dt.days // 365)
df['lengte_dienst'] = df['lengte_dienst'].fillna((df['datum'] - df['indiensttreding_datum']).dt.days // 365)

# Categorical gaps are filled with the most frequent value.
# `Series.mode()` returns a Series, and `fillna(<Series>)` aligns on the index,
# so passing it directly fills (at most) the row labelled 0 and leaves every
# other gap untouched. Use the first modal value as a scalar instead.
for categorical_column in ('stad', 'afdeling'):
    modal_values = df[categorical_column].mode()
    if not modal_values.empty:  # an all-NaN column has no mode; leave it as is
        df[categorical_column] = df[categorical_column].fillna(modal_values.iloc[0])

# Re-check: the four imputed columns should now report zero missing values.
print(df.isna().sum())

In [None]:
# Distribution check for the numeric columns that had missing values:
# one 100-bin histogram per column.
columns_numerical_with_nas = ['leeftijd', 'lengte_dienst']
for col in columns_numerical_with_nas:
    fig, ax = plt.subplots()
    ax.hist(df[col], bins=100)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Columns removed before modelling (identifiers, raw dates, termination
# details and the redundant text version of gender).
columns_to_delete = [
    'geboortedatum',
    'WerknemerID',
    'uitdiensttreding_datum',
    'indiensttreding_datum',
    'geslacht',
    'uitdiensttreding_type',
    'datum',
    'STATUS_JAAR',
    'uitdiensttreding_reden',
]
df_new = df.drop(columns_to_delete, axis='columns')


In [None]:
# One-hot encode the categorical columns as 0/1 integers; the first level of
# each column is dropped to serve as the reference category.
categorical_columns = ['stad', 'afdeling', 'geslacht_id', 'STATUS', 'BUSINESS_UNIT']
df_with_dummies = pd.get_dummies(
    df_new,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)
df_with_dummies.columns

Dropped columns: geboortedatum, WerknemerID, uitdiensttreding_datum, indiensttreding_datum, geslacht, uitdiensttreding_type, datum, STATUS_JAAR, uitdiensttreding_reden

Dummy-encoded columns: 'stad', 'afdeling', 'geslacht_id', 'STATUS', 'BUSINESS_UNIT' (note: 'uitdiensttreding_reden' is dropped rather than dummy-encoded)

In [None]:
# Registry of per-model scores, keyed by model name; used later for model evaluation.
model_scores = dict()

# Pretrained models are available at https://github.com/dvanaanhout/GROUP5
# Downloading them saves the time it takes to retrain when running this file.

In [None]:
# Separate the target (the 'STATUS_Beëindigd' dummy) from the features and
# hold out 20% of the rows for testing, with a fixed seed for reproducibility.
target_column = 'STATUS_Beëindigd'
y = df_with_dummies[target_column]
X = df_with_dummies.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import joblib

# Load or Train Logistic Regression Model
# The fitted estimator is cached on disk so that re-running the notebook
# skips training when the file is already present.
saved_model_name = 'model_lr.joblib'

if os.path.exists(saved_model_name):
    # NOTE(security): joblib files are pickles and can execute arbitrary code
    # when loaded -- only load model files obtained from a trusted source.
    model_lr = joblib.load(saved_model_name)
else:
    model_lr = LogisticRegression()
    model_lr.fit(X_train, y_train)
    joblib.dump(model_lr, saved_model_name)

# Predictions and Model Scores
y_pred = model_lr.predict(X_test)

train_score = model_lr.score(X_train, y_train)
test_score = model_lr.score(X_test, y_test)

model_scores['Logistic Regression'] = {
    'Train Score': train_score,
    'Test Score': test_score
}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print(classification_report(y_test, y_pred))

# Feature Importance
# The absolute coefficient is used as the importance measure. The model is fit
# on the unscaled X_train, so magnitudes also reflect each feature's scale.
coefficients = model_lr.coef_[0]

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
})

feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance (Logistic Regression)')
plt.tight_layout()
plt.show()

# Multicollinearity Check
# Add a constant column so each VIF regression includes an intercept.
# `assign` returns a new frame, so the extra column can never end up in
# X_train itself (re-wrapping with pd.DataFrame(X_train, ...) and then
# mutating does not guarantee an independent copy).
X_train_vif = X_train.assign(Intercept=1)
vif_matrix = X_train_vif.values  # materialise once instead of once per feature

vif_data = pd.DataFrame({
    'Feature': X_train_vif.columns,
    'VIF': [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
})

# Remove intercept from VIF output
vif_data = vif_data[vif_data['Feature'] != 'Intercept']

# Display VIF DataFrame
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

# Optional: Plot VIF for visualization
plt.figure(figsize=(10, 6))
plt.barh(vif_data['Feature'], vif_data['VIF'])
plt.xlabel('VIF')
plt.title('Variance Inflation Factor (Multicollinearity)')
plt.tight_layout()
plt.show()

In [None]:
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Define the saved model name and parameter grid
saved_model_name = 'model_lr_TUNED.joblib'

# Inverse regularisation strengths to try (smaller C = stronger penalty).
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
}

# Train or load the tuned logistic regression model
if os.path.exists(saved_model_name):
    # NOTE(security): joblib files are pickles and can execute arbitrary code
    # when loaded -- only load model files obtained from a trusted source.
    model_lr_TUNED = joblib.load(saved_model_name)
else:
    grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    model_lr_TUNED = grid_search.best_estimator_
    joblib.dump(model_lr_TUNED, saved_model_name)

# Model predictions and evaluation
y_pred = model_lr_TUNED.predict(X_test)

train_score = model_lr_TUNED.score(X_train, y_train)
test_score = model_lr_TUNED.score(X_test, y_test)

model_scores['Logistic Regression TUNED'] = {
    'Train Score': train_score,
    'Test Score': test_score
}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print(classification_report(y_test, y_pred))

# Feature importance from logistic regression coefficients.
# The model is fit on the unscaled X_train, so coefficient magnitudes also
# reflect each feature's scale.
coefficients = model_lr_TUNED.coef_[0]

# Create a DataFrame to hold feature importance data
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
})

# Sort the features by their importance (absolute value of coefficients)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Tuned Logistic Regression Model')
plt.tight_layout()
plt.show()

# Optionally print the feature importance table
print("\nFeature Importance Table:")
print(feature_importance)

# Multicollinearity Analysis: Variance Inflation Factor (VIF)
# Add a constant column so each VIF regression includes an intercept.
# `assign` returns a new frame, so the extra column can never end up in
# X_train itself (re-wrapping with pd.DataFrame(X_train, ...) and then
# mutating does not guarantee an independent copy).
X_train_vif = X_train.assign(Intercept=1)
vif_matrix = X_train_vif.values  # materialise once instead of once per feature

# Compute VIF for each feature
vif_data = pd.DataFrame({
    'Feature': X_train_vif.columns,
    'VIF': [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
})

# Remove intercept from VIF results
vif_data = vif_data[vif_data['Feature'] != 'Intercept']

# Display VIF DataFrame
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

# Plot VIF for visualization
plt.figure(figsize=(10, 6))
plt.barh(vif_data['Feature'], vif_data['VIF'])
plt.xlabel('VIF')
plt.title('Variance Inflation Factor (Multicollinearity)')
plt.tight_layout()
plt.show()

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Calculate Variance Inflation Factor (VIF) to detect multicollinearity
def calculate_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. No constant column is added here; callers that need
        an intercept in the VIF regressions must add one themselves.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"Feature"`` (the column names of ``X``) and ``"VIF"``,
        in the column order of ``X``.
    """
    # Materialise the array once; `X.values` would otherwise be rebuilt for
    # every feature inside the comprehension.
    design_matrix = X.values
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design_matrix, i)
        for i in range(design_matrix.shape[1])
    ]
    return vif_data

# HistGradientBoostingClassifier exposes no `feature_importances_` attribute,
# so importance is estimated with permutation importance instead.
from sklearn.inspection import permutation_importance

# Step 2: Preprocessing and scaling
scaler = StandardScaler()

# Standardizing the features (fit on train only, then applied to test)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Handle Multicollinearity
print("Calculating VIF for initial features...")
vif_data = calculate_vif(pd.DataFrame(X_train_scaled, columns=X_train.columns))
print("VIF data before removing collinearity:")
print(vif_data)

# Dropping features with VIF > 5 (multicollinearity threshold)
vif_threshold = 5
high_vif_features = vif_data[vif_data["VIF"] > vif_threshold]["Feature"]
print(f"Features to drop due to multicollinearity (VIF > {vif_threshold}): {list(high_vif_features)}")

# Remove high VIF features from both training and test sets
X_train_reduced = X_train.drop(columns=high_vif_features)
X_test_reduced = X_test.drop(columns=high_vif_features)

# Step 4: Scaling the reduced feature set (the scaler is refit on the reduced columns)
X_train_scaled_reduced = scaler.fit_transform(X_train_reduced)
X_test_scaled_reduced = scaler.transform(X_test_reduced)

# Step 5: Define and train the HistGradientBoostingClassifier model
saved_model_name = 'model_HGBC_reduced.joblib'

if os.path.exists(saved_model_name):
    # NOTE(security): joblib files are pickles and can execute arbitrary code
    # when loaded -- only load model files obtained from a trusted source.
    model_hgb = joblib.load(saved_model_name)
else:
    model_hgb = HistGradientBoostingClassifier()
    model_hgb.fit(X_train_scaled_reduced, y_train)
    joblib.dump(model_hgb, saved_model_name)

# Step 6: Evaluate the model
y_pred = model_hgb.predict(X_test_scaled_reduced)
train_score = model_hgb.score(X_train_scaled_reduced, y_train)
test_score = model_hgb.score(X_test_scaled_reduced, y_test)

# Add to the shared registry. It must not be re-created here, otherwise the
# scores recorded by the earlier model cells are lost.
model_scores['Hist Gradient Boosting (Reduced Features)'] = {
    'Train Score': train_score,
    'Test Score': test_score,
}

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print(classification_report(y_test, y_pred))

# Step 7: Visualize feature importance
# Mean drop in test accuracy when a feature's values are shuffled; the seed
# is fixed so the ranking is reproducible.
permutation_result = permutation_importance(
    model_hgb, X_test_scaled_reduced, y_test, n_repeats=10, random_state=42
)
feature_importance = pd.DataFrame({
    'Feature': X_train_reduced.columns,
    'Importance': permutation_result.importances_mean
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Hist Gradient Boosting Classifier (Reduced Features)')
plt.show()

print("Feature Importance:")
print(feature_importance)

# Optional: VIF after removing high VIF features (computed once, reused for the plot below)
print("VIF data after removing collinearity:")
vif_data_reduced = calculate_vif(pd.DataFrame(X_train_scaled_reduced, columns=X_train_reduced.columns))
print(vif_data_reduced)

# Step 8: Plot VIF graph
plt.figure(figsize=(10, 6))
plt.barh(vif_data["Feature"], vif_data["VIF"], color='blue', alpha=0.7)
plt.axvline(x=vif_threshold, color='red', linestyle='--', label=f'VIF Threshold: {vif_threshold}')
plt.xlabel('VIF')
plt.title('VIF of Features (Before Removing Multicollinearity)')
plt.legend()
plt.show()

# Plot VIF graph after feature removal
plt.figure(figsize=(10, 6))
plt.barh(vif_data_reduced["Feature"], vif_data_reduced["VIF"], color='green', alpha=0.7)
plt.axvline(x=vif_threshold, color='red', linestyle='--', label=f'VIF Threshold: {vif_threshold}')
plt.xlabel('VIF')
plt.title('VIF of Features (After Removing Multicollinearity)')
plt.legend()
plt.show()

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Calculate Variance Inflation Factor (VIF) to detect multicollinearity
def calculate_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. No constant column is added here; callers that need
        an intercept in the VIF regressions must add one themselves.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"Feature"`` (the column names of ``X``) and ``"VIF"``,
        in the column order of ``X``.
    """
    # Materialise the array once; `X.values` would otherwise be rebuilt for
    # every feature inside the comprehension.
    design_matrix = X.values
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design_matrix, i)
        for i in range(design_matrix.shape[1])
    ]
    return vif_data

# HistGradientBoostingClassifier exposes no `feature_importances_` attribute,
# so importance is estimated with permutation importance instead.
from sklearn.inspection import permutation_importance

# Model saving/loading
saved_model_name = 'model_HGBC_TUNED.joblib'

if os.path.exists(saved_model_name):
    # NOTE(security): joblib files are pickles and can execute arbitrary code
    # when loaded -- only load model files obtained from a trusted source.
    model_HGBC_TUNED = joblib.load(saved_model_name)
else:
    param_grid = {
        'max_iter': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9, 21, 25],
        'min_samples_leaf': [1, 5, 10, 15, 20]
    }

    # Exhaustive 5-fold search over the grid, using all available cores.
    grid_search = GridSearchCV(
        estimator=HistGradientBoostingClassifier(),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    model_HGBC_TUNED = grid_search.best_estimator_
    joblib.dump(model_HGBC_TUNED, saved_model_name)
    print(f"Best Parameters: {grid_search.best_params_}")

# Predictions and evaluation
y_pred = model_HGBC_TUNED.predict(X_test)

train_score = model_HGBC_TUNED.score(X_train, y_train)
test_score = model_HGBC_TUNED.score(X_test, y_test)

# Add to the shared registry. It must not be re-created here, otherwise the
# scores recorded by the earlier model cells are lost.
model_scores['Hist Gradient Boosting TUNED'] = {
    'Train Score': train_score,
    'Test Score': test_score,
}

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print(classification_report(y_test, y_pred))

# Step 2: Calculate VIF before removing high collinearity features
print("Calculating VIF for initial features...")
vif_data = calculate_vif(X_train)
print("VIF data before removing collinearity:")
print(vif_data)

# Features whose VIF exceeds the threshold (VIF > 5)
vif_threshold = 5
high_vif_features = vif_data[vif_data["VIF"] > vif_threshold]["Feature"]
print(f"Features to drop due to multicollinearity (VIF > {vif_threshold}): {list(high_vif_features)}")

# Reduced frames are used only for the "after" VIF report below; the tuned
# model itself is trained on the full feature set.
X_train_reduced = X_train.drop(columns=high_vif_features)
X_test_reduced = X_test.drop(columns=high_vif_features)

# Step 3: Feature Importance Extraction
# The model was fitted on every column of X_train, so importances are
# labelled with X_train.columns (labelling them with the reduced columns
# would mismatch in length). Importance = mean drop in test accuracy when a
# feature is shuffled; the seed is fixed so the ranking is reproducible.
permutation_result = permutation_importance(
    model_HGBC_TUNED, X_test, y_test, n_repeats=10, random_state=42
)
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': permutation_result.importances_mean
})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Step 4: Plot Feature Importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance (HistGradientBoostingClassifier TUNED)')
plt.show()

# Optional: VIF after feature reduction
print("VIF data after removing collinearity:")
vif_data_reduced = calculate_vif(X_train_reduced)
print(vif_data_reduced)

# Step 5: Plot VIF Graph
plt.figure(figsize=(10, 6))
plt.barh(vif_data["Feature"], vif_data["VIF"], color='blue', alpha=0.7)
plt.axvline(x=vif_threshold, color='red', linestyle='--', label=f'VIF Threshold: {vif_threshold}')
plt.xlabel('VIF')
plt.title('VIF of Features (Before Removing Multicollinearity)')
plt.legend()
plt.show()

# Plot VIF graph after feature removal
plt.figure(figsize=(10, 6))
plt.barh(vif_data_reduced["Feature"], vif_data_reduced["VIF"], color='green', alpha=0.7)
plt.axvline(x=vif_threshold, color='red', linestyle='--', label=f'VIF Threshold: {vif_threshold}')
plt.xlabel('VIF')
plt.title('VIF of Features (After Removing Multicollinearity)')
plt.legend()
plt.show()

In [None]:
# Baseline k-nearest-neighbours classifier with default settings.
saved_model_name = 'model_KNN.joblib'

# Train and cache the model unless a cached copy is already on disk.
if not os.path.exists(saved_model_name):
    model_knn = KNeighborsClassifier()
    model_knn.fit(X_train, y_train)
    joblib.dump(model_knn, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_knn = loaded_model

# Score both splits and keep the test predictions for the report below.
train_score = model_knn.score(X_train, y_train)
test_score = model_knn.score(X_test, y_test)
y_pred = model_knn.predict(X_test)

# Record the scores together with the estimator's parameters.
params = model_knn.get_params()
knn_summary = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters' : params
}
model_scores['KNN'] = knn_summary

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print(classification_report(y_test, y_pred))

In [None]:

# KNN with the number of neighbours chosen by 5-fold grid search.
saved_model_name = 'model_KNN_TUNED.joblib'

param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

# Run the search and cache the winner unless a cached model already exists.
if not os.path.exists(saved_model_name):
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    model_knn_TUNED = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")
    joblib.dump(model_knn_TUNED, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_knn_TUNED = loaded_model

# Score both splits and keep the test predictions for the report below.
train_score = model_knn_TUNED.score(X_train, y_train)
test_score = model_knn_TUNED.score(X_test, y_test)
y_pred = model_knn_TUNED.predict(X_test)

# Record the scores together with the estimator's parameters.
params = model_knn_TUNED.get_params()
knn_tuned_summary = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters' : params
}
model_scores['KNN TUNED'] = knn_tuned_summary

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")

print(classification_report(y_test, y_pred))

In [None]:
# Custom KNN implementation (KNN_D, imported from the local KNN_D module),
# fitted with its default settings.
model_KNN_D = KNN_D()
model_KNN_D.fit(X_train, y_train)

# Predict both splits explicitly; accuracy is computed from these predictions.
y_train_pred = model_KNN_D.predict(X_train)
y_test_pred = model_KNN_D.predict(X_test)

train_score = accuracy_score(y_train, y_train_pred)
test_score = accuracy_score(y_test, y_test_pred)

model_scores['KNN_D'] = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters' : 'dcalc = Euclidean , NN = 5'
}

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Classification Report:")
# Report on this model's own predictions. `y_pred` still holds the output of
# the previously executed model cell and would describe the wrong model.
print(classification_report(y_test, y_test_pred))

In [None]:
# XGBoost classifier with default hyper-parameters.
saved_model_name = 'model_XGBoost.joblib'

# Train and cache the model unless a cached copy is already on disk.
if not os.path.exists(saved_model_name):
    model_xgboost = XGBClassifier()
    model_xgboost.fit(X_train, y_train)
    print(f"Model Parameters: {model_xgboost.get_params()}")
    joblib.dump(model_xgboost, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_xgboost = loaded_model

# Score both splits and keep the test predictions for the report below.
train_score = model_xgboost.score(X_train, y_train)
test_score = model_xgboost.score(X_test, y_test)
y_pred = model_xgboost.predict(X_test)

# Record the scores together with the estimator's parameters.
params = model_xgboost.get_params()
xgboost_summary = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters' : params
}
model_scores['XGBoost Classifier'] = xgboost_summary

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")

print(classification_report(y_test, y_pred))

In [None]:
# XGBoost classifier tuned with a 5-fold grid search.
saved_model_name = 'model_XGBoost_TUNED.joblib'

# Run the search and cache the winner unless a cached model already exists.
if not os.path.exists(saved_model_name):
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    model_xgboost_TUNED = grid_search.best_estimator_
    joblib.dump(model_xgboost_TUNED, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_xgboost_TUNED = loaded_model

# Score both splits and keep the test predictions for the report below.
train_score = model_xgboost_TUNED.score(X_train, y_train)
test_score = model_xgboost_TUNED.score(X_test, y_test)
y_pred = model_xgboost_TUNED.predict(X_test)

# Record the scores together with the estimator's parameters.
params = model_xgboost_TUNED.get_params()
xgboost_tuned_summary = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters' : params
}
model_scores['XGBoost Classifier TUNED'] = xgboost_tuned_summary

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")

print(classification_report(y_test, y_pred))

In [None]:
import os
import joblib
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Standardise the features: statistics are learned on the training split and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# File used to cache the fitted classifier
saved_model_name = 'model_ridge.joblib'

# Fit and cache a default RidgeClassifier unless a cached copy already exists
if not os.path.exists(saved_model_name):
    model_ridge = RidgeClassifier()
    model_ridge.fit(X_train_scaled, y_train)
    joblib.dump(model_ridge, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_ridge = loaded_model

# Score both splits and keep the test predictions for the reports below
train_score = model_ridge.score(X_train_scaled, y_train)
test_score = model_ridge.score(X_test_scaled, y_test)
y_pred = model_ridge.predict(X_test_scaled)

ridge_summary = {
    'Train Score': train_score,
    'Test Score': test_score
}
model_scores['Ridge Classifier'] = ridge_summary

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Importance = absolute value of each coefficient of the fitted classifier
coefficients = model_ridge.coef_[0]

# Tabulate the coefficients and rank them from most to least important
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Ridge Classifier')
plt.tight_layout()
plt.show()

# Show the same ranking as a table
print("\nFeature Importance Table:")
print(feature_importance)

# Multicollinearity Analysis: Variance Inflation Factor (VIF)
# The scaled matrix is wrapped in a frame and given a constant column
X_train_vif = pd.DataFrame(X_train_scaled, columns=X_train.columns).assign(Intercept=1)

# One VIF per column, computed in column order
vif_values = []
for column_position in range(X_train_vif.shape[1]):
    vif_values.append(variance_inflation_factor(X_train_vif.values, column_position))

vif_data = pd.DataFrame({
    'Feature': X_train_vif.columns,
    'VIF': vif_values
})

# The constant column itself is not reported
vif_data = vif_data[vif_data['Feature'] != 'Intercept']

# Display VIF DataFrame
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

# Horizontal bar chart of the VIF values
plt.figure(figsize=(10, 6))
plt.barh(vif_data['Feature'], vif_data['VIF'])
plt.xlabel('VIF')
plt.title('Variance Inflation Factor (Multicollinearity)')
plt.tight_layout()
plt.show()

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import GridSearchCV

# Scaling the data before fitting the model
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Only transform the test data
X_test_scaled = scaler.transform(X_test)

# Save/load tuned RidgeClassifier
saved_model_name = 'model_ridge_TUNED.joblib'

if os.path.exists(saved_model_name):
    # NOTE(security): joblib files are pickles and can execute arbitrary code
    # when loaded -- only load model files obtained from a trusted source.
    model_ridge_TUNED = joblib.load(saved_model_name)
else:
    param_grid = {
        'alpha': [0.1, 1.0, 10.0, 100.0],
        # 'lbfgs' is intentionally not listed: that solver is only valid with
        # positive=True, so every candidate using it would fail to fit.
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }
    ridge = RidgeClassifier()
    grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)

    model_ridge_TUNED = grid_search.best_estimator_
    joblib.dump(model_ridge_TUNED, saved_model_name)

# Evaluate the model
y_pred = model_ridge_TUNED.predict(X_test_scaled)

train_score = model_ridge_TUNED.score(X_train_scaled, y_train)
test_score = model_ridge_TUNED.score(X_test_scaled, y_test)

model_scores['Ridge Classifier (Tuned)'] = {
    'Train Score': train_score,
    'Test Score': test_score
}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Feature importance analysis: absolute coefficient of each (standardised) feature
coefficients = model_ridge_TUNED.coef_[0]

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
}).sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Ridge Classifier (Tuned)')
plt.tight_layout()
plt.show()

print("\nFeature Importance Table:")
print(feature_importance)

# Multicollinearity Analysis: Variance Inflation Factor (VIF)
# Prepare data for VIF computation (scaled features plus a constant column)
X_train_vif = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_vif['Intercept'] = 1
vif_matrix = X_train_vif.values  # materialise once instead of once per feature

# Compute VIF for each feature
vif_data = pd.DataFrame({
    'Feature': X_train_vif.columns,
    'VIF': [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
})

# Remove intercept from VIF results
vif_data = vif_data[vif_data['Feature'] != 'Intercept']

# Display and plot VIF
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

plt.figure(figsize=(10, 6))
plt.barh(vif_data['Feature'], vif_data['VIF'])
plt.xlabel('VIF')
plt.title('Variance Inflation Factor (Multicollinearity)')
plt.tight_layout()
plt.show()

# Report (but do not drop) features with high VIF
high_vif_features = vif_data[vif_data['VIF'] > 10]['Feature']
if not high_vif_features.empty:
    print("\nFeatures with high multicollinearity (VIF > 10):")
    print(high_vif_features.tolist())

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import joblib
import os
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt

# PREPROCESSING
# Standardizing the features
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

# MULTICOLLINEARITY DETECTION
# Calculate VIF for each feature (on the standardized matrix)
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_scaled, i) for i in range(X_train_scaled.shape[1])]

# Print VIF values
print("Variance Inflation Factor (VIF) for features:")
print(vif_data)

# Features above the VIF threshold of 10 are removed before training
high_vif_features = vif_data[vif_data["VIF"] > 10]["Feature"].tolist()
print(f"Features with high VIF: {high_vif_features}")

# Drop high VIF features; from here on X_train_scaled / X_test_scaled are
# DataFrames holding only the retained columns.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns).drop(columns=high_vif_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns).drop(columns=high_vif_features)

# TRAINING
saved_model_name = 'model_lasso_classifier.joblib'

# Load the model if it exists, otherwise fit a new one
if os.path.exists(saved_model_name):
    # NOTE(security): joblib files are pickles and can execute arbitrary code
    # when loaded -- only load model files obtained from a trusted source.
    model_lasso_classifier = joblib.load(saved_model_name)
else:
    # L1-penalised logistic regression
    model_lasso_classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=1000)
    model_lasso_classifier.fit(X_train_scaled, y_train)
    joblib.dump(model_lasso_classifier, saved_model_name)

# EVALUATION
y_pred = model_lasso_classifier.predict(X_test_scaled)

train_score = model_lasso_classifier.score(X_train_scaled, y_train)
test_score = model_lasso_classifier.score(X_test_scaled, y_test)

# Store model scores
model_scores['Lasso Classifier'] = {
    'Train Score': train_score,
    'Test Score': test_score
}

# Confusion Matrix and Classification Report
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# FEATURE IMPORTANCE
# Extracting coefficients (importance)
coefficients = model_lasso_classifier.coef_[0]  # Coefficients for binary classification (only one array)

# Label the coefficients with the columns the model was actually fitted on.
# X_train still contains the dropped high-VIF columns, so using
# X_train.columns would not match the coefficient vector in length.
feature_importance = pd.DataFrame({
    'Feature': X_train_scaled.columns,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)  # Using absolute value of coefficients for importance
})

# Sorting by importance (largest to smallest)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plotting feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Lasso Classifier')
plt.show()

# Display the feature importance table
print(feature_importance)

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Calculate Variance Inflation Factor (VIF) to detect multicollinearity
def calculate_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. No constant column is added here; callers that need
        an intercept in the VIF regressions must add one themselves.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"Feature"`` (the column names of ``X``) and ``"VIF"``,
        in the column order of ``X``.
    """
    # Materialise the array once; `X.values` would otherwise be rebuilt for
    # every feature inside the comprehension.
    design_matrix = X.values
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design_matrix, i)
        for i in range(design_matrix.shape[1])
    ]
    return vif_data

# Step 2: Preprocessing and scaling
scaler = StandardScaler()

# Standardize the full feature set. The scaler is fitted on the training split
# only and then applied to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Handle Multicollinearity
print("Calculating VIF for initial features...")
vif_data = calculate_vif(pd.DataFrame(X_train_scaled, columns=X_train.columns))
print("VIF data before removing collinearity:")
print(vif_data)

# Dropping features with VIF > 5 (multicollinearity threshold).
# NOTE(review): every feature above the threshold is dropped in a single pass,
# so both members of a collinear pair can be removed together.
vif_threshold = 5
high_vif_features = vif_data[vif_data["VIF"] > vif_threshold]["Feature"]
print(f"Features to drop due to multicollinearity (VIF > {vif_threshold}): {list(high_vif_features)}")

# Remove high VIF features from both training and test sets
X_train_reduced = X_train.drop(columns=high_vif_features)
X_test_reduced = X_test.drop(columns=high_vif_features)

# Step 4: Scaling the reduced feature set (this refits `scaler` on the reduced columns)
X_train_scaled_reduced = scaler.fit_transform(X_train_reduced)
X_test_scaled_reduced = scaler.transform(X_test_reduced)

# Step 5: Define and train the Lasso Logistic Regression model.
# A cached model on disk takes precedence over re-running the grid search.
saved_model_name = 'model_lasso_classifier_TUNED_reduced.joblib'

if os.path.exists(saved_model_name):
    model_lasso_classifier = joblib.load(saved_model_name)
else:
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [1000, 2000]
    }
    # The saga solver is stochastic; seed it like the other models in this
    # notebook so a re-run reproduces the same fit.
    base_model = LogisticRegression(penalty='l1', solver='saga', random_state=42)
    grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_scaled_reduced, y_train)
    model_lasso_classifier = grid_search.best_estimator_
    joblib.dump(model_lasso_classifier, saved_model_name)

# Step 6: Evaluate the model
y_pred = model_lasso_classifier.predict(X_test_scaled_reduced)
train_score = model_lasso_classifier.score(X_train_scaled_reduced, y_train)
test_score = model_lasso_classifier.score(X_test_scaled_reduced, y_test)

# Add to the shared `model_scores` dict instead of re-creating it, so results
# recorded by earlier cells survive for the comparison chart. The parameters
# are stored as well because the reporting cell reads 'Used parameters'.
model_scores['Lasso Classifier (Tuned, Reduced Features)'] = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters': model_lasso_classifier.get_params()
}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 7: Visualize feature importance.
# coef_[0] is the single coefficient row of the binary classifier; its absolute
# value is used as the importance measure.
coefficients = model_lasso_classifier.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': X_train_reduced.columns,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for Tuned Lasso Classifier (Reduced Features)')
plt.show()

print("Feature Importance:")
print(feature_importance)

# Optional: VIF of the features that remain after the high-VIF ones were removed
print("VIF data after removing collinearity:")
vif_data_reduced = calculate_vif(pd.DataFrame(X_train_scaled_reduced, columns=X_train_reduced.columns))
print(vif_data_reduced)

In [None]:
# Baseline decision tree: reuse the fit cached on disk when there is one,
# otherwise train with default hyper-parameters and cache the result.
saved_model_name = 'model_DTC.joblib'

if not os.path.exists(saved_model_name):
    model_DTC = DecisionTreeClassifier(random_state=42)
    model_DTC.fit(X_train, y_train)
    joblib.dump(model_DTC, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_DTC = loaded_model

# Accuracy on both splits, hard predictions for the reports, and the
# estimator's parameters for the summary table.
train_score = model_DTC.score(X_train, y_train)
test_score = model_DTC.score(X_test, y_test)
y_pred = model_DTC.predict(X_test)
params = model_DTC.get_params()

model_scores['Decision Tree Classifier'] = {'Train Score': train_score, 'Test Score': test_score, 'Used parameters': params}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Grid-searched decision tree: load the cached winner if it exists, otherwise
# run a 5-fold accuracy grid search and cache the best estimator.
saved_model_name = 'model_DTC_TUNED.joblib'

if not os.path.exists(saved_model_name):
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=42),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    model_DT_TUNED = grid_search.best_estimator_
    joblib.dump(model_DT_TUNED, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_DT_TUNED = loaded_model

# Score both splits and keep the predictions and parameters for reporting.
train_score = model_DT_TUNED.score(X_train, y_train)
test_score = model_DT_TUNED.score(X_test, y_test)
y_pred = model_DT_TUNED.predict(X_test)
params = model_DT_TUNED.get_params()

model_scores['Decision Tree Classifier TUNED'] = {'Train Score': train_score, 'Test Score': test_score, 'Used parameters': params}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Baseline random forest: use the cached model when available, otherwise fit
# one with default hyper-parameters (fixed seed) and store it on disk.
saved_model_name = 'model_RandomForestClassifier.joblib'

if not os.path.exists(saved_model_name):
    model_RF = RandomForestClassifier(random_state=42)
    model_RF.fit(X_train, y_train)
    joblib.dump(model_RF, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_RF = loaded_model

# Accuracy on both splits, test-set predictions and the estimator parameters.
train_score = model_RF.score(X_train, y_train)
test_score = model_RF.score(X_test, y_test)
y_pred = model_RF.predict(X_test)
params = model_RF.get_params()

model_scores['Random Forest Classifier'] = {'Train Score': train_score, 'Test Score': test_score, 'Used parameters': params}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(conf_matrix)



In [None]:
# Rank the features by the baseline forest's feature_importances_ and chart them.
importances = model_RF.feature_importances_
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance')
plt.show()

In [None]:
# Grid-searched random forest: load the cached best estimator if present,
# otherwise run a 5-fold grid search (default scoring) and cache the winner.
saved_model_name = 'model_RandomForest_TUNED.joblib'

if not os.path.exists(saved_model_name):
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    model_RF_TUNED = grid_search.best_estimator_
    joblib.dump(model_RF_TUNED, saved_model_name)
else:
    loaded_model = joblib.load(saved_model_name)
    model_RF_TUNED = loaded_model

# Score both splits and keep the predictions and parameters for reporting.
train_score = model_RF_TUNED.score(X_train, y_train)
test_score = model_RF_TUNED.score(X_test, y_test)
y_pred = model_RF_TUNED.predict(X_test)
params = model_RF_TUNED.get_params()

model_scores['Random Forest Classifier TUNED'] = {'Train Score': train_score, 'Test Score': test_score, 'Used parameters': params}

conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(conf_matrix)



In [None]:
# Rank the features by the tuned forest's feature_importances_ and chart them.
importances = model_RF_TUNED.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Importance')
# The title names the tuned model so this figure is distinguishable from the
# baseline random-forest chart, which uses the same layout.
plt.title('Feature Importance (Random Forest TUNED)')
plt.show()

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Variance Inflation Factor (VIF) helper used to detect multicollinearity
def calculate_vif(X):
    """Return a two-column DataFrame listing the VIF of every column in X.

    "Feature" holds the column labels of X and "VIF" holds the result of
    statsmodels' variance_inflation_factor evaluated on X.values for the
    matching column position.
    """
    column_positions = range(X.shape[1])
    vif_values = [variance_inflation_factor(X.values, position) for position in column_positions]
    return pd.DataFrame({"Feature": X.columns, "VIF": vif_values})

# Needed for the feature-importance step when the SVM kernel is not linear.
from sklearn.inspection import permutation_importance

# Step 2: Preprocessing and scaling
scaler = StandardScaler()

# Standardize the full feature set. The scaler is fitted on the training split
# only and then applied to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Handle Multicollinearity
print("Calculating VIF for initial features...")
vif_data = calculate_vif(pd.DataFrame(X_train_scaled, columns=X_train.columns))
print("VIF data before removing collinearity:")
print(vif_data)

# Dropping features with VIF > 5 (multicollinearity threshold)
vif_threshold = 5
high_vif_features = vif_data[vif_data["VIF"] > vif_threshold]["Feature"]
print(f"Features to drop due to multicollinearity (VIF > {vif_threshold}): {list(high_vif_features)}")

# Remove high VIF features from both training and test sets
X_train_reduced = X_train.drop(columns=high_vif_features)
X_test_reduced = X_test.drop(columns=high_vif_features)

# Step 4: Scaling the reduced feature set (this refits `scaler` on the reduced columns)
X_train_scaled_reduced = scaler.fit_transform(X_train_reduced)
X_test_scaled_reduced = scaler.transform(X_test_reduced)

# Step 5: Define and train the SVM model.
# A cached model on disk takes precedence over re-fitting.
saved_model_name = 'model_svm_classifier_DEFAULT_reduced.joblib'

if os.path.exists(saved_model_name):
    model_svm_classifier = joblib.load(saved_model_name)
else:
    model_svm_classifier = SVC()  # Default parameters
    model_svm_classifier.fit(X_train_scaled_reduced, y_train)
    joblib.dump(model_svm_classifier, saved_model_name)

# Step 6: Evaluate the model
y_pred = model_svm_classifier.predict(X_test_scaled_reduced)
train_score = model_svm_classifier.score(X_train_scaled_reduced, y_train)
test_score = model_svm_classifier.score(X_test_scaled_reduced, y_test)

# Add to the shared `model_scores` dict instead of re-creating it, so results
# recorded by earlier cells survive for the comparison chart. The parameters
# are stored as well because the reporting cell reads 'Used parameters'.
model_scores['SVM Classifier (Reduced Features)'] = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Used parameters': model_svm_classifier.get_params()
}

conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 7: Visualize feature importance.
# SVC exposes `coef_` only for a linear kernel. SVC() defaults to the RBF
# kernel, where reading `coef_` raises AttributeError, so any other kernel
# falls back to permutation importance (mean drop in accuracy when one
# feature's values are shuffled).
if model_svm_classifier.kernel == 'linear':
    coefficients = model_svm_classifier.coef_[0]
    feature_importance = pd.DataFrame({
        'Feature': X_train_reduced.columns,
        'Coefficient': coefficients,
        'Importance': np.abs(coefficients)
    }).sort_values(by='Importance', ascending=False)
    importance_label = 'Importance'
else:
    # Kernel-SVM prediction is expensive, so the permutation test runs on a
    # seeded random subsample of the test split, capped at 2000 rows.
    n_test_rows = X_test_scaled_reduced.shape[0]
    eval_idx = np.random.default_rng(42).choice(
        n_test_rows, size=min(2000, n_test_rows), replace=False
    )
    permutation_result = permutation_importance(
        model_svm_classifier,
        np.asarray(X_test_scaled_reduced)[eval_idx],
        np.asarray(y_test)[eval_idx],
        n_repeats=5,
        random_state=42,
        n_jobs=-1,
    )
    feature_importance = pd.DataFrame({
        'Feature': X_train_reduced.columns,
        'Importance': permutation_result.importances_mean
    }).sort_values(by='Importance', ascending=False)
    importance_label = 'Importance (mean accuracy decrease under permutation)'

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel(importance_label)
plt.title('Feature Importance for SVM Classifier (Reduced Features)')
plt.show()

print("Feature Importance:")
print(feature_importance)

# Optional: VIF of the features that remain after the high-VIF ones were removed
print("VIF data after removing collinearity:")
vif_data_reduced = calculate_vif(pd.DataFrame(X_train_scaled_reduced, columns=X_train_reduced.columns))
print(vif_data_reduced)

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF diagnostics for the reduced-feature SVM pipeline.
# This cell only draws the charts. It relies on `calculate_vif`, `vif_data`,
# `vif_threshold`, `X_train_reduced` and `X_train_scaled_reduced` from the SVM
# cell directly above, so the scaling, VIF filtering, model fit and evaluation
# are not repeated here (repeating them would also re-register the same
# `model_scores` entry).

# Step 8: Plot VIF graph (all features, before any were removed)
plt.figure(figsize=(10, 6))
plt.barh(vif_data["Feature"], vif_data["VIF"], color='blue', alpha=0.7)
plt.axvline(x=vif_threshold, color='red', linestyle='--', label=f'VIF Threshold: {vif_threshold}')
plt.xlabel('VIF')
plt.title('VIF of Features (Before Removing Multicollinearity)')
plt.legend()
plt.show()

# Plot VIF graph after feature removal; the VIFs are recomputed on the
# standardized reduced training features.
vif_data_reduced = calculate_vif(pd.DataFrame(X_train_scaled_reduced, columns=X_train_reduced.columns))

plt.figure(figsize=(10, 6))
plt.barh(vif_data_reduced["Feature"], vif_data_reduced["VIF"], color='green', alpha=0.7)
plt.axvline(x=vif_threshold, color='red', linestyle='--', label=f'VIF Threshold: {vif_threshold}')
plt.xlabel('VIF')
plt.title('VIF of Features (After Removing Multicollinearity)')
plt.legend()
plt.show()

In [None]:
# Grouped bar chart comparing train and test accuracy for every model recorded
# in `model_scores`.
models = list(model_scores.keys())
train_scores = [model_scores[model]["Train Score"] for model in models]
test_scores = [model_scores[model]["Test Score"] for model in models]

x = range(len(models))
bar_width = 0.4

# Lower y-limit: zoom in to 0.9 when every score is above it, but extend the
# axis downwards when a model scores lower so its bars are not clipped away.
lowest_score = min(train_scores + test_scores, default=0.9)
y_lower = min(0.9, max(0.0, lowest_score - 0.02))

plt.figure(figsize=(10, 6))
plt.bar(x, train_scores, width=bar_width, label='Train Score', color='b', align='center')
plt.bar([p + bar_width for p in x], test_scores, width=bar_width, label='Test Score', color='orange', align='center')

plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Train and Test Scores of Different Models')
# Ticks sit midway between each model's train and test bar.
plt.xticks([p + bar_width / 2 for p in x], models, rotation=90)
plt.ylim(y_lower, 1)
plt.legend()
plt.show()


In [None]:
# List the hyper-parameters recorded for each model. Some entries are stored
# with scores only, so a missing 'Used parameters' key is reported instead of
# raising KeyError.
for model_name, scores in model_scores.items():
    print(model_name)
    print(f'Used parameters: {scores.get("Used parameters", "not recorded")}')

Find employees at risk of leaving.

In [None]:
# Flag employees the model predicts as leavers although they have not left:
# out-of-fold predictions of 'STATUS_Beëindigd' are produced for every row and
# compared with the actual status.
X = df_with_dummies.drop('STATUS_Beëindigd', axis=1)
y = df_with_dummies['STATUS_Beëindigd']

# Seeded like the other forests in this notebook so a re-run flags the same rows.
model = RandomForestClassifier(random_state=42)
kf = KFold(n_splits=10)

# Predictions are written by row position, so they stay aligned with X
# regardless of the order in which the folds are produced.
predicted_vals = np.empty(len(X), dtype=y.dtype)

# Fold-local names are used on purpose: the notebook-wide X_train / X_test /
# y_train / y_test split is left untouched.
for train_idx, test_idx in kf.split(X):
    model.fit(X.iloc[train_idx], y.iloc[train_idx])
    predicted_vals[test_idx] = model.predict(X.iloc[test_idx])

assert len(predicted_vals) == len(df), "df_with_dummies and df must contain the same rows"

# The prediction column must be on `df` before the dummies are built;
# otherwise `df_n_dummies` lacks it and the filter below raises KeyError.
df['pred_STATUS_Beëindigd'] = predicted_vals
df_n_dummies = pd.get_dummies(df, columns=['STATUS'], drop_first=True)

predicted_to_leave = df_n_dummies['pred_STATUS_Beëindigd'].astype(bool)
has_left = df_n_dummies['STATUS_Beëindigd'].astype(bool)

at_risk_employees = df_n_dummies[predicted_to_leave & ~has_left]
at_risk_employees
