In [None]:
# Import the Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the two input frames from CSV.
# NOTE(review): both reads use the same path, so the later concat stacks the
# file on top of itself - point one of them at the second data set.

csv_path = "predictive_maintenance.csv"
df_one = pd.read_csv(csv_path)
df_two = pd.read_csv(csv_path)

In [None]:
# Stack the two frames row-wise (same columns) and renumber the rows 0..n-1.

merged_df = pd.concat(objs=[df_one, df_two], axis=0, ignore_index=True)

In [None]:
""" 
# Merge the Data Sets (Different Columns)

# 1) Inner Join
merged_df = pd.merge(df_one, df_two, on='key_column')

# 2) Left Join
merged_df = pd.merge(df_one, df_two, on='key_column', how='left')

# 3) Right Join
merged_df = pd.merge(df_one, df_two, on='key_column', how='right')

# 4) Outer Join
merged_df = pd.merge(df_one, df_two, on='key_column', how='outer')
"""

In [None]:
# Numerical and Categorical Columns

# Placeholder: set this to the name of the label column.
target_variable = "Input here the target variable."

# Fail loudly (ValueError) only when the label is genuinely absent, not
# merely because its dtype put it in the "wrong" list.
if target_variable not in merged_df.columns:
    raise ValueError(f"Target column {target_variable!r} is not in merged_df.")

# Work on the feature columns only, so the label can never end up in either
# list regardless of its dtype. merged_df is the frame built by the merge cell.
feature_frame = merged_df.drop(columns=[target_variable])

# Public select_dtypes() API; boolean columns are grouped with the numeric ones.
numerical_features = feature_frame.select_dtypes(include=["number", "bool"]).columns.tolist()

# Everything that is not numeric is treated as categorical.
categorical_features = [col for col in feature_frame.columns if col not in numerical_features]

print(f"The Numerical Features:\n {numerical_features}\n")
print(f"The Categorical Features:\n {categorical_features}")

In [None]:
# Descriptive Statistics

def _column_extreme(column, how):
    """Return ``column.min()`` or ``column.max()`` over its non-null values.

    ``how`` is the reduction name (``'min'`` or ``'max'``). NaN is returned
    when the column has no non-null values or its values cannot be ordered
    (the reduction raises ``TypeError``).
    """
    values = column.dropna()
    if values.empty:
        return np.nan
    try:
        return getattr(values, how)()
    except TypeError:
        return np.nan


def description(df):
    """Summarise every column of ``df`` in a single table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with the fields
        ``type`` (dtype), ``count`` (non-null values), ``nunique``,
        ``%unique``, ``null``, ``%null``, ``min`` and ``max``.
        Percentages are relative to ``len(df)``. ``min``/``max`` ignore
        nulls and are NaN where no ordering is possible.
    """
    n_rows = len(df)

    desc = pd.DataFrame(index=list(df))
    desc['type'] = df.dtypes
    desc['count'] = df.count()
    desc['nunique'] = df.nunique()
    desc['%unique'] = desc['nunique'] / n_rows * 100
    desc['null'] = df.isnull().sum()
    desc['%null'] = desc['null'] / n_rows * 100

    # Reduce column by column: a frame-wide df.min()/df.max() compares
    # strings with float NaN on object columns that contain missing values.
    desc['min'] = [_column_extreme(df[col], 'min') for col in df.columns]
    desc['max'] = [_column_extreme(df[col], 'max') for col in df.columns]

    return desc

# Summary table for the merged frame (pipe passes merged_df as the first argument).
merged_df_description = merged_df.pipe(description)

In [None]:
# Train and Test Set Separation

# No earlier cell imports train_test_split, so bring it into scope here.
from sklearn.model_selection import train_test_split

# Placeholder: set this to the name of the label column.
target_variable = "Input here the target variable."

# Features are every column except the label; the label is kept as a Series.
X = merged_df.drop(target_variable, axis=1)
y = merged_df[target_variable]

# 80/20 split with a fixed seed so the partition is reproducible.
# stratify=y keeps the label distribution equal in both parts; this is meant
# for class labels - NOTE(review): drop it when y is a continuous target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [None]:
# Imputing Missing Values

# Imported in this cell so it runs on a fresh kernel without relying on a later cell.
from sklearn.impute import SimpleImputer

# Initialize the imputer
# strategy can be "mean", "median", "most_frequent"
imputer = SimpleImputer(strategy='mean')  # Or another strategy as needed

# Learn the fill values from the training data only
imputer.fit(X_train)

# Apply the same fill values to both the training and the test data
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Wrap the arrays back into DataFrames. The original index is passed along so
# the rows stay aligned with y_train / y_test.
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns, index=X_test.index)

In [None]:
# Impute Missing Values For Different Data Types

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Define the imputer for numerical columns
numerical_imputer = SimpleImputer(strategy='mean')

# Define the imputer for categorical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Column groups: the notebook's feature lists, restricted to columns that are
# actually present in X (e.g. the label has been dropped from X).
numerical_columns = [col for col in numerical_features if col in X.columns]
categorical_columns = [col for col in categorical_features if col in X.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_imputer, numerical_columns),
        ('cat', categorical_imputer, categorical_columns)
    ])

# X is the feature matrix produced by the train/test separation cell.
# NOTE(review): this fits on every row of X, test rows included - fit on
# X_train instead when the result feeds a model.
X_transformed = preprocessor.fit_transform(X)

# ColumnTransformer emits its blocks in transformer order, so the column
# names are the numerical ones followed by the categorical ones.
columns = numerical_columns + categorical_columns
# infer_objects() restores numeric dtypes when the mixed array came back as object.
X_transformed_df = pd.DataFrame(X_transformed, columns=columns, index=X.index).infer_objects()

####################################################################################################
# Columns handled by neither imputer (dropped by the ColumnTransformer) are
# re-attached untouched.
other_columns = [col for col in X.columns if col not in columns]
other_columns_data = X[other_columns]

# Concatenate the untouched columns with the transformed DataFrame
final_df = pd.concat([X_transformed_df, other_columns_data], axis=1)
####################################################################################################
####################################################################################################

In [None]:
# Scaling

# 1) Scale the Whole Data Set
from sklearn.preprocessing import StandardScaler

# fit() hands back the estimator itself, so the scaler is created and fitted
# on the training data in one expression.
scaler = StandardScaler().fit(X_train)

# Transform both splits with the training statistics and keep labels/index.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# 2) Scale Only Continuous Features
from sklearn.compose import ColumnTransformer

# Placeholder: names of the continuous columns
continuous_features = ['your', 'continuous', 'feature', 'names', 'here']

# Only the listed columns are standardised; remainder='passthrough' forwards
# every other column unchanged.
ct = ColumnTransformer(
    transformers=[("scale", StandardScaler(), continuous_features)],
    remainder='passthrough',
)

# Fit on training data
ct.fit(X_train)

# Output layout: scaled continuous columns first, then the untouched columns
# in their original order - the labels are rebuilt to match.
scaled_columns = continuous_features  # Scaled
unscaled_columns = [col for col in X_train.columns if col not in continuous_features]  # Not scaled
all_columns = scaled_columns + unscaled_columns  # Combined column order

X_train_scaled = pd.DataFrame(ct.transform(X_train), columns=all_columns, index=X_train.index)
X_test_scaled = pd.DataFrame(ct.transform(X_test), columns=all_columns, index=X_test.index)

In [None]:
# Encoding

# 1) Applying OneHotEncoder to Categorical Features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Placeholder: names of the categorical columns
categorical_features = ['your', 'categorical', 'feature', 'names', 'here']

# Initialize the OneHotEncoder and ColumnTransformer.
# sparse_output=False returns a dense matrix; this keyword replaces the
# removed `sparse` argument (scikit-learn >= 1.2).
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
column_transformer = ColumnTransformer(transformers=[
    ('cat', onehot_encoder, categorical_features)
], remainder='passthrough')

# Fit on the training data and transform both training and test sets
X_train_encoded = column_transformer.fit_transform(X_train)
X_test_encoded = column_transformer.transform(X_test)

# 2) Applying LabelEncoder to Categorical Features
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Assuming the column name you want to encode is "gender"
# Fit the encoder on the training column and store the integer codes next to it
X_train['gender_encoded'] = label_encoder.fit_transform(X_train['gender'])

# Apply the same fitted encoder to the test set to ensure consistency.
# NOTE(review): LabelEncoder.transform raises on labels it did not see during fit.
X_test['gender_encoded'] = label_encoder.transform(X_test['gender'])

# 3) Combining OneHotEncoder and LabelEncoder for Individual Columns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# OneHotEncoder setup for "gender" column (dense output, see note in section 1)
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
gender_encoded = onehot_encoder.fit_transform(X_train[['gender']])

# Assuming there is another categorical feature for LabelEncoder
label_encoder = LabelEncoder()
other_feature_encoded = label_encoder.fit_transform(X_train['other_feature'])

# The one-hot output is a plain array: wrap it in a DataFrame that carries the
# generated column names and the training index so it can be concatenated.
gender_encoded_df = pd.DataFrame(gender_encoded, columns=onehot_encoder.get_feature_names_out(['gender']), index=X_train.index)

# For LabelEncoder, you can directly assign the encoded column back to the DataFrame
X_train['other_feature_encoded'] = other_feature_encoded

# Concatenate the OneHotEncoded columns to the DataFrame (the raw 'gender'
# column is still present in the result).
X_train_processed = pd.concat([X_train, gender_encoded_df], axis=1)

# Repeat similar steps for X_test, using transform() instead of fit_transform()

In [None]:
# Base Models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score


def _fit_and_score(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and report accuracy, ROC AUC and F1 on the evaluation split.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    label : str
        Model name used as the prefix of the printed summary line.
    X_fit, y_fit : training features and labels.
    X_eval, y_eval : evaluation features and labels.

    Returns
    -------
    tuple
        ``(y_pred, y_proba, accuracy, auc, f1)`` where ``y_proba`` is the
        second column of ``predict_proba``.
    """
    model.fit(X_fit, y_fit)

    y_pred = model.predict(X_eval)
    # Second predict_proba column, used for the AUC.
    # NOTE(review): together with the default f1_score this assumes a binary target.
    y_proba = model.predict_proba(X_eval)[:, 1]

    accuracy = accuracy_score(y_eval, y_pred)
    auc = roc_auc_score(y_eval, y_proba)
    f1 = f1_score(y_eval, y_pred)

    print(f"{label} - Accuracy: {accuracy}, AUC: {auc}, F1 Score: {f1}")
    return y_pred, y_proba, accuracy, auc, f1


# 1) Logistic Regression
lr_model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
y_pred_lr, y_pred_proba_lr, accuracy_lr, auc_lr, f1_lr = _fit_and_score(
    lr_model, "Logistic Regression", X_train, y_train, X_test, y_test
)

# 2) XGBoost Classifier
# random_state pins the model's randomness so reruns give the same scores.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
y_pred_xgb, y_pred_proba_xgb, accuracy_xgb, auc_xgb, f1_xgb = _fit_and_score(
    xgb_model, "XGBoost Classifier", X_train, y_train, X_test, y_test
)

# 3) K-Nearest Neighbors (KNN) Classifier
knn_model = KNeighborsClassifier()
y_pred_knn, y_pred_proba_knn, accuracy_knn, auc_knn, f1_knn = _fit_and_score(
    knn_model, "KNN", X_train, y_train, X_test, y_test
)

# 4) Random Forest Classifier
# random_state pins the bootstrap/feature sampling so reruns give the same scores.
rf_model = RandomForestClassifier(random_state=42)
y_pred_rf, y_pred_proba_rf, accuracy_rf, auc_rf, f1_rf = _fit_and_score(
    rf_model, "Random Forest", X_train, y_train, X_test, y_test
)

In [None]:
# Grid Search

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score


def _tune_and_score(estimator, param_grid, label, X_fit, y_fit, X_eval, y_eval):
    """Grid-search ``estimator`` and score the winning model on the evaluation split.

    The search uses 3-fold cross-validation scored by F1. The winner is taken
    from ``best_estimator_``: GridSearchCV (refit enabled by default) has
    already refitted it on all of ``X_fit``, and it keeps every constructor
    argument of ``estimator``, not only the tuned ones.

    Parameters
    ----------
    estimator : unfitted classifier exposing ``predict_proba``.
    param_grid : dict
        Candidate values per hyper-parameter.
    label : str
        Model name used in the two printed lines.
    X_fit, y_fit : training features and labels.
    X_eval, y_eval : evaluation features and labels.

    Returns
    -------
    tuple
        ``(search, y_pred, y_proba, accuracy, auc, f1)`` where ``search`` is
        the fitted GridSearchCV object.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring='f1',
        cv=3,
        verbose=2
    )
    search.fit(X_fit, y_fit)
    print(f"Best parameters for {label}:", search.best_params_)

    tuned = search.best_estimator_
    y_pred = tuned.predict(X_eval)
    # Second predict_proba column, used for the AUC.
    # NOTE(review): together with scoring='f1' this assumes a binary target.
    y_proba = tuned.predict_proba(X_eval)[:, 1]

    accuracy = accuracy_score(y_eval, y_pred)
    auc = roc_auc_score(y_eval, y_proba)
    f1 = f1_score(y_eval, y_pred)

    print(f"{label} - Accuracy: {accuracy}, AUC: {auc}, F1 Score: {f1}")
    return search, y_pred, y_proba, accuracy, auc, f1


# Logistic Regression
from sklearn.linear_model import LogisticRegression

grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

gs_lr, pred_lr, pred_proba_lr, accuracy_lr, auc_lr, f1_lr = _tune_and_score(
    LogisticRegression(max_iter=1000), grid_lr, "Logistic Regression",
    X_train, y_train, X_test, y_test
)
best_params_lr = gs_lr.best_params_
best_lr = gs_lr.best_estimator_

# XGBoost Classifier
from xgboost import XGBClassifier

grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

gs_xgb, pred_xgb, pred_proba_xgb, accuracy_xgb, auc_xgb, f1_xgb = _tune_and_score(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    grid_xgb, "XGBoost Classifier",
    X_train, y_train, X_test, y_test
)
best_params_xgb = gs_xgb.best_params_
best_xgb = gs_xgb.best_estimator_

# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

grid_knn = {
    'n_neighbors': [5, 10, 15],
    'metric': ['euclidean', 'manhattan']
}

gs_knn, pred_knn, pred_proba_knn, accuracy_knn, auc_knn, f1_knn = _tune_and_score(
    KNeighborsClassifier(), grid_knn, "KNN",
    X_train, y_train, X_test, y_test
)
best_params_knn = gs_knn.best_params_
best_knn = gs_knn.best_estimator_

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

gs_rf, pred_rf, pred_proba_rf, accuracy_rf, auc_rf, f1_rf = _tune_and_score(
    RandomForestClassifier(random_state=42), grid_rf, "Random Forest",
    X_train, y_train, X_test, y_test
)
best_params_rf = gs_rf.best_params_
best_rf = gs_rf.best_estimator_

# Regression

In [None]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least squares baseline: fit on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression - MSE: {mse_lr}, R2: {r2_lr}")

########################################################################################################################

# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths
grid_ridge = {'alpha': [0.01, 0.1, 1, 10, 100]}

# 3-fold search; the scorer is the negated MSE, so larger is better
gs_ridge = GridSearchCV(
    estimator=Ridge(),
    param_grid=grid_ridge,
    scoring='neg_mean_squared_error',  # You might choose a different scorer depending on your objective
    cv=3,
    verbose=2
)

# Fit the GridSearchCV object to the training data
gs_ridge.fit(X_train, y_train)

# Report the winning parameters
best_params_ridge = gs_ridge.best_params_
print("Best parameters for Ridge Regression:", best_params_ridge)

# best_estimator_ is the winning Ridge, already refitted on the full training
# split by GridSearchCV (refit is enabled by default), so no second fit is needed.
best_ridge = gs_ridge.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression - MSE: {mse_ridge}, R2: {r2_ridge}")

In [None]:
# XGBoost Regressor
# Candidate values per hyper-parameter.
grid_xgb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# 3-fold search scored by negated MSE. This cell only builds the search
# object; nothing is fitted here.
gs_xgb = GridSearchCV(
    XGBRegressor(eval_metric='rmse'),
    grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)

In [None]:
# KNeighborsRegressor
# Candidate neighbourhood sizes and distance metrics.
grid_knn = dict(
    n_neighbors=[5, 10, 15],
    metric=['euclidean', 'manhattan'],
)

# 3-fold search scored by negated MSE. This cell only builds the search
# object; nothing is fitted here.
gs_knn = GridSearchCV(
    KNeighborsRegressor(),
    grid_knn,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)

In [None]:
# RandomForestRegressor
# Candidate forest sizes, depth limits and split thresholds.
grid_rf = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# 3-fold search scored by negated MSE. This cell only builds the search
# object; nothing is fitted here.
gs_rf = GridSearchCV(
    RandomForestRegressor(),
    grid_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)

In [None]:
# Evaluation
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# `best_model` is the fitted regressor to evaluate. No other cell binds it, so
# it defaults to the tuned Ridge model, the only tuned regressor that is
# fitted in this notebook. To evaluate another search, fit it first and use
# its best_estimator_, e.g. gs_xgb.fit(X_train, y_train).best_estimator_.
best_model = best_ridge

# Predict on the held-out test split
predictions = best_model.predict(X_test)

# Error metrics on the test split
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"Mean Squared Error: {mse}, R2 Score: {r2}, Mean Absolute Error: {mae}")