In [1]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf
print(tf.__version__)

2024-08-02 13:17:01.875499: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.1


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the dataset
data = pd.read_csv('final_data.csv')  # Replace with your dataset path

# Preprocess the data
X = data.drop(data.columns[[0, 1, 2]], axis=1)
y = data.iloc[:, 2]

# Replace infinity values with NaNs
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Support Vector Regression': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'eXtreme Gradient Boosting': xgb.XGBRegressor(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'ElasticNet Regression': ElasticNet(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'Neural Network Regression': Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)])
    }

# Compile the Neural Network model
models['Neural Network Regression'].compile(optimizer='adam', loss='mean_squared_error')

# Define the threshold
threshold = 10

# Train and evaluate each model
results = []

for name, model in models.items():
    print(f"Training {name}...")

    if name == 'Neural Network Regression':
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
        y_pred = model.predict(X_test)
        y_pred = y_pred.flatten()
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Convert continuous predictions and true values into binary categories
    y_pred_binary = np.abs(y_pred - y_test) <= threshold
    y_true_binary = np.abs(y_test - y_test) <= threshold  # This will be all True

    accuracy = accuracy_score(y_true_binary, y_pred_binary)
    precision = precision_score(y_true_binary, y_pred_binary)

    results.append((name, mae, mse, r2, accuracy, precision))

    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print("\n" + "="*60 + "\n")

# Select the top-performing models based on initial results (e.g., top 3 based on R^2)
top_models = sorted(results, key=lambda x: x[3], reverse=True)[:3]

# Hyperparameter tuning for top models
tuned_models = {}

for name, _, _, _, _, _ in top_models:
    if name == 'Random Forest Regression':
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        grid_search = GridSearchCV(estimator=models[name], param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = RandomForestRegressor(**best_params)
        tuned_models[name] = best_model

    elif name == 'Gradient Boosting':
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0]
        }
        grid_search = GridSearchCV(estimator=models[name], param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = GradientBoostingRegressor(**best_params)
        tuned_models[name] = best_model

    elif name == 'eXtreme Gradient Boosting':
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0]
        }
        grid_search = GridSearchCV(estimator=models[name], param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_model = xgb.XGBRegressor(**best_params)
        tuned_models[name] = best_model

# Train and evaluate tuned models
for name, model in tuned_models.items():
    model.fit(X_train, y_train)
    y_pred_best = model.predict(X_test)

    mae_best = mean_absolute_error(y_test, y_pred_best)
    mse_best = mean_squared_error(y_test, y_pred_best)
    r2_best = r2_score(y_test, y_pred_best)

    y_pred_best_binary = np.abs(y_pred_best - y_test) <= threshold
    accuracy_best = accuracy_score(y_true_binary, y_pred_best_binary)
    precision_best = precision_score(y_true_binary, y_pred_best_binary)

    print(f"Best Model: {name}")
    print(f"Best Model Mean Absolute Error: {mae_best}")
    print(f"Best Model Mean Squared Error: {mse_best}")
    print(f"Best Model R^2 Score: {r2_best}")
    print(f"Best Model Accuracy: {accuracy_best}")
    print(f"Best Model Precision: {precision_best}")
    print("\n" + "="*60 + "\n")


Training Linear Regression...
Model: Linear Regression
Mean Absolute Error: 801.3586987066968
Mean Squared Error: 1671178.6209358748
R^2 Score: 0.07366543436660122
Accuracy: 0.007987220447284345
Precision: 1.0


Training Decision Tree...
Model: Decision Tree
Mean Absolute Error: 735.4447310969115
Mean Squared Error: 2843777.399083977
R^2 Score: -0.5763062480199195
Accuracy: 0.29233226837060705
Precision: 1.0


Training Support Vector Regression...
Model: Support Vector Regression
Mean Absolute Error: 513.0188346740177
Mean Squared Error: 2014455.5454631133
R^2 Score: -0.1166130174938178
Accuracy: 0.06230031948881789
Precision: 1.0


Training Gradient Boosting...
Model: Gradient Boosting
Mean Absolute Error: 598.3204277432214
Mean Squared Error: 1402396.4517690702
R^2 Score: 0.22265143191706194
Accuracy: 0.007987220447284345
Precision: 1.0


Training eXtreme Gradient Boosting...
Model: eXtreme Gradient Boosting
Mean Absolute Error: 561.7660127517437
Mean Squared Error: 1369966.444069211

  model = cd_fast.enet_coordinate_descent(


Model: Lasso Regression
Mean Absolute Error: 779.0633141169228
Mean Squared Error: 1629099.7015929585
R^2 Score: 0.09698978580547646
Accuracy: 0.007987220447284345
Precision: 1.0


Training Ridge Regression...
Model: Ridge Regression
Mean Absolute Error: 797.1247292781545
Mean Squared Error: 1659598.9182071858
R^2 Score: 0.08008406536329082
Accuracy: 0.004792332268370607
Precision: 1.0


Training ElasticNet Regression...
Model: ElasticNet Regression
Mean Absolute Error: 698.1144541193408
Mean Squared Error: 1551599.3934018398
R^2 Score: 0.13994821850997752
Accuracy: 0.009584664536741214
Precision: 1.0


Training K-Nearest Neighbors...
Model: K-Nearest Neighbors
Mean Absolute Error: 505.52031948881785
Mean Squared Error: 1422985.185641444
R^2 Score: 0.21123909357711446
Accuracy: 0.1853035143769968
Precision: 1.0


Training Random Forest Regression...
Model: Random Forest Regression
Mean Absolute Error: 599.5202575209189
Mean Squared Error: 1379913.0820989949
R^2 Score: 0.235113967169960

324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "/home/redleaf/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/redleaf/.local/lib/python3.10/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/home/redleaf/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/redleaf/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constra

Best Model: eXtreme Gradient Boosting
Best Model Mean Absolute Error: 571.0530097324933
Best Model Mean Squared Error: 1379320.1282151337
Best Model R^2 Score: 0.23544264159863437
Best Model Accuracy: 0.0463258785942492
Best Model Precision: 1.0


Best Model: Random Forest Regression
Best Model Mean Absolute Error: 582.5843492666971
Best Model Mean Squared Error: 1305389.3905205105
Best Model R^2 Score: 0.2764224608300181
Best Model Accuracy: 0.012779552715654952
Best Model Precision: 1.0


