# In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor

# Read the training set and the hold-out test set from CSV files
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

def preprocess_data(data):
    """Return a copy of ``data`` with date and categorical columns made numeric.

    The ``'Auction Date'`` column is parsed with ``pd.to_datetime``, cast to
    ``int64`` and divided by ``10 ** 9`` (seconds since the Unix epoch when
    the parsed datetimes have nanosecond resolution).

    Each categorical column is cast to ``str`` and integer-encoded with a
    ``LabelEncoder`` kept in the module-level ``label_encoders`` dict. The
    first call that sees a column fits a new encoder and stores it; later
    calls reuse the stored encoder, so the training data must be processed
    before the test data for both to share one mapping.

    Args:
        data: DataFrame containing ``'Auction Date'``, ``'Distillery'``,
            ``'Brand'``, ``'Type'``, ``'Packing'`` and ``'Country'`` columns.

    Returns:
        A new DataFrame with the listed columns replaced by numeric values.
        The input DataFrame is left unmodified.

    Note:
        ``label_encoders`` must exist at module level before this function is
        called. Reusing a stored encoder on values it was not fitted on is
        expected to raise ``ValueError`` from ``LabelEncoder.transform``.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    data = data.copy()

    # Convert date data to timestamps
    date_columns = ['Auction Date']
    for col in date_columns:
        data[col] = pd.to_datetime(data[col]).astype('int64') / 10 ** 9

    # Use LabelEncoder to process categorical data
    categorical_columns = ['Distillery', 'Brand', 'Type', 'Packing', 'Country']
    for col in categorical_columns:
        values = data[col].astype(str)
        le = label_encoders.get(col)
        # Explicit None check: "no encoder stored yet" should not depend on
        # the truthiness of the encoder object.
        if le is None:
            le = LabelEncoder()
            data[col] = le.fit_transform(values)
            label_encoders[col] = le
        else:
            data[col] = le.transform(values)
    return data

# Registry of fitted encoders, filled while processing the training data and
# reused for the test data so both share the same category-to-integer mapping
label_encoders = {}
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Split each set into the feature matrix and the 'Result' target column
X_train, y_train = train_data.drop(columns='Result'), train_data['Result']
X_test, y_test = test_data.drop(columns='Result'), test_data['Result']

# Base XGBoost regressor whose hyperparameters are tuned below
model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Candidate values for every tuned hyperparameter (searched as a full grid)
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5, 6],
    n_estimators=[50, 100, 150],
    subsample=[0.8, 0.9, 1],
    colsample_bytree=[0.8, 0.9, 1],
    gamma=[0, 0.1, 0.2],
    reg_lambda=[0.5, 1, 1.5],
    reg_alpha=[0, 0.1, 0.2],
)

# Exhaustive grid search scored by negative MSE, using 5-fold cross
# validation repeated 3 times, run on all available cores
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=rkf,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best Hyperparameters:", grid_search.best_params_)

# Retrieve the model trained with the best hyperparameters.
# The grid search above does not pass `refit`, and GridSearchCV defaults to
# refit=True, so best_estimator_ has already been refitted on the full
# training set. Fitting it again here would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Coefficient of Determination (R^2): {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Visual analysis: predictions against actual values, with the identity line
# (perfect prediction) drawn in red for reference
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, y_pred, alpha=0.5)
value_range = [min(y_test), max(y_test)]
ax.plot(value_range, value_range, color='red')  # Diagonal line
ax.set_title('Actual Value vs Predicted Value')
ax.set_xlabel('Actual Value')
ax.set_ylabel('Predicted Value')
plt.show()

# 1. Residual plot: residual (actual minus predicted) against the prediction
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='red')
ax.set_title('Residual Plot')
ax.set_xlabel('Predicted Value')
ax.set_ylabel('Residual')
plt.show()

# 2. Histogram of residuals
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(residuals, bins=30, edgecolor='k')
ax.set_title('Histogram of Residuals')
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
plt.show()

# 3. Feature importance plot
# Read the importances from best_model, the estimator actually fitted by the
# grid search. GridSearchCV only fits clones of `model`, so `model` itself is
# never trained and has no feature importances to report.
feature_importances = best_model.feature_importances_
features = X_train.columns
plt.figure(figsize=(12, 8))
plt.barh(features, feature_importances, align='center')
plt.title('Feature Importance')
plt.xlabel('Significance')
plt.ylabel('Diagnostic Property')
plt.show()

print(feature_importances)

# 1. Box plots of predicted versus actual values
# Labels are listed in the same order as the data: y_test holds the actual
# values and y_pred the model's predictions.
plt.figure(figsize=(10, 6))
plt.boxplot([y_test, y_pred], vert=False, labels=['Actual Value', 'Predicted Value'])
plt.title('Box Plots of Predicted versus Actual Values')
plt.xlabel('Value')
plt.show()

# learning_curve and validation_curve come from sklearn.model_selection and
# must be imported at the top of the file. No `scoring` is passed to either
# call, so the plotted score is the estimator's default score.
def _plot_score_curves(x_values, train_scores, valid_scores, title, xlabel):
    """Plot fold-averaged training and cross-validation scores against x_values."""
    plt.figure(figsize=(10, 6))
    plt.plot(x_values, train_scores.mean(axis=1), 'o-', color="r", label="Training Score")
    plt.plot(x_values, valid_scores.mean(axis=1), 'o-', color="g", label="Cross-validation Score")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Score')
    plt.legend(loc="best")
    plt.show()


# 2. Learning curve: score as a function of the number of training samples
# (same seed as the tuned model so the curves are reproducible)
train_sizes, train_scores, valid_scores = learning_curve(
    XGBRegressor(random_state=42),
    X_train,
    y_train,
    train_sizes=[0.1, 0.33, 0.55, 0.78, 1.],
    cv=5,
)
_plot_score_curves(
    train_sizes, train_scores, valid_scores,
    title='Learning Curve',
    xlabel='Number of Training Samples',
)

# 3. Validation curve (example using max_depth of XGBoost)
param_range = np.arange(1, 10, 1)
train_scores, valid_scores = validation_curve(
    XGBRegressor(random_state=42),
    X_train,
    y_train,
    param_name="max_depth",
    param_range=param_range,
    cv=5,
)
_plot_score_curves(
    param_range, train_scores, valid_scores,
    title='Validation Curve - max_depth',
    xlabel='max_depth',
)

# Initialize JavaScript visualization code for SHAP
shap.initjs()

# Explain best_model, the estimator actually fitted by the grid search.
# GridSearchCV only fits clones of `model`, so `model` itself is never trained
# and cannot be explained.
# NOTE(review): no background dataset is passed to the explainer; confirm how
# the installed SHAP version handles feature_perturbation='interventional'
# without one.
explainer = shap.TreeExplainer(best_model, feature_perturbation='interventional')

# Calculate SHAP values
shap_values = explainer.shap_values(X_train, check_additivity=False)

# Global summary, then one dependence plot per feature
shap.summary_plot(shap_values, X_train)
for feature in X_train.columns:
    shap.dependence_plot(feature, shap_values, X_train, display_features=X_train)
# For the first instance in the dataset
shap.force_plot(explainer.expected_value, shap_values[0, :], X_train.iloc[0, :])


# Output: ModuleNotFoundError: No module named 'xgboost'