In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear-regression baseline that predicts DS_S from the other selected
# columns, then report RMSE, R2 and the share of test predictions that fall
# within a fixed absolute tolerance of the actual value.

# Load the dataset (the path has no directory part, so it is resolved against
# the kernel's current working directory)
dataset = pd.read_csv('DSDataLastThreeMonths.csv')

# Columns kept for modelling; DS_S is the target, everything else is a feature
selected_columns = ['HM_WT', 'AIM_S', 'HM_S', 'HM_C', 'HM_SI', 'HM_TI', 'HM_MN',
                    'CAC2', 'MG', 'HM_TEMP', 'CAC2_INJ_TIME', 'MG_INJ_TIME', 'DS_S']

dataset = dataset[selected_columns]

# Drop every row with a missing value in any selected column, so that X and y
# stay row-aligned and the estimator never sees NaN
dataset = dataset.dropna()

# Split the dataset into input features (X) and target variable (y)
X = dataset.drop('DS_S', axis=1)
y = dataset['DS_S']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore every metric below, reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the linear regression model on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out test set (one prediction per row of X_test)
y_pred = model.predict(X_test)

# Root Mean Squared Error (RMSE).
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the plain MSE
# yields the same value and runs on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# R-squared (R2) score
r2 = r2_score(y_test, y_pred)

# Model hit rate: percentage of test rows whose absolute prediction error
# |Pred DS_S - Act DS_S| is at most `tolerance`. The tolerance is applied as an
# absolute difference in the units DS_S is stored in.
tolerance = 0.003
# Subtracting from the Series keeps y_test's index on the boolean mask
within_tolerance = (y_test - y_pred).abs() <= tolerance
hit_rate = (within_tolerance.sum() / len(y_test)) * 100

# Strike rate.
# NOTE(review): the previous formula divided the same hit count by len(y_pred),
# which always equals len(y_test), so it could never differ from hit_rate. It
# is kept as an explicit alias so the printed summary is unchanged; confirm the
# intended definition of "strike rate" and replace this line if it was meant to
# measure something else.
strike_rate = hit_rate

# Share summary of the model's performance
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Model hit rate: {hit_rate:.2f}%")
print(f"Strike rate: {strike_rate:.2f}%")

Root Mean Squared Error (RMSE): 0.0022
R-squared (R2) score: 0.3766
Model hit rate: 85.15%
Strike rate: 85.15%
