In [17]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib


# Load the training and test draws from their SQLite databases.
#
# pd.read_sql_query returns a fully materialised DataFrame, so each connection
# is closed as soon as its table has been read (and also if the read fails),
# instead of being left open for the rest of the script.

# Training data. `train_conn` stays bound because the script closes it again
# after training; closing an already-closed sqlite3 connection is harmless.
train_conn = sqlite3.connect('MegaMillions_Train.db')
query_train = "SELECT * FROM Interval_Data"
try:
    df_train = pd.read_sql_query(query_train, train_conn)
finally:
    train_conn.close()

# Test data. Previously this connection was never closed.
test_conn = sqlite3.connect('MegaMillions_Test.db')
query_test = "SELECT * FROM Interval_Data"
try:
    df_test = pd.read_sql_query(query_test, test_conn)
finally:
    test_conn.close()

# ------------------- TRAIN --------------------
# Build date-based features from the training draws and fit one random forest
# per target column (the five main numbers plus the Mega Ball).

# Parse 'Draw Date' once, then derive the Day / Month / Year feature columns from it
draw_dates_train = pd.to_datetime(df_train['Draw Date'])
df_train['Draw Date'] = draw_dates_train
for part_name, part_attr in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
    df_train[part_name] = getattr(draw_dates_train.dt, part_attr)

# 'Winning Numbers' is treated as a single-space-separated string of integers;
# each position becomes its own column, named Number_1, Number_2, ...
split_numbers_train = df_train['Winning Numbers'].str.split(' ', expand=True)
winning_numbers_train = split_numbers_train.astype(int)
winning_numbers_train.columns = [
    f'Number_{position}'
    for position in range(1, winning_numbers_train.shape[1] + 1)
]

# Attach the per-position number columns next to the original columns
df_train_processed = pd.concat([df_train, winning_numbers_train], axis=1)

# Features (X_train) and targets (y_train)
feature_columns_train = ['Day', 'Month', 'Year', 'Multiplier']
target_columns_train = ['Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5', 'Mega Ball']
X_train = df_train_processed[feature_columns_train]
y_train = df_train_processed[target_columns_train]

# One independent regressor per target column; each fitted model is kept in
# `models` (keyed by column name) and also persisted to '<column>_model.pkl'
models = {}
for col in y_train.columns:
    model = RandomForestRegressor(max_depth=32, n_estimators=1000, random_state=42)
    models[col] = model.fit(X_train, y_train[col])  # fit() returns the estimator itself
    joblib.dump(model, f'{col}_model.pkl')

# Disconnect from the training data SQLite database
train_conn.close()

# ------------------ TEST -------------------
# Score the per-number models in `models` against the draws in df_test.

# Derive the Day / Month / Year feature columns from 'Draw Date'
df_test['Draw Date'] = pd.to_datetime(df_test['Draw Date'])
df_test['Day'] = df_test['Draw Date'].dt.day
df_test['Month'] = df_test['Draw Date'].dt.month
df_test['Year'] = df_test['Draw Date'].dt.year

# 'Winning Numbers' is treated as a single-space-separated string of integers;
# each position becomes its own column, named Number_1, Number_2, ...
winning_numbers_test = df_test['Winning Numbers'].str.split(' ', expand=True).astype(int)
winning_numbers_test.columns = [f'Number_{i+1}' for i in range(winning_numbers_test.shape[1])]

# Attach the per-position number columns next to the original columns
df_test_processed = pd.concat([df_test, winning_numbers_test], axis=1)

# Features (X_test) and actual drawn values (y_test), including the Mega Ball
X_test = df_test_processed[['Day', 'Month', 'Year', 'Multiplier']]
y_test = df_test_processed[['Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5', 'Mega Ball']]

# One prediction column per trained model, named after the model's target
predictions_test = pd.DataFrame()
for col, model in models.items():
    predictions_test[col] = model.predict(X_test)

# mean_squared_error compares columns by position, not by label, so take the
# actual values from y_test in exactly the order of the prediction columns.
actual_numbers = y_test[list(predictions_test.columns)]

# Default multi-output behaviour: a single MSE averaged uniformly over all targets
mse = mean_squared_error(actual_numbers, predictions_test)
print(f"Mean Squared Error averaged over all numbers including Mega Ball: {mse}")

# Per-target breakdown, so a single badly predicted column is not hidden by the average
mse_per_number = mean_squared_error(actual_numbers, predictions_test, multioutput='raw_values')
for target_name, target_mse in zip(predictions_test.columns, mse_per_number):
    print(f"  {target_name}: MSE = {target_mse:.4f}, RMSE = {np.sqrt(target_mse):.4f}")

# RMSE of the averaged MSE (same units as the drawn numbers)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) including Mega Ball: {rmse}")

# Express the RMSE as a percentage of the assumed number range.
# NOTE(review): one divisor is applied to all six targets; the Mega Ball may be
# drawn from a different range than the main numbers - confirm 70 is appropriate.
possible_range = 70  # Assuming the lottery numbers range from 1 to 70
percentage_error = (rmse / possible_range) * 100
print(f"Percentage Error relative to the range of possible values including Mega Ball: {percentage_error:.2f}%")


Mean Squared Error for each number including Mega Ball: 156.35190275513426
Root Mean Squared Error (RMSE) including Mega Ball: 12.50407544583502
Percentage Error relative to the range of possible values including Mega Ball: 16.67%
