In [2]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib


# Connect to the SQLite database for training data
train_conn = sqlite3.connect('MegaMillions_Train.db')

# Query data from the database for training
query_train = "SELECT * FROM Interval_Data"
df_train = pd.read_sql_query(query_train, train_conn)

# Connect to the SQLite database for test data
test_conn = sqlite3.connect('MegaMillions_Test.db')

# Query data from the database for test
query_test = "SELECT * FROM Interval_Data"
df_test = pd.read_sql_query(query_test, test_conn)

In [31]:
def generate_numbers(model, features, num_sets):
    generated_sets = []
    for _ in range(num_sets):
        predicted_numbers = model.predict(features)
        generated_sets.append(predicted_numbers)
    return generated_sets

def train_and_evaluate(num_guesses):
    results = []
    for i in range(num_guesses):
        # Sample a subset of the data for training
        sample_train = df_train.sample(frac=0.8, replace=True)  # You can adjust the fraction as needed

        # Preprocess 'Draw Date' into separate day, month, and year columns for training data
        sample_train['Draw Date'] = pd.to_datetime(sample_train['Draw Date'])
        sample_train['Day'] = sample_train['Draw Date'].dt.day
        sample_train['Month'] = sample_train['Draw Date'].dt.month
        sample_train['Year'] = sample_train['Draw Date'].dt.year

        # Preprocess 'Winning Numbers' for training data - Assuming they are space-separated numbers
        winning_numbers_train = sample_train['Winning Numbers'].str.split(' ', expand=True).astype(int)
        winning_numbers_train.columns = [f'Number_{i + 1}' for i in range(winning_numbers_train.shape[1])]

        # Combine processed columns with the original DataFrame for training data
        sample_train_processed = pd.concat([sample_train, winning_numbers_train], axis=1)

        # Define features (X_train) and target variables (y_train) for training
        X_train = sample_train_processed[['Day', 'Month', 'Year']]
        y_train = sample_train_processed[['Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5', 'Mega Ball', 'Multiplier']]

        # Train a single model to predict all numbers together, including Mega Ball
        combined_model = RandomForestRegressor(n_estimators=100, max_depth=16, random_state=42)
        combined_model.fit(X_train, y_train)

        # Save the trained combined model to a file (if needed)
        joblib.dump(combined_model, f'MegaMillions_Model_{i + 1}.pkl')
        
        # -------------------- TEST MODEL ------------------------------------

        # Evaluate the model's performance
        # Assuming df_test contains the test data
        sample_test = df_test.sample(frac=0.2)  # Sample test data
        sample_test['Draw Date'] = pd.to_datetime(sample_test['Draw Date'])
        sample_test['Day'] = sample_test['Draw Date'].dt.day
        sample_test['Month'] = sample_test['Draw Date'].dt.month
        sample_test['Year'] = sample_test['Draw Date'].dt.year
        winning_numbers_test = sample_test['Winning Numbers'].str.split(' ', expand=True).astype(int)
        winning_numbers_test.columns = [f'Number_{i + 1}' for i in range(winning_numbers_test.shape[1])]
        sample_test_processed = pd.concat([sample_test, winning_numbers_test], axis=1)
        X_test = sample_test_processed[['Day', 'Month', 'Year']]
        y_test = sample_test_processed[['Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5', 'Mega Ball', 'Multiplier']]
        predictions_test_combined = combined_model.predict(X_test)
        predictions_test = pd.DataFrame(predictions_test_combined, columns=['Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5', 'Mega Ball', 'Multiplier'])
        
        # Round the predicted values
        rounded_predictions = predictions_test.round().astype(int)
        
         # Check for correct predictions
        correct_predictions = []
        for index in y_test.index:
            if index in rounded_predictions.index:
                actual_numbers = set(y_test.loc[index, 'Number_1':'Number_5'])  # Extract white ball numbers
                predicted_numbers = set(rounded_predictions.loc[index, 'Number_1':'Number_5'])  # Extract rounded predicted numbers
                if any(num in predicted_numbers for num in actual_numbers):
                    correct_predictions.append(index)

        print("\nIndices with at least one correct prediction:")
        print(correct_predictions)
        
        if correct_predictions:
            print("\nValues for these indices with matched numbers:")
            for index in correct_predictions:
                actual_numbers = set(y_test.loc[index, 'Number_1':'Number_5'])  # Extract white ball numbers
                predicted_numbers = set(rounded_predictions.loc[index, 'Number_1':'Number_5'])  # Extract rounded predicted numbers
                matched_numbers = actual_numbers.intersection(predicted_numbers)
                print(f"Index {index}: Matched Numbers - {matched_numbers}")
        
        mse_combined = mean_squared_error(y_test, predictions_test)
        rmse_combined = np.sqrt(mse_combined)
        
        # Define the possible range based on the lottery rules
        white_ball_range = 70  # Example: Range of white ball numbers (1-70)
        mega_ball_range = 25   # Example: Range of Mega Ball numbers (1-25)

        possible_range = white_ball_range + mega_ball_range  # Total possible range for all numbers

        percentage_error_combined = (rmse_combined / possible_range) * 100

        print(f"Iteration {i + 1}:")
        print("=============================================")

    return results

# Set the number of guesses you want to generate and evaluate
num_guesses = 5
results = train_and_evaluate(num_guesses)


Indices with at least one correct prediction:
[17, 126, 60, 11, 122, 57, 54, 22]

Values for these indices with matched numbers:
Index 17: Matched Numbers - {37}
Index 126: Matched Numbers - {26}
Index 60: Matched Numbers - {35}
Index 11: Matched Numbers - {57}
Index 122: Matched Numbers - {44, 22}
Index 57: Matched Numbers - {24}
Index 54: Matched Numbers - {49}
Index 22: Matched Numbers - {45}
Iteration 1:

Indices with at least one correct prediction:
[49, 78, 80, 32, 54, 73, 18, 4, 45]

Values for these indices with matched numbers:
Index 49: Matched Numbers - {13}
Index 78: Matched Numbers - {38}
Index 80: Matched Numbers - {38}
Index 32: Matched Numbers - {17}
Index 54: Matched Numbers - {24, 9}
Index 73: Matched Numbers - {34, 13, 46}
Index 18: Matched Numbers - {48}
Index 4: Matched Numbers - {13}
Index 45: Matched Numbers - {24, 34}
Iteration 2:

Indices with at least one correct prediction:
[118, 33, 101, 20, 90, 113, 22, 115]

Values for these indices with matched numbers:
