In [1]:
import pandas as pd
import sqlite3
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the training draws from SQLite. The connection is only needed for the
# read, so it is closed in a `finally` immediately afterwards: a failing query
# can no longer leak the handle, and the database is not kept open while the
# (slow) forest is being trained.
query_train = "SELECT * FROM Interval_Data"
train_conn = sqlite3.connect('MegaMillions_Train.db')
try:
    df_train = pd.read_sql_query(query_train, train_conn)
finally:
    train_conn.close()

# Expand 'Draw Date' into separate numeric day, month and year feature columns
df_train['Draw Date'] = pd.to_datetime(df_train['Draw Date'])
df_train['Day'] = df_train['Draw Date'].dt.day
df_train['Month'] = df_train['Draw Date'].dt.month
df_train['Year'] = df_train['Draw Date'].dt.year

# Split 'Winning Numbers' into one integer column per number. Splitting with
# no explicit separator breaks on runs of whitespace, so numbers separated by
# more than one space no longer yield empty strings that make astype(int) fail.
winning_numbers_train = df_train['Winning Numbers'].str.split(expand=True).astype(int)
winning_numbers_train.columns = [f'Number_{i+1}' for i in range(winning_numbers_train.shape[1])]

# Attach the per-number columns to the original rows (aligned on the index)
df_train_processed = pd.concat([df_train, winning_numbers_train], axis=1)

# Features: date parts, multiplier and the five main numbers; target: Mega Ball
X_train = df_train_processed[['Day', 'Month', 'Year', 'Multiplier', 'Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5']]
y_train = df_train_processed['Mega Ball']

# Fit the random forest classifier; random_state is pinned to a fixed seed
clf = RandomForestClassifier(n_estimators=10000, random_state=42, max_depth=32)
clf.fit(X_train, y_train)

# Load the held-out draws from SQLite. The connection is closed in a `finally`
# as soon as the rows are in memory, so an error in the query (or in any later
# step) cannot leave the database handle open.
query_test = "SELECT * FROM Interval_Data"
test_conn = sqlite3.connect('MegaMillions_Test.db')
try:
    df_test = pd.read_sql_query(query_test, test_conn)
finally:
    test_conn.close()

# Expand 'Draw Date' into separate numeric day, month and year feature columns
# (same derivation as applied to the training data)
df_test['Draw Date'] = pd.to_datetime(df_test['Draw Date'])
df_test['Day'] = df_test['Draw Date'].dt.day
df_test['Month'] = df_test['Draw Date'].dt.month
df_test['Year'] = df_test['Draw Date'].dt.year

# Split 'Winning Numbers' into one integer column per number. Splitting with
# no explicit separator breaks on runs of whitespace, so numbers separated by
# more than one space no longer yield empty strings that make astype(int) fail.
winning_numbers_test = df_test['Winning Numbers'].str.split(expand=True).astype(int)
winning_numbers_test.columns = [f'Number_{i+1}' for i in range(winning_numbers_test.shape[1])]

# Attach the per-number columns to the original rows (aligned on the index)
df_test_processed = pd.concat([df_test, winning_numbers_test], axis=1)

# Feature columns, in the same order the classifier was trained on
X_test_processed = df_test_processed[['Day', 'Month', 'Year', 'Multiplier', 'Number_1', 'Number_2', 'Number_3', 'Number_4', 'Number_5']]

# Ground-truth Mega Ball values used to score the predictions
y_test_values = df_test_processed['Mega Ball']

# Predict the Mega Ball for every test draw with the trained classifier (clf)
predictions_test = clf.predict(X_test_processed)

# Report the fraction of test draws whose Mega Ball was predicted exactly
accuracy_test = accuracy_score(y_test_values, predictions_test)
print(f"Test Accuracy: {accuracy_test}")

Test Accuracy: 0.08056872037914692
