In [71]:
import pandas as pd
import sqlite3

# Read data from the SQLite database
conn = sqlite3.connect('MegaMillions.db')
query = "SELECT * FROM MegaMillions"
data = pd.read_sql_query(query, conn)
conn.close()

# Sort data by date or index (assuming there's a column named 'Date')
data.sort_values(by='Draw Date', inplace=True)  # Replace 'Date' with the appropriate column name

# Calculate the index to split the data (keeping the first 80%)
split_index = int(0.8 * len(data))
train_data = data.iloc[:split_index]
test_data = data.iloc[split_index:]  # Selecting the last 20% for testing

# Create a new SQLite database for testing features
test_conn = sqlite3.connect('MegaMillions_Test.db')

test_data.to_sql('Interval_Data', test_conn, if_exists='replace', index=False)

test_conn.close()


# Create a new SQLite database for training features
train_conn = sqlite3.connect('MegaMillions_Train.db')

train_data.to_sql('Interval_Data', train_conn, if_exists='replace', index=False)

# Split the Winning Numbers column into individual numbers
train_data = train_data.copy()
train_data['Numbers'] = train_data['Winning Numbers'].apply(lambda x: [int(num) for num in x.split()])


# Calculate Number Frequencies
number_frequencies = {}
for i in range(1, 6):
    col_name = f'Ball{i}'
    numbers = train_data['Numbers'].apply(lambda x: x[i - 1])
    number_counts = numbers.value_counts().to_dict()
    number_frequencies[col_name] = number_counts

# Calculate Hot Numbers
hot_numbers = {}
for col, counts in number_frequencies.items():
    hot_numbers[col] = sorted(counts, key=counts.get, reverse=True)[:5]  # Get top 5 most frequent numbers

# Calculate Cold Numbers
cold_numbers = {}
for col, counts in number_frequencies.items():
    cold_numbers[col] = sorted(counts, key=counts.get)[:5]  # Get top 5 least frequent numbers

# Save calculated features to the 'MegaMillions_Train.db' database
pd.DataFrame(number_frequencies).to_sql('NumberFrequencies', train_conn, index=False)
pd.DataFrame(hot_numbers).to_sql('HotNumbers', train_conn, index=False)
pd.DataFrame(cold_numbers).to_sql('ColdNumbers', train_conn, index=False)

# Calculate Overdue Numbers
overdue_numbers = {}
for col, counts in number_frequencies.items():
    last_number = train_data['Numbers'].apply(lambda x: x[i - 1]).iloc[-1]
    overdue_numbers[col] = [num for num, count in counts.items() if count == 1 and num < last_number]

# Save overdue numbers to the database if there are any
if any(overdue_numbers.values()):
    pd.DataFrame(overdue_numbers).to_sql('OverdueNumbers', train_conn, index=False)

# Calculate Transition Matrix
transition_matrix = pd.DataFrame(0, index=range(1, 76), columns=range(1, 76))

for numbers in train_data['Numbers']:
    for i in range(len(numbers) - 1):
        transition_matrix.loc[numbers[i], numbers[i+1]] += 1

transition_matrix.to_sql('TransitionMatrix', train_conn)

# Calculate Tightness Test
tightness_test = {}
for col, counts in number_frequencies.items():
    mean_frequency = sum(counts.values()) / len(counts)
    deviations = [(count - mean_frequency) ** 2 for count in counts.values()]
    standard_deviation = (sum(deviations) / len(deviations)) ** 0.5
    tightness_test[col] = standard_deviation

pd.DataFrame(tightness_test, index=[0]).to_sql('TightnessTest', train_conn, index=False)

train_conn.close()


In [78]:
import pandas as pd
import sqlite3

# Connect to the SQLite database
train_conn = sqlite3.connect('MegaMillions_Train.db')

# Get a cursor object
cursor = train_conn.cursor()

# Get all table names in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Output the data in each table
for table in tables:
    table_name = table[0]
    print(f"\nData in table '{table_name}':")

    
    # Fetch and print first 5 rows
    cursor.execute(f"SELECT * FROM {table_name} LIMIT 5;")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

# Close the database connection
train_conn.close()



Data in table 'Interval_Data':
('01/01/2008', '13 16 25 30 54', 11, None)
('01/01/2008', '13 16 25 30 54', 11, None)
('01/01/2008', '13 16 25 30 54', 11, None)
('01/01/2010', '06 08 27 40 41', 21, None)
('01/01/2010', '06 08 27 40 41', 21, None)

Data in table 'NumberFrequencies':
(291.0, 21.0, None, None, None)
(279.0, 21.0, None, None, None)
(263.0, None, None, None, None)
(237.0, 66.0, 6.0, 3.0, None)
(234.0, 57.0, 6.0, None, None)

Data in table 'HotNumbers':
(2, 17, 31, 48, 50)
(3, 16, 28, 46, 52)
(1, 14, 26, 40, 55)
(7, 15, 30, 38, 56)
(4, 12, 25, 45, 64)

Data in table 'ColdNumbers':
(40, 59, 65, 12, 26)
(41, 61, 7, 10, 21)
(38, 62, 66, 8, 24)
(39, 51, 57, 7, 22)
(37, 48, 6, 5, 25)

Data in table 'TransitionMatrix':
(1, 0, 21, 9, 24, 12, 12, 6, 9, 18, 24, 6, 9, 9, 3, 5, 12, 9, 3, 6, 3, 6, 3, 0, 3, 0, 6, 9, 0, 3, 3, 3, 6, 0, 3, 0, 6, 3, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
(2, 0, 0, 12, 27, 21, 24, 18, 