In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import streamlit as st
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv('EuroMillions_numbers.csv',delimiter = ';')

In [3]:
df['Date'] = pd.to_datetime(df['Date'],format= '%Y-%m-%d')

In [4]:
df1 = pd.read_excel('lot_21.xlsx')

In [5]:
from datetime import datetime
import re
def clean_date(i):
    i = str(i)
    i = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', i)
    return datetime.strptime(i, "%A %d %B %Y").strftime("%Y-%m-%d")


In [6]:
df1['Date'] = df1['Date'].apply(str)
df1['Date'] = df1['Date'].apply(clean_date)

In [None]:
df1.rename(columns = {'Money':'Gain'},inplace = True)

In [13]:
df_ex = pd.read_excel('lot_21.xlsx')

In [15]:
df_ex

Unnamed: 0,Date,N1,N2,N3,N4,N5,E1,E2,Ticket,Money
0,Friday 31st December 2021,7,3,25,43,49,6,7,10 Millionaire Maker Codes,14788260
1,Tuesday 28th December 2021,4,6,15,17,29,9,12,XMPF 88607,66903928
2,Friday 24th December 2021,2,26,28,34,41,5,10,XHMH 77368,58741896
3,Tuesday 21st December 2021,12,17,21,34,38,2,3,XKNF 58280,44829384
4,Friday 17th December 2021,1,2,15,30,35,2,7,JHML 19422,35175553
...,...,...,...,...,...,...,...,...,...,...
410,Tuesday 16th January 2024,10,18,21,33,45,8,12,XFKD 58687,75779937
411,Friday 12th January 2024,16,17,18,45,49,9,12,10 Millionaire Maker Codes,67303737
412,Tuesday 9th January 2024,2,9,12,39,40,1,3,HCJM 97398,55675221
413,Friday 5th January 2024,4,7,18,39,50,3,8,HBHT 10647,46379442


In [18]:
df_all = df[~df.Date.isin(df_ex)]

In [20]:
dfx = pd.concat([df1,df_all],ignore_index = True)

In [22]:
dfx.head()

Unnamed: 0,Date,N1,N2,N3,N4,N5,E1,E2,Ticket,Money,Winner,Gain
0,2021-12-31,7,3,25,43,49,6,7,10 Millionaire Maker Codes,14788260.0,,
1,2021-12-28,4,6,15,17,29,9,12,XMPF 88607,66903928.0,,
2,2021-12-24,2,26,28,34,41,5,10,XHMH 77368,58741896.0,,
3,2021-12-21,12,17,21,34,38,2,3,XKNF 58280,44829384.0,,
4,2021-12-17,1,2,15,30,35,2,7,JHML 19422,35175553.0,,


In [24]:
dfx = dfx.drop('Gain',axis = 1)
dfx = dfx.drop('Ticket',axis = 1)
dfx = dfx.drop('Winner',axis = 1)

In [157]:
dfx['Date'] = pd.to_datetime(df['Date'],format= '%Y-%m-%d')

In [161]:
dfx = dfx.sort_values(by='Date')
dfx.head()

Unnamed: 0,Date,N1,N2,N3,N4,N5,E1,E2
4,2004-01-10,1,2,15,30,35,2,7
12,2004-02-13,22,26,38,47,50,2,6
7,2004-04-06,22,31,38,46,47,7,11
11,2004-05-03,17,21,36,42,46,7,10
9,2004-05-14,19,20,26,33,43,1,4


In [163]:
dfx['dayofWeek'] = dfx['Date'].dt.dayofweek

In [165]:
dfx['TimeIndex'] = np.arange(len(dfx))   

In [169]:
dfx = dfx.reset_index(drop=True)

In [171]:
dfx.head()

Unnamed: 0,Date,N1,N2,N3,N4,N5,E1,E2,dayofWeek,TimeIndex
0,2004-01-10,1,2,15,30,35,2,7,5.0,0
1,2004-02-13,22,26,38,47,50,2,6,4.0,1
2,2004-04-06,22,31,38,46,47,7,11,1.0,2
3,2004-05-03,17,21,36,42,46,7,10,0.0,3
4,2004-05-14,19,20,26,33,43,1,4,4.0,4


In [36]:
for col in ['N1', 'N2', 'N3', 'N4', 'N5', 'E1', 'E2']:
    dfx[f'{col}_lag1'] = dfx[col].shift(1)
    dfx[f'{col}_lag2'] = dfx[col].shift(2)
    dfx[f'{col}_rolling3'] = dfx[col].rolling(window=3).mean()
    dfx[f'{col}_rolling5'] = dfx[col].rolling(window=5).mean()

In [37]:
dfx = dfx.dropna()

In [38]:
X = dfx.drop(columns=['Date', 'N1', 'N2', 'N3', 'N4', 'N5', 'E1', 'E2'])
y = dfx[['N1', 'N2', 'N3', 'N4', 'N5', 'E1', 'E2']]

In [39]:
train_X, test_X = X[:-20], X[-20:]
train_y, test_y = y[:-20], y[-20:]

In [40]:
model = MultiOutputRegressor(XGBRegressor(random_state=42))
model.fit(train_X, train_y)

In [41]:
predictions = model.predict(test_X)
predictions = np.round(predictions).astype(int)  # Ensure integer predictions

In [42]:
predictions[:, 0:5] = np.clip(predictions[:, 0:5], 1, 50)  # Constrain N1-N5 to [1, 50]
predictions[:, 5:7] = np.clip(predictions[:, 5:7], 1, 12)  # Constrain E1-E2 to [1, 12]

In [43]:
rmse = np.sqrt(mean_squared_error(test_y, predictions, multioutput='raw_values'))
print(f"RMSE for each number: {rmse}")

RMSE for each number: [1.91049732 2.14476106 2.91547595 1.51657509 1.7175564  0.54772256
 0.59160798]


In [44]:
comparison = pd.DataFrame({
    'Date': dfx['Date'][-20:].values,
    'Actual_N1': test_y['N1'].values,
    'Predicted_N1': predictions[:, 0],
    'Actual_N2': test_y['N2'].values,
    'Predicted_N2': predictions[:, 1],
    'Actual_N3': test_y['N3'].values,
    'Predicted_N3': predictions[:, 2],
    'Actual_N4': test_y['N4'].values,
    'Predicted_N4': predictions[:, 3],
    'Actual_N5': test_y['N5'].values,
    'Predicted_N5': predictions[:, 4],
    'Actual_E1': test_y['E1'].values,
    'Predicted_E1': predictions[:, 5],
    'Actual_E2': test_y['E2'].values,
    'Predicted_E2': predictions[:, 6],
})

print("\nComparison of Actual vs Predicted Numbers:")
print(comparison)


Comparison of Actual vs Predicted Numbers:
         Date  Actual_N1  Predicted_N1  Actual_N2  Predicted_N2  Actual_N3  \
0  2024-10-15          2             3         15            14         32   
1  2024-10-18          4             6         15            16         23   
2  2024-10-22          3             5         13            13         27   
3  2024-10-25          4             6         17            18         20   
4  2024-10-29          1             3          2             4         19   
5  2024-11-01          7             6         33            27         38   
6  2024-11-05          1             2          8             8         19   
7  2024-11-08          2             5         33            29         35   
8  2024-11-12         10             9         11            15         12   
9  2024-11-15          7             6          8             9         34   
10 2024-11-19          4             6         13            14         20   
11 2024-11-22       