Import Library

In [18]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

Proses Scraping Data

In [19]:
# Download one results page per year (2015 through 2025) and keep the first
# HTML table found on each page. Years that fail are reported and skipped so a
# single bad page does not abort the whole collection.
all_df = []

for year in range(2015, 2026):
    url = f'https://lottery.hk/en/mark-six/results/{year}'

    try:
        # read_html returns a list of every table on the page; the first one
        # is taken as the results table.
        data = pd.read_html(url)[0]
    except Exception as e:
        # Best-effort collection: continue with the remaining years, but name
        # the year that failed so the gap in the dataset can be traced.
        print(f'Error collecting {year}: {e}')
    else:
        all_df.append(data)
        print(f'Success Collected {data.shape[0]} Data {year}')

print('Scrapping done!')


Success Collected 164 Data 2015
Success Collected 163 Data 2016
Success Collected 165 Data 2017
Success Collected 161 Data 2018
Success Collected 156 Data 2019
Success Collected 36 Data 2020
Success Collected 135 Data 2021
Success Collected 123 Data 2022
Success Collected 158 Data 2023
Success Collected 152 Data 2024
Success Collected 123 Data 2025
Scrapping done!


Penggabungan Data Hasil Scraping

In [20]:
# Stack the per-year tables into a single frame with a fresh 0..n-1 index.
# When nothing was collected, only a message is printed and `df` is not created.
if not all_df:
    print("Tidak ada data yang tersedia")
else:
    df = pd.concat(all_df, ignore_index=True)
    print(df.shape)

(1536, 4)


Proses Menjadikan List ke Dataframe

In [21]:
# `df` is already a DataFrame here; passing `columns=` to the constructor
# selects these four labels in this order (a label that is absent from the
# scraped table would come back as an all-NaN column).
expected_columns = ["Draw Number", "Draw Date", "Balls Drawn", "Detail"]
df = pd.DataFrame(df, columns=expected_columns)

Pengubahan Struktur pada Kolom Date  

In [22]:
# Parse day/month/year strings; values that do not match the format become NaT
# instead of raising (errors='coerce').
parsed_dates = pd.to_datetime(df['Draw Date'], format='%d/%m/%Y', errors='coerce')
df['Draw Date'] = parsed_dates

Proses Menghilangkan Baris Tidak Penting

In [23]:
# Rows whose 'Balls Drawn' cell contains a month name (any case) are removed.
# With na=False a missing cell counts as "no match", so such rows are kept.
month_pattern = 'January|February|March|April|May|June|July|August|September|October|November|December'
has_month_name = df['Balls Drawn'].str.contains(month_pattern, case=False, na=False)

# Keep the remaining rows and discard the two columns not used afterwards.
df = df[~has_month_name].drop(columns=['Detail', 'Draw Number'])

One Hot Encoding pada Fitur Ball Draws

In [24]:
# Turn each whitespace-separated draw string into a list of ints.
df['Balls_List'] = df['Balls Drawn'].apply(
    lambda draw: [int(token) for token in draw.split()]
)

# One indicator column per number 1..49; a cell is 1 when that number occurs
# anywhere in the row's draw string.
mlb = MultiLabelBinarizer(classes=range(1, 50))
balls_encoded = mlb.fit_transform(df['Balls_List'])

onehot_names = [f'num_{number}' for number in range(1, 50)]
balls_onehot = pd.DataFrame(balls_encoded, index=df.index, columns=onehot_names)

# Carry the raw draw string along under a new name for the later split step.
balls_onehot['ball_draws'] = df['Balls Drawn']

# Final frame for this step: the date column followed by the encoded columns.
date_only = df[['Draw Date']]
df = pd.concat([date_only, balls_onehot], axis=1)


Pemisahan Special Number & Regular Number

In [25]:
# Expand the draw string into one column per token.
split_cols = df['ball_draws'].str.split(expand=True)

# The last expanded column is treated as the special number; every column
# before it holds the regular numbers.
special_part = split_cols.iloc[:, -1]
regular_part = split_cols.iloc[:, :-1]

df['special_num'] = special_part
# Re-join the regular numbers with single spaces, skipping empty slots.
df['ball_draws'] = regular_part.apply(lambda row: ' '.join(row.dropna()), axis=1)

Pengubahan Nama Draw Date

In [26]:
# Use a shorter name for the date column.
df = df.rename({'Draw Date': 'date'}, axis='columns')

# The special number was extracted as text; convert it to a numeric dtype.
special_numeric = pd.to_numeric(df['special_num'])
df['special_num'] = special_numeric

Proses Sorting Berdasarkan Date

In [27]:
# Order the draws chronologically, oldest first.
df = df.sort_values(by='date', ascending=True)

Feature Extraction pada Kolom Date

In [28]:
# Derive calendar features from the draw date through the datetime accessor.
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek
df['day_of_year'] = date_parts.dayofyear

Penyetelan Ulang Index Dataset

In [29]:
# Move the draw date out of the columns and use it as the row index.
df = df.set_index(keys='date', drop=True)

Features Extraction untuk Menentukan Frekuensi Bola yang Keluar

In [None]:
# Build, for every draw, the number of times each ball 1..49 appeared in the
# rows above it, plus a few aggregate descriptors of the draw's regular numbers.

# Regular numbers as lists of ints, and the special number as a 1-element list.
main_balls_list = df['ball_draws'].str.split(' ').apply(lambda x: [int(n) for n in x])
special_ball_list = df['special_num'].apply(lambda x: [int(x)])
# Element-wise list concatenation: regular numbers followed by the special one.
all_numbers_list = main_balls_list + special_ball_list

# Indicator matrix: one row per draw, one column per number 1..49.
mlb = MultiLabelBinarizer(classes=range(1, 50))
y_array = mlb.fit_transform(all_numbers_list)

y_binary = pd.DataFrame(y_array, columns=mlb.classes_, index=df.index)

# Running totals, shifted down one row so that a draw only sees the counts
# accumulated by the rows before it; the first row has no history and gets 0.
cumulative_counts = y_binary.cumsum()
shifted_counts = cumulative_counts.shift(1)
X_features_counts = shifted_counts.fillna(0).astype(int)

X_features_counts.columns = [f'b{i}_count' for i in range(1, 50)]

# Attach the counts row-by-row rather than with an index join: the index is
# the draw date, and a key-based join would multiply rows if any date value
# repeats (including NaT). Dropping stale count columns first also lets the
# cell be re-run without raising on overlapping column names.
df = df.drop(columns=X_features_counts.columns, errors='ignore')
df = pd.concat([df, X_features_counts], axis=1)

# Sum and mean of the regular numbers of each draw.
# NOTE(review): unlike the shifted counts above, the four features below are
# computed from the current row's own numbers; confirm this is intended.
df['total_sum'] = main_balls_list.apply(sum)
df['mean_num'] = main_balls_list.apply(lambda x: sum(x)/len(x))

# How many of the regular numbers are even / odd.
df['even_count'] = main_balls_list.apply(lambda x: sum(num % 2 == 0 for num in x))
df['odd_count'] = main_balls_list.apply(lambda x: sum(num % 2 != 0 for num in x))

Melihat Kolom

In [31]:
# Display the column labels of the assembled frame as the cell output.
df.columns

Index(['num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7', 'num_8',
       'num_9', 'num_10',
       ...
       'b44_count', 'b45_count', 'b46_count', 'b47_count', 'b48_count',
       'b49_count', 'total_sum', 'mean_num', 'even_count', 'odd_count'],
      dtype='object', length=108)

Penyetelan pada Kolom ball_draws dan special_num untuk Diposisikan di Bagian Paling Kiri

In [32]:
# Put the raw draw string and the special number first; every other column
# keeps its current relative order behind them.
cols_to_move = ['ball_draws', 'special_num']

new_order = list(cols_to_move)
for column in df.columns.tolist():
    if column not in cols_to_move:
        new_order.append(column)

df = df[new_order]

Export to CSV

In [33]:
# Write the final table to disk; with the default settings the date index is
# written as the first CSV column.
output_path = 'lottery_hk(3).csv'
df.to_csv(output_path)