In [35]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from itertools import combinations
import json
import requests
import math
from typing import Optional, List, Dict
from operator import itemgetter

from pydantic import BaseModel
from typing import Mapping
import numpy as np

In [36]:
url = 'https://resultados.intralot.com.pe/i.do?m=historico&t=0&s=41'
response = requests.get('https://resultados.intralot.com.pe/i.do?m=historico&t=0&s=41', verify=False)
soup = BeautifulSoup(response.text, 'html.parser')

historico = soup.find(id = 'historico')
rows = historico.find_all('tr')

lst = []
for row in rows:
  cols = [data.text.strip() for data in row.findAll('td')]
  lst.append(cols)
with open('./tinkache.json', 'w') as file: 
  json.dump(lst, file)

dfRows = pd.DataFrame(lst, columns=['fecha', 'sorteo', 'bolillas', 'yapa', 'adicionales', 'sorteo_extra'])
dfRows.head(5)



Unnamed: 0,fecha,sorteo,bolillas,yapa,adicionales,sorteo_extra
0,17/09/2023,1021,36 05 47 46 07 08,14,02 01 18,Promoción Sí o Sí
1,13/09/2023,1020,42 28 41 48 24 19,11,32 10,Promoción Sí o Sí
2,10/09/2023,1019,34 41 47 07 20 03,16,22,Promoción Sí o Sí
3,06/09/2023,1018,28 29 46 27 48 06,8,47 14,Promoción Sí o Sí
4,03/09/2023,1017,09 24 38 33 29 46,26,34 15 42,Promoción Sí o Sí


In [42]:
PRIME_NUMBERS = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59]
def isoFecha (row):
  dt_fecha = datetime.strptime(row['fecha'], "%d/%m/%Y")
  return dt_fecha.strftime("%Y-%m-%d")

def bolillasToArray (row):
  result = row['bolillas'].split(' ')
  return sorted(
    [str(r) for r in result]
  )

def str_to_int_bolillas (row):
  return [int(s) for s in row['sorted_bolillas']]

def bolillasPosition (row, index):
  return row['sorted_bolillas'][index]

def cond_range_b1 (row):
  return int(row['b1'] <= '09')

def cond_range_b2 (row):
  return int(row['b2'] <= '20')

def cond_range_b3 (row):
  return int(row['b3'] <= '30')

def cond_range_b4 (row):
  return int(row['b4'] > '20')

def cond_range_b5 (row):
  return int(row['b5'] > '30')

def cond_range_b6 (row):
  return int(row['b6'] >= '35')

def cond_sum_consecutives (row):
  consecutive_count = 0
  bs = row['int_sorted_bolillas']
  for i in range(5):
    if (bs[i + 1] - bs[i] == 1):
      consecutive_count += 1
  # * Si no hay consecutivos, +1. Si hay 1 consecutivo, 0. Si hay mas de 1 consecutivo, restar uno y retornar el negativo
  return -(consecutive_count - 1)

def cond_has_symetrics (row):
  symetric_count = 0
  bs = row['int_sorted_bolillas']
  for i in range(4):
    left = bs[i]
    center = bs[i + 1]
    right = bs[i + 2]
    if (center - left == right - center):
      symetric_count += 1
  return -(symetric_count - 1)

def symetric_factors (row):
  symetric_factors = []
  bs = row['int_sorted_bolillas']
  for i in range(4):
    left = bs[i]
    center = bs[i + 1]
    right = bs[i + 2]
    if (center - left == right - center):
      symetric_factors.append(center - left)
  return symetric_factors

def cond_has_prime (row):
  bs = row['int_sorted_bolillas']
  for i in range(6):
    if (bs[i] in PRIME_NUMBERS):
      return 1
  return 0

def primes_count (row):
  counter = 0
  bs = row['int_sorted_bolillas']
  for i in range(6):
    if (bs[i] in PRIME_NUMBERS):
      counter += 1
  return counter

def rank_scale (row, index, scale):
  b = row['int_sorted_bolillas'][index]
  return math.trunc(b/scale) + 1

def distance_factor (row, index):
  first = row['int_sorted_bolillas'][index]
  second = row['int_sorted_bolillas'][index + 1]
  return second - first

def distance_factor_past (row, index):
  if (row.name + 1 in dfRows.index):
    first = row['int_sorted_bolillas'][index]
    past = dfRows.iloc[row.name + 1]['int_sorted_bolillas'][index]
    return first - past
  else:
    return -99999

def get_post_bolilla (row, col):
  if (row.name -1 in dfRows.index):
    return dfRows.iloc[row.name - 1][col]

def cr_num_counter (row, target_char: str):
  bs = str(int(row['b1'])) + str(int(row['b2'])) + str(int(row['b3'])) + str(int(row['b4'])) + str(int(row['b5'])) + str(int(row['b6']))
  counter = 0
  for _char in bs:
    if (_char == target_char):
      counter += 1
  return counter

dfRows['iso_fecha'] = dfRows.apply (lambda row: isoFecha(row), axis=1)
dfRows['sorted_bolillas'] = dfRows.apply (lambda row: bolillasToArray(row), axis=1)
dfRows['int_sorted_bolillas'] = dfRows.apply (lambda row: str_to_int_bolillas(row), axis=1)

dfRows['b1'] = dfRows.apply (lambda row: bolillasPosition(row, 0), axis=1)
dfRows['b2'] = dfRows.apply (lambda row: bolillasPosition(row, 1), axis=1)
dfRows['b3'] = dfRows.apply (lambda row: bolillasPosition(row, 2), axis=1)
dfRows['b4'] = dfRows.apply (lambda row: bolillasPosition(row, 3), axis=1)
dfRows['b5'] = dfRows.apply (lambda row: bolillasPosition(row, 4), axis=1)
dfRows['b6'] = dfRows.apply (lambda row: bolillasPosition(row, 5), axis=1)

dfRows['crb1'] = dfRows.apply (lambda row: cond_range_b1(row), axis=1)
dfRows['crb2'] = dfRows.apply (lambda row: cond_range_b2(row), axis=1)
dfRows['crb3'] = dfRows.apply (lambda row: cond_range_b3(row), axis=1)
dfRows['crb4'] = dfRows.apply (lambda row: cond_range_b4(row), axis=1)
dfRows['crb5'] = dfRows.apply (lambda row: cond_range_b5(row), axis=1)
dfRows['crb6'] = dfRows.apply (lambda row: cond_range_b6(row), axis=1)

dfRows['cr_consecutivos'] = dfRows.apply (lambda row: cond_sum_consecutives(row), axis=1)
dfRows['cr_nprimos'] = dfRows.apply (lambda row: cond_has_prime(row), axis=1)
dfRows['cr_symetric'] = dfRows.apply (lambda row: cond_has_symetrics(row), axis=1)
dfRows['symetric_factors'] = dfRows.apply (lambda row: symetric_factors(row), axis=1)
dfRows['primes_count'] = dfRows.apply (lambda row: primes_count(row), axis=1)

dfRows['cr_bolillas'] = dfRows['crb1'] + dfRows['crb2'] + dfRows['crb3'] + dfRows['crb4'] + dfRows['crb5'] + dfRows['crb6']
dfRows['cr_aditional'] = dfRows['cr_consecutivos'] + dfRows['cr_nprimos'] + dfRows['cr_symetric']
dfRows['cr_total'] = dfRows['cr_bolillas'] + dfRows['cr_aditional']

dfRows['post_b1'] = dfRows.apply (lambda row: get_post_bolilla(row, 'b1'), axis=1)
dfRows['post_b2'] = dfRows.apply (lambda row: get_post_bolilla(row, 'b2'), axis=1)
dfRows['post_b3'] = dfRows.apply (lambda row: get_post_bolilla(row, 'b3'), axis=1)
dfRows['post_b4'] = dfRows.apply (lambda row: get_post_bolilla(row, 'b4'), axis=1)
dfRows['post_b5'] = dfRows.apply (lambda row: get_post_bolilla(row, 'b5'), axis=1)
dfRows['post_b6'] = dfRows.apply (lambda row: get_post_bolilla(row, 'b6'), axis=1)
# * Esto ya lo tengo en el generador
# dfRows['cr_1_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '1'), axis=1)
# dfRows['cr_2_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '2'), axis=1)
# dfRows['cr_3_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '3'), axis=1)
# dfRows['cr_4_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '4'), axis=1)
# dfRows['cr_5_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '5'), axis=1)
# dfRows['cr_6_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '6'), axis=1)
# dfRows['cr_7_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '7'), axis=1)
# dfRows['cr_8_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '8'), axis=1)
# dfRows['cr_9_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '9'), axis=1)
# dfRows['cr_0_counter'] = dfRows.apply (lambda row: cr_num_counter(row, '0'), axis=1)
# dfRows['cr_1234567890_counter'] = dfRows['cr_1_counter'] + dfRows['cr_2_counter'] \
#                                 + dfRows['cr_3_counter'] + dfRows['cr_4_counter'] + dfRows['cr_5_counter'] \
#                                 + dfRows['cr_6_counter'] + dfRows['cr_7_counter'] + dfRows['cr_8_counter'] + dfRows['cr_9_counter'] + dfRows['cr_0_counter']

# * Incluir la diferencia entre bolillas
# dfRows['distance_b1_b2'] = dfRows.apply (lambda row: distance_factor(row, 0), axis=1)
# dfRows['distance_b2_b3'] = dfRows.apply (lambda row: distance_factor(row, 1), axis=1)
# dfRows['distance_b3_b4'] = dfRows.apply (lambda row: distance_factor(row, 2), axis=1)
# dfRows['distance_b4_b5'] = dfRows.apply (lambda row: distance_factor(row, 3), axis=1)
# dfRows['distance_b5_b6'] = dfRows.apply (lambda row: distance_factor(row, 4), axis=1)

# * Incluir la bolilla de la jugada anterior
# dfRows['distance_b1_pastb1'] = dfRows.apply (lambda row: distance_factor_past(row, 0), axis=1)
# dfRows['distance_b2_pastb2'] = dfRows.apply (lambda row: distance_factor_past(row, 1), axis=1)
# dfRows['distance_b3_pastb3'] = dfRows.apply (lambda row: distance_factor_past(row, 2), axis=1)
# dfRows['distance_b4_pastb4'] = dfRows.apply (lambda row: distance_factor_past(row, 3), axis=1)
# dfRows['distance_b5_pastb5'] = dfRows.apply (lambda row: distance_factor_past(row, 4), axis=1)
# dfRows['distance_b6_pastb6'] = dfRows.apply (lambda row: distance_factor_past(row, 4), axis=1)

# * Incluir el rankeo de 5 (puede servir para auto-ml)
# dfRows['scale_rank5_b1'] = dfRows.apply (lambda row: rank_scale(row, index=0, scale=5), axis=1)
# dfRows['scale_rank5_b2'] = dfRows.apply (lambda row: rank_scale(row, index=1, scale=5), axis=1)
# dfRows['scale_rank5_b3'] = dfRows.apply (lambda row: rank_scale(row, index=2, scale=5), axis=1)
# dfRows['scale_rank5_b4'] = dfRows.apply (lambda row: rank_scale(row, index=3, scale=5), axis=1)
# dfRows['scale_rank5_b5'] = dfRows.apply (lambda row: rank_scale(row, index=4, scale=5), axis=1)
# dfRows['scale_rank5_b6'] = dfRows.apply (lambda row: rank_scale(row, index=5, scale=5), axis=1)


dfRows.to_excel('df_rows.xlsx')
dfRows.head(10)
# ! El sgte sera b1> 1 o 2
# ! El sgte sera b2> Sera <18
# ! El sgte sera b3> ?
# ! El sgte sera b4> ?
# ! El sgte sera b5> Normal de 35 - 44
# ! El sgte sera b6> Normal de 35 - 45

# * Sin simetria, sin consecutivos y al menos 1 primo. crtotal = 9

Unnamed: 0,fecha,sorteo,bolillas,yapa,adicionales,sorteo_extra,iso_fecha,sorted_bolillas,int_sorted_bolillas,b1,...,post_bolilla_b3,post_bolilla_b4,post_bolilla_b5,post_bolilla_b6,post_b1,post_b2,post_b3,post_b4,post_b5,post_b6
0,17/09/2023,1021,36 05 47 46 07 08,14,02 01 18,Promoción Sí o Sí,2023-09-17,"[05, 07, 08, 36, 46, 47]","[5, 7, 8, 36, 46, 47]",5,...,,,,,,,,,,
1,13/09/2023,1020,42 28 41 48 24 19,11,32 10,Promoción Sí o Sí,2023-09-13,"[19, 24, 28, 41, 42, 48]","[19, 24, 28, 41, 42, 48]",19,...,8.0,36.0,46.0,47.0,5.0,7.0,8.0,36.0,46.0,47.0
2,10/09/2023,1019,34 41 47 07 20 03,16,22,Promoción Sí o Sí,2023-09-10,"[03, 07, 20, 34, 41, 47]","[3, 7, 20, 34, 41, 47]",3,...,28.0,41.0,42.0,48.0,19.0,24.0,28.0,41.0,42.0,48.0
3,06/09/2023,1018,28 29 46 27 48 06,8,47 14,Promoción Sí o Sí,2023-09-06,"[06, 27, 28, 29, 46, 48]","[6, 27, 28, 29, 46, 48]",6,...,20.0,34.0,41.0,47.0,3.0,7.0,20.0,34.0,41.0,47.0
4,03/09/2023,1017,09 24 38 33 29 46,26,34 15 42,Promoción Sí o Sí,2023-09-03,"[09, 24, 29, 33, 38, 46]","[9, 24, 29, 33, 38, 46]",9,...,28.0,29.0,46.0,48.0,6.0,27.0,28.0,29.0,46.0,48.0
5,30/08/2023,1016,01 33 28 25 12 44,15,07 10,Promoción Sí o Sí,2023-08-30,"[01, 12, 25, 28, 33, 44]","[1, 12, 25, 28, 33, 44]",1,...,29.0,33.0,38.0,46.0,9.0,24.0,29.0,33.0,38.0,46.0
6,27/08/2023,1015,20 22 38 28 16 14,36,34 43 08,Promoción Sí o Sí,2023-08-27,"[14, 16, 20, 22, 28, 38]","[14, 16, 20, 22, 28, 38]",14,...,25.0,28.0,33.0,44.0,1.0,12.0,25.0,28.0,33.0,44.0
7,23/08/2023,1014,32 43 38 25 24 11,20,34 06,Promoción Sí o Sí,2023-08-23,"[11, 24, 25, 32, 38, 43]","[11, 24, 25, 32, 38, 43]",11,...,20.0,22.0,28.0,38.0,14.0,16.0,20.0,22.0,28.0,38.0
8,20/08/2023,1013,43 25 27 34 06 07,31,09 35,Promoción Sí o Sí,2023-08-20,"[06, 07, 25, 27, 34, 43]","[6, 7, 25, 27, 34, 43]",6,...,25.0,32.0,38.0,43.0,11.0,24.0,25.0,32.0,38.0,43.0
9,16/08/2023,1012,37 19 13 26 45 02,32,25 33 38,Promoción Sí o Sí,2023-08-16,"[02, 13, 19, 26, 37, 45]","[2, 13, 19, 26, 37, 45]",2,...,25.0,27.0,34.0,43.0,6.0,7.0,25.0,27.0,34.0,43.0


In [38]:
class GBDF:
  bdf_count: pd.DataFrame
  bdf_stats: pd.Series

  def __init__(self, bdf_count: pd.DataFrame, bdf_stats: pd.Series) -> None:
    self.bdf_count = bdf_count
    self.bdf_stats = bdf_stats

def percentile(n):
  def percentile_(x):
    return x.quantile(n)
  percentile_.__name__ = 'percentile_{:02.0f}'.format(n*100)
  return percentile_

def get_type_class (row):
  if (row['is_class_A'] == True):
    return 'A'
  if (row['is_class_B'] == True):
    return 'B'
  if (row['is_class_C'] == True):
    return 'C'
  if (row['is_class_D'] == True):
    return 'D'

  return 'NONE'

def generate_bdf_count (b_col: str, bdf:pd.DataFrame, total_rows = 1000, save = False):
  bdf_count = bdf.groupby([b_col]).agg(b_count=pd.NamedAgg(column=b_col, aggfunc="count")).reset_index()
  bdf_stats = bdf_count['b_count'].agg([np.sum, np.mean, np.std, np.median,
                      np.var, np.min, np.max, percentile(0.25), percentile(0.5), percentile(0.75)])
  # * class_A = 100 - percentile_75
  # * class_B = 75 - percentile_50
  # * class_C = 50 - percentile_25
  # * class_D = 25 - percentile_0

  bdf_count.sort_values('b_count', ascending=False, inplace=True)
  bdf_count.reset_index(drop=True, inplace=True)
  
  bdf_count['porc_real'] = (bdf_count[['b_count']] / total_rows) * 100
  bdf_count['is_class_A'] = bdf_count['b_count'].between(bdf_stats['percentile_75'], bdf_stats['max']+1, inclusive='left')
  bdf_count['is_class_B'] = bdf_count['b_count'].between(bdf_stats['percentile_50'], bdf_stats['percentile_75'], inclusive='left')
  bdf_count['is_class_C'] = bdf_count['b_count'].between(bdf_stats['percentile_25'], bdf_stats['percentile_50'], inclusive='left')
  bdf_count['is_class_D'] = bdf_count['b_count'].between(bdf_stats['min'], bdf_stats['percentile_25'], inclusive='left')
  bdf_count['type_class'] = bdf_count.apply (lambda row: get_type_class(row), axis=1)
  bdf_count.drop(['is_class_A', 'is_class_B', 'is_class_C', 'is_class_D'], axis=1, inplace=True)
  if save:
    bdf_count.to_excel(f'bdf_count_{b_col}.xlsx')

  return GBDF(bdf_count=bdf_count, bdf_stats=bdf_stats)




def generate_bdfcount_store (bdf:pd.DataFrame, total_rows = 1000, save = False):
  # bdf = dfRows[['iso_fecha', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6']]
  cols = ['b1', 'b2', 'b3', 'b4', 'b5', 'b6']
  result : Dict[str, GBDF] = {}
  for col in cols:
    result[col] = generate_bdf_count(b_col=col, bdf=bdf, total_rows=total_rows, save=save)
  return result


In [39]:

def check_porc (b_col: str, df:pd.DataFrame, total_rows = 1000, start_from = 0, save = False):
  df['post_bolilla'] = df[b_col].shift(1)
  bdf = df[start_from:(total_rows+start_from)]

  display(bdf)

  store_bdf = generate_bdfcount_store(bdf=bdf, total_rows=total_rows, save=save)  
  bdf_count = store_bdf[b_col].bdf_count

  if start_from > 0:
    curent_row = df.iloc[start_from]
    curent_row_bolilla = curent_row[b_col]
    display(bdf_count[bdf_count[b_col] == curent_row_bolilla])

    next_row = df.iloc[start_from - 1]
    next_row_bolilla = next_row[b_col]
    display(bdf_count[bdf_count[b_col] == next_row_bolilla])

    # display(bdf[bdf[b_col] == curent_row_bolilla])
    # display(bdf[bdf[b_col] == curent_row_bolilla].index -1)
    # display(bdf.iloc[bdf[bdf[b_col] == curent_row_bolilla].index -1])
  else:
    print("Post bolillas")
    curent_row = df.iloc[start_from]
    curent_row_bolilla = curent_row[b_col]
    print(curent_row_bolilla)
    bdf_post_bolillas = bdf[bdf[b_col] == curent_row_bolilla][1:]
    bdf_count_post_bolillas = bdf_post_bolillas.groupby(['post_bolilla']).agg(b_count=pd.NamedAgg(column=b_col, aggfunc="count")).reset_index()
    
    bdf_count_post_bolillas.sort_values('b_count', ascending=False, inplace=True)
    bdf_count_post_bolillas.reset_index(drop=True, inplace=True)
    # print("bdf_count_post_bolillas")
    # display(bdf_count_post_bolillas)

    # result = pd.concat([bdf_count_post_bolillas, bdf_count], axis=1, join="inner")
    result = bdf_count_post_bolillas.merge(bdf_count, left_on='post_bolilla', right_on=b_col, how='inner')
    display(result[['post_bolilla','b_count_x','type_class']])

    display(result[['b_count_x','type_class']].agg({'b_count_x': 'sum'}))

    # print("bdf_count")
    # display(bdf_count)'})


  display(bdf_count)
  display(store_bdf[b_col].bdf_stats)

check_porc(b_col='b2', df=dfRows[['iso_fecha', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6']], save=False, start_from=0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['post_bolilla'] = df[b_col].shift(1)


Unnamed: 0,iso_fecha,b1,b2,b3,b4,b5,b6,post_bolilla
0,2023-09-17,05,07,08,36,46,47,
1,2023-09-13,19,24,28,41,42,48,07
2,2023-09-10,03,07,20,34,41,47,24
3,2023-09-06,06,27,28,29,46,48,07
4,2023-09-03,09,24,29,33,38,46,27
...,...,...,...,...,...,...,...,...
995,2013-11-10,17,18,21,23,30,34,09
996,2013-11-06,02,25,26,33,37,38,18
997,2013-11-03,08,10,19,30,31,41,25
998,2013-10-30,03,16,31,39,40,41,10


Post bolillas
07


Unnamed: 0,post_bolilla,b_count_x,type_class
0,10,5,A
1,7,4,B
2,24,3,C
3,21,3,C
4,5,3,A
5,3,2,B
6,13,2,A
7,20,2,C
8,16,2,A
9,14,2,A


b_count_x    40
dtype: int64

Unnamed: 0,b2,b_count,porc_real,type_class
0,8,62,6.2,A
1,13,58,5.8,A
2,9,56,5.6,A
3,11,53,5.3,A
4,10,52,5.2,A
5,15,51,5.1,A
6,16,50,5.0,A
7,5,47,4.7,A
8,14,46,4.6,A
9,12,46,4.6,A


sum              1000.000000
mean               29.411765
std                19.545003
median             30.500000
var               382.007130
min                 1.000000
max                62.000000
percentile_25      13.000000
percentile_50      30.500000
percentile_75      46.000000
Name: b_count, dtype: float64