In [20]:
import pandas as pd
from random import shuffle

# Load both CSV exports and replace missing cells with empty strings.
articles_df = pd.read_csv('./test_data/liste_des_articles.csv').fillna("")
searches_df = pd.read_csv('./test_data/liste_des_recherches.csv').fillna("")

def get_elem_from_serie(line, titles):
  """Return a plain dict holding only the entries of `line` named in `titles`.

  Args:
    line: any object indexable by key (the notebook passes DataFrame rows).
    titles: iterable of keys to copy; the result keeps this order.

  Returns:
    A new dict mapping each key in `titles` to `line[key]`.
  """
  return {key: line[key] for key in titles}

In [21]:

# Columns kept for every article record.
article_titles = ["id", "name", "description"]
# One plain dict per row of the articles frame.
articles_list = [
  get_elem_from_serie(row, article_titles)
  for _, row in articles_df.iterrows()
]

In [26]:

# Columns kept for every search record.
searches_titles = [
  "id",
  "search_text",
  "search_date",
  "clicked_article_1",
  "clicked_article_2",
  "clicked_article_3",
]
# One plain dict per search row, skipping rows whose first click slot is empty.
searches_list = [
  get_elem_from_serie(row, searches_titles)
  for _, row in searches_df.iterrows()
  if row["clicked_article_1"]
]
searches_list

[{'id': 91,
  'search_text': 'soja',
  'search_date': '2021-08-10 07:56:00.621380+00:00',
  'clicked_article_1': 'Télévision LG OLED',
  'clicked_article_2': '',
  'clicked_article_3': ''},
 {'id': 83,
  'search_text': 'casque',
  'search_date': '2021-08-10 07:50:37.331859+00:00',
  'clicked_article_1': 'Farine de Soja',
  'clicked_article_2': 'CHARGEUR adaptateur secteur pour Toshiba Satellite A600 A660 A665 A80 C645 C645D C650',
  'clicked_article_3': ''},
 {'id': 82,
  'search_text': 'tv',
  'search_date': '2021-08-10 07:50:23.962617+00:00',
  'clicked_article_1': "TOSHIBA 50UL3B63DG TV LED UHD 4K - 5' (126 cm) - Smart TV - Bluetooth - 4 x HDMI - 2 X USB",
  'clicked_article_2': 'Cable HDMI pour TV',
  'clicked_article_3': 'Meuble TV'},
 {'id': 52,
  'search_text': 'samsung',
  'search_date': '2021-08-09 16:12:06.915893+00:00',
  'clicked_article_1': 'Téléphone Samsung Galaxy Note 20 Ultra 5G',
  'clicked_article_2': '',
  'clicked_article_3': ''},
 {'id': 34,
  'search_text': 'tv',

In [24]:
def default_model(text, articles_list, searches_list):
  """Baseline search: keep the articles whose name contains `text`.

  The match is a plain, case-sensitive substring test on article['name'],
  and the input order is preserved.

  Args:
    text: substring to look for.
    articles_list: iterable of dicts, each with a 'name' entry.
    searches_list: accepted for signature parity with the other model; unused.

  Returns:
    A new list with the matching article dicts (same objects, same order).
  """
  return [article for article in articles_list if text in article['name']]

In [25]:
# Query the baseline model; the bare name on the last line is the cell's displayed value.
search_result = default_model(
  text="TV",
  articles_list=articles_list,
  searches_list=searches_list,
)
search_result

[{'id': 120,
  'name': 'Cable HDMI pour TV',
  'description': 'mini HDMI to HDMI Cable, 6ft (~2m) – Simply NUC'},
 {'id': 121,
  'name': 'Meuble TV',
  'description': 'Meuble TV ADDISON , chêne clair'},
 {'id': 96,
  'name': "TCL 65QLED790 - TV QLED UHD 4K - 65' (165cm) - Dolby",
  'description': "Format d'affichage : 4K UHD (2160p) - Résolution : 3840 x 2160 - Tuner TV numérique : DVB-C, DVB-S2, DVB-T2"},
 {'id': 97,
  'name': "TOSHIBA 50UL3B63DG TV LED UHD 4K - 5' (126 cm) - Smart TV - Bluetooth - 4 x HDMI - 2 X USB",
  'description': "Format d'affichage : 4K UHD (2160p) - Résolution : 3840 x 2160 - Tuner TV numérique : DVB-C, DVB-S, DVB-S2, DVB-T, DVB-T2"},
 {'id': 119,
  'name': 'Télécommande TV',
  'description': 'Télécommande universelle 12 en 1 HDEO Multi fonction DVD,TV,SAT,VCR'}]

In [37]:
def get_article_freq_for_text(text, article, searches_list, click_fields=("clicked_article_1",)):
  """Fraction of the searches for `text` in which `article` was clicked.

  Only searches whose 'search_text' equals `text` exactly are considered.

  Args:
    text: the search string to look up (exact match).
    article: dict with a 'name' entry; compared against the click slots.
    searches_list: iterable of search dicts with 'search_text' and the
      keys listed in `click_fields`.
    click_fields: names of the click slots to inspect. The default looks at
      the first click only; pass all three slot names to count a click in
      any position. Values are compared with plain equality, so an empty
      slot matches an article whose name is the empty string.

  Returns:
    0 when no search matches `text`, otherwise a float in [0, 1].
  """
  searches_with_text = [s for s in searches_list if s["search_text"] == text]
  if not searches_with_text:
    # TODO Instead of returning 0, find the nearest text and use it
    return 0
  # Read the name only once there is something to compare it against.
  name = article["name"]
  clicked = sum(
    1
    for s in searches_with_text
    if any(s[field] == name for field in click_fields)
  )
  return clicked / len(searches_with_text)

def learn_model(text, articles_list, searches_list):
  """Rank articles for `text` by how often identical past searches clicked them.

  Each article is scored with get_article_freq_for_text; the result is sorted
  by descending score, with articles of equal score in random order.

  The inputs are left untouched: the caller's list is not reordered and the
  returned records are shallow copies of the article dicts. A record gets
  'is_concerned' = 1 only when its score for this `text` is positive; any
  'is_concerned' entry already present on the input is not carried over, so
  a flag from an earlier query cannot leak into this one.

  Args:
    text: the search string.
    articles_list: iterable of article dicts (each with a 'name' entry).
    searches_list: iterable of past search dicts.

  Returns:
    A new list of article dicts, best match first.
  """
  scored = []
  for article in articles_list:
    freq = get_article_freq_for_text(text, article, searches_list)
    record = dict(article)  # shallow copy: never write into the caller's dict
    record.pop('is_concerned', None)  # drop a flag left by a previous query
    if freq > 0:
      record['is_concerned'] = 1
    scored.append((freq, record))
  # Shuffle first, then rely on the stable sort: ties end up in random order.
  shuffle(scored)
  scored.sort(key=lambda pair: pair[0], reverse=True)
  return [record for _, record in scored]

In [40]:
# Query the click-frequency model; the bare name on the last line is the cell's displayed value.
l_search_result = learn_model(
  text="tv",
  articles_list=articles_list,
  searches_list=searches_list,
)
l_search_result

[{'id': 97,
  'name': "TOSHIBA 50UL3B63DG TV LED UHD 4K - 5' (126 cm) - Smart TV - Bluetooth - 4 x HDMI - 2 X USB",
  'description': "Format d'affichage : 4K UHD (2160p) - Résolution : 3840 x 2160 - Tuner TV numérique : DVB-C, DVB-S, DVB-S2, DVB-T, DVB-T2",
  'is_concerned': 1},
 {'id': 89,
  'name': 'Réfrigérateur congélateur bas',
  'description': '',
  'is_concerned': 1},
 {'id': 119,
  'name': 'Télécommande TV',
  'description': 'Télécommande universelle 12 en 1 HDEO Multi fonction DVD,TV,SAT,VCR'},
 {'id': 91,
  'name': 'Manette PS4',
  'description': 'Dual Shock 4.0 V2 JetBlack - PlayStation Officiel'},
 {'id': 106, 'name': 'Casque Pionner Noir', 'description': ''},
 {'id': 121,
  'name': 'Meuble TV',
  'description': 'Meuble TV ADDISON , chêne clair'},
 {'id': 117, 'name': 'Ventilateur Design Blanc', 'description': ''},
 {'id': 87, 'name': 'Lave vaisselle', 'description': ''},
 {'id': 118, 'name': 'Ventilateur Design Bois', 'description': ''},
 {'id': 111, 'name': 'Téléphone Xia