# **import**

In [1]:
!pip install elasticsearch



Collecting elasticsearch
  Downloading elasticsearch-8.15.1-py3-none-any.whl.metadata (8.7 kB)
Collecting elastic-transport<9,>=8.13 (from elasticsearch)
  Downloading elastic_transport-8.15.1-py3-none-any.whl.metadata (3.7 kB)
Downloading elasticsearch-8.15.1-py3-none-any.whl (524 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.6/524.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading elastic_transport-8.15.1-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.15.1 elasticsearch-8.15.1


In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def book_parser(line):
  data = {
      'book_id': line['book_id'],
      'title_without_series': line['title_without_series'],
      'book_description': line['description'],
      'publication_year': line['publication_year'],
      'publisher': line['publisher'],
      'ratings_count': line['ratings_count'],
      'book_average_rating': line['average_rating'],
      'cover_page': line['image_url'],
      'book_url': line['url'],
      'is_ebook': line['is_ebook'],
      'num_pages': line['num_pages'],
  }
  return data

In [5]:
df_books = pd.read_csv('/content/drive/MyDrive/FatimaEzz/REC_SYS/goodreads/CSV/bd-book1.csv')

In [6]:
df_books

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,1987.0,"Nelson Doubleday, Inc.",140,4.03,https://images.gr-assets.com/books/1304100136m...,https://www.goodreads.com/book/show/7327624-th...,False,600.0
1,6066819,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,2009.0,Atria Books,51184,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/6066819-be...,False,368.0
2,287141,The Aeneid for Boys and Girls,"Relates in vigorous prose the tale of Aeneas, ...",2006.0,Yesterday's Classics,46,4.13,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/287141.The...,False,162.0
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,"To Kara's astonishment, she discovers that a p...",2009.0,Seven Seas,98,4.22,https://images.gr-assets.com/books/1316637798m...,https://www.goodreads.com/book/show/6066812-al...,False,216.0
4,287149,The Devil's Notebook,"Wisdom, humor, and dark observations by the fo...",2000.0,Feral House,986,3.81,https://images.gr-assets.com/books/1328768789m...,https://www.goodreads.com/book/show/287149.The...,False,147.0
...,...,...,...,...,...,...,...,...,...,...,...
5108,9440162,The Noble Pirates,"A vacation in the Bahamas goes awry, and a wom...",2011.0,Rima Jean,94,3.66,https://images.gr-assets.com/books/1297461805m...,https://www.goodreads.com/book/show/9440162-th...,False,352.0
5109,15015624,Under Shifting Glass,Jess has a secret: a mysterious glass flask sh...,2013.0,Chronicle Books,197,3.61,https://images.gr-assets.com/books/1351683388m...,https://www.goodreads.com/book/show/15015624-u...,False,320.0
5110,26247022,"Battle Rabbits, Vol. 1",Get Ready to Hop Into Battle!\nKokuryuu Kaguya...,2016.0,Seven Seas,25,3.64,https://images.gr-assets.com/books/1467587295m...,https://www.goodreads.com/book/show/26247022-b...,False,180.0
5111,10603389,Captive Heart,Emmalyne Madsen sends a desperate plea heavenw...,2011.0,Covenant Communications,914,3.90,https://images.gr-assets.com/books/1302803291m...,https://www.goodreads.com/book/show/10603389-c...,False,255.0


In [7]:
# Prétraitement des données
def clean_title(title):
    if pd.isna(title):
        return ""
    if not isinstance(title, str):
        return ""  # Assurer que title est une chaîne de caractères
    title = title.lower()  # Convertir en minuscules
    title = re.sub(r'[^\w\s]', '', title)  # Supprimer les caractères spéciaux
    title = re.sub(r'\s+', ' ', title)  # Remplacer les espaces multiples par un espace unique
    return title.strip()

# Nettoyer les titres
df_books['cleaned_title'] = df_books['title_without_series'].apply(clean_title)

# Supprimer les lignes avec des valeurs manquantes dans les colonnes importantes
df_books.dropna(subset=['title_without_series', 'publication_year', 'publisher', 'book_average_rating', 'cover_page', 'book_url'], inplace=True)

# Convertir les colonnes numériques en types appropriés
df_books['book_average_rating'] = pd.to_numeric(df_books['book_average_rating'], errors='coerce').fillna(0)
df_books['publication_year'] = df_books['publication_year'].astype(str)  # Assurer que l'année est en format chaîne de caractères

In [8]:
df_books[['title_without_series']].duplicated().sum()
df_books[['book_average_rating']].duplicated().sum()
df_books.drop_duplicates(subset=['book_average_rating','title_without_series'], keep='first', inplace=True)
df_books.shape


(3544, 12)

In [9]:
from elasticsearch import Elasticsearch, helpers

# Utiliser l'URL publique de ngrok
es = Elasticsearch("https://bcb5-197-146-179-37.ngrok-free.app",  timeout=60,  # Ajustez le timeout selon vos besoins
    max_retries=10,
    retry_on_timeout=True)

# Test de la connexion
try:
    print(es.info())
    print("Connexion réussie à Elasticsearch via ngrok")
except Exception as e:
    print(f"Erreur de connexion : {e}")


  es = Elasticsearch("https://bcb5-197-146-179-37.ngrok-free.app",  timeout=60,  # Ajustez le timeout selon vos besoins


{'name': '98bf70a15ca2', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'T64X5RUUSSuTZwyN7Pi0WA', 'version': {'number': '8.1.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd0925dd6f22e07b935750420a3155db6e5c58381', 'build_date': '2022-03-17T22:01:32.658689558Z', 'build_snapshot': False, 'lucene_version': '9.0.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
Connexion réussie à Elasticsearch via ngrok


In [10]:
# Définir le mapping pour l'index Elasticsearch
index_name = 'books'

# Supprimer l'index s'il existe déjà
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Créer un nouvel index avec le mapping pour les suggestions
mapping = {
    "mappings": {
        "properties": {
            "title_without_series": {
                "type": "completion"
            },
            "publication_year": {
                "type": "keyword"
            },
            "publisher": {
                "type": "keyword"
            },
            "book_average_rating": {
                "type": "float"
            },
            "cover_page": {
                "type": "keyword"
            },
            "book_url": {
                "type": "keyword"
            }
        }
    }
}

# Créer l'index avec le mapping
es.indices.create(index=index_name, body=mapping)

# Vérifier le mapping
print("Vérification du mapping :")
print(es.indices.get_mapping(index=index_name))


Vérification du mapping :
{'books': {'mappings': {'properties': {'book_average_rating': {'type': 'float'}, 'book_url': {'type': 'keyword'}, 'cover_page': {'type': 'keyword'}, 'publication_year': {'type': 'keyword'}, 'publisher': {'type': 'keyword'}, 'title_without_series': {'type': 'completion', 'analyzer': 'simple', 'preserve_separators': True, 'preserve_position_increments': True, 'max_input_length': 50}}}}}


In [11]:
from elasticsearch.helpers import bulk

# Diviser les données en lots de 500 par exemple
batch_size = 500
for i in range(0, len(df_books), batch_size):
    batch_actions = [
        {
            "_index": index_name,
            "_id": row["book_id"],
            "_source": {
                "title_without_series": row["cleaned_title"],  # Utiliser le titre nettoyé pour la suggestion
                "publication_year": row["publication_year"],
                "publisher": row["publisher"],
                "book_average_rating": row["book_average_rating"],
                "cover_page": row["cover_page"],
                "book_url": row["book_url"]
            }
        }
        for _, row in df_books.iloc[i:i + batch_size].iterrows()
    ]

    # Insérer chaque batch dans Elasticsearch
    bulk(es, batch_actions)
    print(f"Batch {i // batch_size + 1} indexé avec succès")

print("Indexation complète réussie")


Batch 1 indexé avec succès
Batch 2 indexé avec succès
Batch 3 indexé avec succès
Batch 4 indexé avec succès
Batch 5 indexé avec succès
Batch 6 indexé avec succès
Batch 7 indexé avec succès
Batch 8 indexé avec succès
Indexation complète réussie


In [14]:
import pandas as pd

# Fonction pour rendre les URLs cliquables
def make_clickable(url):
    return f'<a target="_blank" href="{url}">{url}</a>'

# Fonction pour afficher les images
def show_image(url):
    return f'<img src="{url}" width="60">'

# Fonction d'auto-complétion
def autocomplete_title(user_input):
    # Requête d'auto-complétion à Elasticsearch
    response = es.search(
        index="books",
        body={
            "suggest": {
                "book-title-suggest": {
                    "prefix": user_input,
                    "completion": {
                        "field": "title_without_series",
                        "fuzzy": {
                            "fuzziness": 2
                        }
                    }
                }
            }
        }
    )

    # Extraire les suggestions
    suggestions = response['suggest']['book-title-suggest'][0]['options']

    # Créer un DataFrame pour afficher les résultats
    data = []
    for suggestion in suggestions:
        source = suggestion['_source']
        data.append({
            'Title': source['title_without_series'],
            'Publication Year': source['publication_year'],
            'Publisher': source['publisher'],
            'Average Rating': source['book_average_rating'],
            'Cover Page': source['cover_page'],
            'Book URL': source['book_url']
        })

    # Convertir la liste de données en DataFrame
    df = pd.DataFrame(data)

    # Appliquer le formatage pour rendre les liens cliquables et afficher les images
    styled_df = df.head(5).style.format({
        'Book URL': make_clickable,
        'Cover Page': show_image
    })

    # Cacher l'index
    return styled_df.hide(axis='index')

# Exemple d'utilisation de l'auto-complétion
user_input = input("Commencez à taper le titre du livre : ")
autocomplete_title(user_input)


Commencez à taper le titre du livre : The Aeneid for Boys


Title,Publication Year,Publisher,Average Rating,Cover Page,Book URL
the aeneid for boys and girls,2006.0,Yesterday's Classics,4.13,,https://www.goodreads.com/book/show/287141.The_Aeneid_for_Boys_and_Girls
