### Connect to Elasticsearch

In [3]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

print(es.ping())

True


### Create an Index

#### What is an Index in Elasticsearch?
An index in Elasticsearch is similar to a database in a relational database system. It is a collection of documents that share similar characteristics. Each document is stored as a JSON object and has a unique identifier.

In [4]:
index_name = "test_index"

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

Index 'test_index' created.


In [5]:
doc = {
    "name": "John Doe",
    "age": 30,
    "occupation": "Software Engineer",
    "location": "San Francisco"
}

res = es.index(index=index_name, id=1, document=doc)
print(res)

{'_index': 'test_index', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


In [42]:
query = {
    "query": {
        "match": {
            "occupation": "Software Engineer"
        }
    }
}

res = es.search(index=index_name, body=query)
print(res)


{'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}}


In [43]:
update_query = {
    "doc": {
        "age": 31
    }
}

es.update(index=index_name, id=1, body=update_query)

ObjectApiResponse({'_index': 'test_index', '_id': '1', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

In [44]:
es.delete(index=index_name, id=1)


ObjectApiResponse({'_index': 'test_index', '_id': '1', '_version': 3, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1})

In [6]:
aggregation_query = {
    "size": 0,
    "aggs": {
        "average_age": {
            "avg": {
                "field": "age"
            }
        }
    }
}

res = es.search(index=index_name, body=aggregation_query)
print(res)

{'took': 10, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': None, 'hits': []}, 'aggregations': {'average_age': {'value': 30.0}}}


In [7]:
from elasticsearch.helpers import bulk

# Bulk inserting multiple documents
actions = [
    {"_index": index_name, "_id": i, "_source": {"name": f"User {i}", "age": 25 + i, "occupation": "Developer"}}
    for i in range(2, 10)
]

bulk(es, actions)


(8, [])

### What is a Shard in Elasticsearch?
A shard is a subdivision of an index. Each index is split into multiple shards for better performance and fault tolerance.

Primary Shards: Store the actual data.
Replica Shards: Backup copies of primary shards for redundancy.
Each shard is an independent Lucene index that can be stored on a different node in a cluster.

Example: How Shards Improve Performance
Imagine we have 1 million documents in an index. Instead of storing all of them in a single server, we can distribute them across multiple shards, which can then be placed on different servers (nodes).

Creating an Index with Shards and Replicas

In [8]:
sharded_mapping = {
    "settings": {
        "number_of_shards": 3,  # Multiple shards for parallel processing
        "number_of_replicas": 1
    },
}

if es.indices.exists(index='sharded_index'):
    es.indices.delete(index='sharded_index')  # Delete existing index to avoid conflicts
    print(f"Deleted existing index: sharded_index")

es.indices.create(index='sharded_index', body=sharded_mapping)
print(f"Created index: sharded_index")


Created index: sharded_index


Creating an Index with no Shards or Replicas

In [9]:
unsharded_mapping = {
    "settings": {
        "number_of_shards": 1,  # Multiple shards for parallel processing
        "number_of_replicas": 1
    },
}

if es.indices.exists(index='unsharded_index'):
    es.indices.delete(index='unsharded_index')  # Delete existing index to avoid conflicts
    print(f"Deleted existing index: unsharded_index")

es.indices.create(index='unsharded_index', body=unsharded_mapping)
print(f"Created index: unsharded_index")


Created index: unsharded_index


In [10]:
import pandas as pd
from elasticsearch.helpers import bulk

df = pd.read_csv("crime_data.csv")
df.fillna("", inplace=True)

actions_unsharded_index = [
    {
        "_index": 'unsharded_index',
        "_id": row["DR_NO"],
        "_source": row.to_dict()
    }
    for _, row in df.iterrows()
]

actions_sharded_index = [
    {
        "_index": 'sharded_index',
        "_id": row["DR_NO"],
        "_source": row.to_dict()
    }
    for _, row in df.iterrows()
]

success, failed = bulk(es, actions_unsharded_index)

  df = pd.read_csv("crime_data.csv")
  df.fillna("", inplace=True)


BulkIndexError: 4 document(s) failed to index.

In [None]:
success, failed = bulk(es, actions_sharded_index)

### Mapping
Mapping is the process of defining how documents and their fields are stored and indexed in Elasticsearch.

It defines data types (e.g., text, keyword, date) and specifies how data should be interpreted.

Mapping is crucial for accurate search results, performance optimization, and data integrity.

#### Dynamic Mapping
Dynamic mapping allows Elasticsearch to automatically detect and assign data types for new fields.

This is useful for rapidly changing or unpredictable data structures.

Example :

In [11]:
doc = {
    "title": "Learning Elasticsearch",
    "pages": 320,
    "price": 29.99,
    "published": "2024-03-01"
}

# Index the document
es.index(index='books', id=1, document=doc)

# View the generated mapping
mapping = es.indices.get_mapping(index='books')
print("Dynamic Mapping Result:", mapping)

Dynamic Mapping Result: {'books': {'mappings': {'properties': {'pages': {'type': 'long'}, 'price': {'type': 'float'}, 'published': {'type': 'date'}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}


#### Explicit Mapping
Explicit mapping gives you greater control by specifying field types and properties upfront.

This approach helps prevent incorrect data types and ensures consistent data handling.

In [None]:
# Define explicit mapping
explicit_mapping = {
    "mappings": {
        "properties": {
            "title": { "type": "text" },
            "author": { "type": "keyword" },
            "published": { "type": "date", "format": "yyyy-MM-dd" },
            "price": { "type": "float" },
            "tags": { "type": "keyword" }
        }
    }
}

# Create index with custom mapping
es.indices.create(index='library', body=explicit_mapping)

# Index a sample document
es.index(index='library', id=1, document={
    "title": "Mastering Elasticsearch",
    "author": "John Doe",
    "published": "2024-03-01",
    "price": 49.99,
    "tags": ["search", "elasticsearch", "big data"]
})


# Text Analysis

Elasticsearch provides powerful text analysis using analyzers, which process text for indexing and searching. It involves:

 - Character filter: Mainly used to strip off some unused characters or change some characters.
 - Tokenizer: Breaks a text into individual tokens(or words) and it does that based on certain factors(whitespace, ngram etc).
 - Token filter: It receives the tokens and then apply some filters(example changing uppercase terms to lowercase).

In [None]:


query = {
  "analyzer": "standard",
  "text": "Running quickly through the fields."
}
res = es.indices.analyze(body=query)
tokens = list(map(lambda n: n["token"], res["tokens"]))
print(tokens)





In [None]:
# Define custom analyzer
new_analyser = {
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": { 
          "char_filter": ["emoticons"],
          "tokenizer": "punctuation",
          "filter": ["lowercase", "english_stop"]
        }
      },
      "tokenizer": {
        "punctuation": { 
          "type": "pattern",
          "pattern": "[\\s.,!?]+"  
        }
      },
      "char_filter": {
        "emoticons": { 
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": { 
          "type": "stop",
          "stopwords": "_english_"
        }  #It removes common English stop words (e.g., "the", "and", "is", "of", etc.). After tokenisation
      }
    }
  }
}

# Delete index if it exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Create index with custom analyzer
es.indices.create(index=index_name, body=new_analyser)
print(f"Index '{index_name}' created successfully!")

query_new_analyser = {
  "analyzer": "my_custom_analyzer",
  "text": "I'm a :) person, and you?"
}

# Apply text analysis
res = es.indices.analyze(index=index_name, body=query_new_analyser)

# Extract and print tokens
tokens = [token["token"] for token in res["tokens"]]
print(tokens)


# Query DSL
Le Query DSL d'Elasticsearch (Domain Specific Language) est un langage puissant et flexible utilisé pour effectuer des requêtes de recherche. Il permet de rechercher, de filtrer et de trier les données dans Elasticsearch de manière structurée. 

In [13]:
from datetime import datetime
import random

# 1. CRÉATION D'UN JEU DE DONNÉES SIMPLE
# =======================================

def create_simple_dataset(n=20):
    """Crée un dataset simple d'articles pour Elasticsearch"""
    
    # Listes pour générer des données aléatoires
    topics = ["Elasticsearch", "Python", "Data Science", "NoSQL", "Database"]
    tags = ["search", "database", "python", "tutorial", "analytics"]
    statuses = ["published", "draft", "archived"]
    
    articles = []
    
    for i in range(n):
        # Générer un titre
        primary_topic = random.choice(topics)
        title = f"Guide {primary_topic} pour débutants"
        
        # Générer des tags (2-3 tags par article)
        num_tags = random.randint(2, 3)
        article_tags = random.sample(tags, num_tags)
        
        # Générer une date
        year = random.randint(2020, 2024)
        month = random.randint(1, 12)
        day = random.randint(1, 28)
        date = f"{year}-{month:02d}-{day:02d}"
        
        # Générer un contenu court
        content = f"Cet article explique les bases de {primary_topic}. "
        content += f"Vous apprendrez comment utiliser {primary_topic} pour l'analyse de données. "
        content += f"Des exemples en code sont fournis pour illustrer les concepts."
        
        # Créer l'article
        article = {
            "id": i + 1,
            "title": title,
            "content": content,
            "author": f"author_{random.randint(1, 5)}",
            "date": date,
            "tags": article_tags,
            "rating": round(random.uniform(1.0, 5.0), 1),
            "status": random.choice(statuses),
            "views": random.randint(100, 1000)
        }
        
        articles.append(article)
    
    return pd.DataFrame(articles)

In [25]:
articles_df = create_simple_dataset(20)
print("\nJeu de données d'articles (5 premiers):")
print(articles_df[["id", "title", "tags", "rating", "status"]].head())


Jeu de données d'articles (5 premiers):
   id                               title                   tags  rating  \
0   1   Guide Data Science pour débutants     [database, python]     3.7   
1   2          Guide NoSQL pour débutants     [search, tutorial]     3.0   
2   3   Guide Data Science pour débutants   [tutorial, database]     3.9   
3   4          Guide NoSQL pour débutants       [search, python]     2.4   
4   5  Guide Elasticsearch pour débutants  [database, analytics]     1.5   

      status  
0   archived  
1  published  
2  published  
3   archived  
4   archived  


In [26]:
# 2. INDEXATION DANS ELASTICSEARCH
# ================================

def setup_elasticsearch_index(articles_df):
    """Crée l'index et indexe les articles"""
    
    # Mapping pour l'index des articles
    articles_mapping = {
        "mappings": {
            "properties": {
                "id": {"type": "integer"},
                "title": {"type": "text", "analyzer": "standard"},
                "content": {"type": "text", "analyzer": "standard"},
                "author": {"type": "keyword"},
                "date": {"type": "date", "format": "yyyy-MM-dd"},
                "tags": {"type": "keyword"},
                "rating": {"type": "float"},
                "status": {"type": "keyword"},
                "views": {"type": "integer"}
            }
        }
    }
    
    # Supprimer l'index s'il existe
    if es.indices.exists(index="articles"):
        es.indices.delete(index="articles")
    
    # Créer l'index avec le mapping
    es.indices.create(index="articles", body=articles_mapping)
    
    # Indexer les articles
    print("Indexation des articles...")
    for _, article in articles_df.iterrows():
        es.index(index="articles", id=article["id"], document=article.to_dict())
    
    # Rafraîchir l'index
    es.indices.refresh(index="articles")
    
    print(f"Indexation terminée ! {len(articles_df)} articles indexés.")

In [27]:
setup_elasticsearch_index(articles_df)

Indexation des articles...
Indexation terminée ! 20 articles indexés.


Dans le Query DSL d'Elasticsearch, les clauses must et should sont utilisées dans les requêtes booléennes (bool)

- Must: La condition spécifiée dans must doit être satisfaite pour que le document soit retourné dans les résultats de recherche. Les documents qui ne remplissent pas cette condition seront exclus des résultats.
- Should: Les conditions dans should sont facultatives, mais elles favorisent un document si elles sont remplies. Si un document ne remplit aucune des conditions should, il peut quand même être retourné si la condition must est remplie.

On peux aussi utiliser les deux clauses dans une même requête, ce qui permet de combiner des conditions obligatoires (must) et des conditions facultatives (should). Ce type de requête est très utilisé pour effectuer des recherches plus complexes, où certaines conditions sont essentielles (via must), tandis que d'autres conditions sont facultatives mais améliorent le classement des résultats (via should).

In [30]:
# 3. REQUÊTES COMPOSÉES (BOOL QUERY)
# ==================================

def bool_query_example():
    """Exemple de requête booléenne composée"""
    query = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"content": "elasticsearch"}}
                ],
                "should": [
                    {"match": {"tags": "python"}},
                    {"match": {"tags": "tutorial"}}
                ],
                "must_not": [
                    {"match": {"status": "archived"}}
                ]
            }
        }
    }
    
    results = es.search(index="articles", body=query)
    
    print(f"Résultats de la requête booléenne composée (Bool Query):")
    print(f"Nombre de résultats: {results['hits']['total']['value']}")
    
    for hit in results['hits']['hits']:
        source = hit['_source']
        print(f"\nID: {source['id']}")
        print(f"Titre: {source['title']}")
        print(f"Contenu: {source['content'][:50]}...")
        print(f"Tags: {', '.join(source['tags'])}")
        print(f"Statut: {source['status']}")
        print(f"Note: {source['rating']}")
    
    return results

In [31]:
print("\n--- REQUÊTES COMPOSÉES (BOOL QUERY) ---")
bool_results = bool_query_example()


--- REQUÊTES COMPOSÉES (BOOL QUERY) ---
Résultats de la requête booléenne composée (Bool Query):
Nombre de résultats: 1

ID: 10
Titre: Guide Elasticsearch pour débutants
Contenu: Cet article explique les bases de Elasticsearch. V...
Tags: database, analytics, search
Statut: published
Note: 4.2


In [32]:
# 4. FILTRES
# ==========

def filter_examples():
    """Exemples de filtres"""
    
    # Exemple 1: Filtre par plage de dates
    date_filter = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {"date": {"gte": "2022-01-01", "lte": "2024-12-31"}}}
                ]
            }
        }
    }
    
    # Exemple 2: Filtres multiples combinés
    multi_filter = {
        "query": {
            "bool": {
                "filter": [
                    {"term": {"status": "published"}},
                    {"range": {"rating": {"gte": 4.0}}},
                    {"range": {"views": {"gte": 500}}}
                ]
            }
        }
    }
    
    # Exemple 3: Filtre avec recherche plein texte
    text_with_filter = {
        "query": {
            "bool": {
                "must": {
                    "match": {"content": "python elasticsearch"}
                },
                "filter": [
                    {"terms": {"tags": ["database", "search"]}}
                ]
            }
        }
    }
    
    # Exécution des requêtes
    date_results = es.search(index="articles", body=date_filter)
    multi_results = es.search(index="articles", body=multi_filter)
    text_filter_results = es.search(index="articles", body=text_with_filter)
    
    # Afficher les résultats
    print("\n1. Filtre par plage de dates (articles de 2022 à 2024):")
    print(f"Nombre de résultats: {date_results['hits']['total']['value']}")
    
    print("\n2. Filtres multiples (articles publiés, avec note ≥ 4.0 et vues ≥ 500):")
    print(f"Nombre de résultats: {multi_results['hits']['total']['value']}")
    
    print("\n3. Filtre avec recherche plein texte (python elasticsearch + tags database/search):")
    print(f"Nombre de résultats: {text_filter_results['hits']['total']['value']}")
    
    # Afficher les détails du premier résultat de chaque requête
    if date_results['hits']['hits']:
        source = date_results['hits']['hits'][0]['_source']
        print(f"\nPremier résultat du filtre par date:")
        print(f"Titre: {source['title']}")
        print(f"Date: {source['date']}")
    
    if multi_results['hits']['hits']:
        source = multi_results['hits']['hits'][0]['_source']
        print(f"\nPremier résultat des filtres multiples:")
        print(f"Titre: {source['title']}")
        print(f"Statut: {source['status']}")
        print(f"Note: {source['rating']}")
        print(f"Vues: {source['views']}")
    
    return {
        "date_filter": date_results,
        "multi_filter": multi_results,
        "text_with_filter": text_filter_results
    }

In [34]:
print("\n--- FILTRES ---")
filter_results = filter_examples()


--- FILTRES ---

1. Filtre par plage de dates (articles de 2022 à 2024):
Nombre de résultats: 12

2. Filtres multiples (articles publiés, avec note ≥ 4.0 et vues ≥ 500):
Nombre de résultats: 1

3. Filtre avec recherche plein texte (python elasticsearch + tags database/search):
Nombre de résultats: 4

Premier résultat du filtre par date:
Titre: Guide Data Science pour débutants
Date: 2023-05-20

Premier résultat des filtres multiples:
Titre: Guide Elasticsearch pour débutants
Statut: published
Note: 4.2
Vues: 693
