In [1]:
import csv
import os
import time
import uuid

import kagglehub
import pandas as pd
from cassandra.cluster import Cluster
from sqlalchemy import create_engine, text
from tabulate import tabulate

### Importer les données

In [2]:
# Fetch the latest version of the Kaggle dataset and keep the local directory it was stored in
dataset_handle = "shivamb/netflix-shows"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /home/zoe/.cache/kagglehub/datasets/shivamb/netflix-shows/versions/5


In [3]:
# Look up the names of the files found in the downloaded dataset directory
import os

files = os.listdir(path)

print(files)

['netflix_titles.csv']


In [4]:
import os

import pandas as pd

# os.listdir() returns entries in arbitrary order, so select the CSV explicitly
# instead of trusting whatever happens to be first in the listing.
csv_files = sorted(name for name in files if name.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {path}")

# Build the path with os.path.join rather than manual string concatenation.
filename = os.path.join(path, csv_files[0])
df = pd.read_csv(filename)

In [5]:
from tabulate import tabulate

# Render the first 10 rows as a psql-style text table, with the column names as headers
preview_table = tabulate(df.head(10), headers='keys', tablefmt='psql')
print(preview_table)


+----+-----------+---------+----------------------------------+-------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------+--------------------+----------------+----------+------------+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|    | show_id   | type    | title                            | director                      | cast                                                                                                                                                

In [6]:
# Data type of each column
column_types = df.dtypes
print(column_types)

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [7]:
# Turn show_id into an integer by dropping its leading 's' prefix, so that exactly
# the same data type can be used for the key in both Cassandra and MySQL.
# Going through astype(str) first keeps the cell re-runnable: once the column is
# already int, the .str accessor alone would raise.
df['show_id'] = df['show_id'].astype(str).str.lstrip('s').astype(int)

In [8]:
# Display the column types again after the show_id conversion
column_types_after = df.dtypes
print(column_types_after)

show_id          int64
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [9]:
# Are there any NaN values? Count them per column
missing_per_column = df.isna().sum()
print(missing_per_column)

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [10]:
# Replace NaN with an empty string in each text column listed below
for nullable_column in ('director', 'cast', 'country', 'date_added', 'rating', 'duration'):
    df[nullable_column] = df[nullable_column].fillna('')

# Re-count the missing values to check the result
print(df.isna().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [11]:
# Definition of the `shows` table. The very same statement is executed against
# Cassandra and against MySQL further down in this notebook, so both databases
# get identical column names and types.
# NOTE(review): the DataFrame also has a `type` column that has no counterpart
# in this table — confirm the omission is intentional.
shows_table = """
CREATE TABLE shows (
    show_id INT PRIMARY KEY,
    title TEXT,
    director TEXT,
    cast TEXT,
    country TEXT,
    date_added TEXT,
    release_year INT,
    rating TEXT,
    duration TEXT,
    listed_in TEXT,
    description TEXT
)
"""

### Cassandra

In [12]:
# Open a session on the Cassandra cluster; the contact point is the local
# address (replace it with the server's IP when the cluster is remote)
contact_points = ['127.0.0.1']
cluster = Cluster(contact_points)
cassandra_session = cluster.connect()

# Check the connection by running a simple query that reads the server version
version_query = "SELECT release_version FROM system.local"
row = cassandra_session.execute(version_query).one()
print(f"Cassandra version: {row.release_version}")

Cassandra version: 4.0.14


In [13]:
# Print the name of every keyspace that currently exists
keyspaces = cassandra_session.execute(
    "SELECT keyspace_name FROM system_schema.keyspaces"
)
for keyspace_row in keyspaces:
    print(keyspace_row.keyspace_name)

system_auth
system_schema
netflix
system_distributed
system
system_traces


In [14]:
# Start from a clean slate: drop the `netflix` keyspace if it is already there
cassandra_session.execute("DROP KEYSPACE IF EXISTS netflix;")

# Re-create the keyspace (SimpleStrategy, replication factor 1) and switch the session to it
create_keyspace_cql = """
CREATE KEYSPACE IF NOT EXISTS netflix
WITH REPLICATION = {
    'class' : 'SimpleStrategy',
    'replication_factor' : 1
}
"""
cassandra_session.execute(create_keyspace_cql)
cassandra_session.set_keyspace('netflix')

# (Re)create the `shows` table from the shared table definition
cassandra_session.execute("DROP TABLE IF EXISTS shows")
cassandra_session.execute(shows_table)

<cassandra.cluster.ResultSet at 0x75c885b58340>

### MySQL

In [15]:
# Database connection settings. Credentials are read from the environment so they
# do not have to live in the notebook; the previous values remain the defaults.
username = os.environ.get('MYSQL_USER', 'user')
password = os.environ.get('MYSQL_PASSWORD', 'password')
database = 'TDLE'

# Create the SQLAlchemy engine used for every MySQL operation in this notebook
mysql_engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@localhost:3306/{database}')

# engine.begin() commits when the block exits successfully, so the table
# (re)creation does not depend on the database implicitly committing DDL.
with mysql_engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS shows;"))
    conn.execute(text(shows_table))

## CRUD : Create, Read, Update, Delete
- Create (*Insert*) : Insérer des données dans la base de données.
- Read (*Select*) : Récupérer des données.
- Update (*Update*) : Modifier des données existantes.
- Delete (*Delete*) : Effacer des données.

In [16]:
# Make sure both numeric columns are stored as integers before the inserts
for int_column in ('show_id', 'release_year'):
    df[int_column] = df[int_column].astype(int)

### 1. Create 

In [17]:
# 1. Test d'insertion
def test_insert():
    # Cassandra
    insert_query = cassandra_session.prepare("INSERT INTO shows (show_id, title, director, cast, country, date_added, release_year, rating, duration, listed_in, description) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
    start_time = time.time()
    for _, row in df.iterrows():
        cassandra_session.execute(insert_query, (row['show_id'], row['title'], row['director'], row['cast'], row['country'], row['date_added'], row['release_year'], row['rating'], row['duration'], row['listed_in'], row['description']))
    cassandra_time = time.time() - start_time

    # MySQL
    insert_query_mysql = """
    INSERT INTO shows (show_id, title, director, cast, country, date_added, release_year, rating, duration, listed_in, description) 
    VALUES (:show_id, :title, :director, :cast, :country, :date_added, :release_year, :rating, :duration, :listed_in, :description)
    """
    with mysql_engine.connect() as conn:
        conn.execute(text("USE TDLE;"))
        start_time = time.time()
        for _, row in df.iterrows():
            conn.execute(text(insert_query_mysql), row.to_dict())

        conn.commit()  # Effectuer le commit : : Le commit fait partie intégrante du processus d'insertion dans une base de données relationnelle. Si les insertions ne sont pas validées par un commit, les données ne seront pas enregistrées. Par conséquent, la durée de cette opération est significative pour évaluer la performance globale de la transaction.
        mysql_time = time.time() - start_time


    return cassandra_time, mysql_time

In [18]:
# Run the insert benchmark and report both durations
insert_timings = test_insert()
cassandra_time, mysql_time = insert_timings
print(f"Insertion time: Cassandra {cassandra_time:.5f}s, MySQL {mysql_time:.5f}s")

Insertion time: Cassandra 4.06159s, MySQL 2.22347s


# STOP ICI : la suite est à faire 

### 2. Read

In [19]:
# 2. Test de lecture
def test_select():
    # Cassandra
    start_time = time.time()
    cassandra_session.execute("SELECT * FROM shows LIMIT 100")  # Limite à 100 pour la performance
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    result = pd.read_sql("SELECT * FROM shows LIMIT 100", con=mysql_engine)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

In [20]:
# Run the read benchmark and report both durations
select_timings = test_select()
cassandra_time, mysql_time = select_timings
print(f"Select time: Cassandra {cassandra_time:.5f}s, MySQL {mysql_time:.5f}s")

Select time: Cassandra 0.00236s, MySQL 0.00368s


### 3. Update

In [21]:
# 3. Test de mise à jour
def test_update():
    # Exemple de mise à jour pour un champ spécifique
    # Cassandra
    start_time = time.time()
    cassandra_session.execute("""
    UPDATE shows SET rating = 'PG-13' WHERE show_id = '1'  -- Remplace par une condition adéquate
    """)
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    with mysql_engine.connect() as conn:
        conn.execute("""
        UPDATE shows SET rating = 'PG-13' WHERE show_id = '1'  -- Remplace par une condition adéquate
        """)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

In [22]:
# Run the update benchmark and report both durations
update_timings = test_update()
cassandra_time, mysql_time = update_timings
print(f"Update time: Cassandra {cassandra_time:.5f}s, MySQL {mysql_time:.5f}s")

InvalidRequest: Error from server: code=2200 [Invalid query] message="Invalid STRING constant (1) for "show_id" of type int"

### 4. Delete

In [None]:
# 4. Test de suppression
def test_delete():
    # Cassandra
    start_time = time.time()
    cassandra_session.execute("""
    DELETE FROM shows WHERE show_id = '1'  -- Remplace par une condition adéquate
    """)
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    with mysql_engine.connect() as conn:
        conn.execute("""
        DELETE FROM shows WHERE show_id = '1'  -- Remplace par une condition adéquate
        """)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

InvalidRequest: Error from server: code=2200 [Invalid query] message="Invalid FLOAT constant (NaN) for "cast" of type text"

In [None]:
# Run the delete benchmark and report both durations
delete_timings = test_delete()
cassandra_time, mysql_time = delete_timings
print(f"Delete time: Cassandra {cassandra_time:.5f}s, MySQL {mysql_time:.5f}s")

## Fin du notebook

In [None]:
# Close everything, in order: the Cassandra session, the Cassandra cluster, then the MySQL engine
for release in (cassandra_session.shutdown, cluster.shutdown, mysql_engine.dispose):
    release()