In [1]:
import csv
import uuid
from cassandra.cluster import Cluster

### Importer les données

In [2]:
# importer dataset 
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shivamb/netflix-shows")

print("Path to dataset files:", path)

Path to dataset files: /home/zoe/.cache/kagglehub/datasets/shivamb/netflix-shows/versions/5


In [3]:
# get le nom du fichier 
import os
files = os.listdir(path)
print(files)

['netflix_titles.csv']


In [4]:
import pandas as pd

filename = f"{path}/{files[0]}"
df = pd.read_csv(filename)  # Adapte le nom du fichier si nécessaire

In [5]:
from tabulate import tabulate

# Afficher le DataFrame en utilisant tabulate
print(tabulate(df.head(10), headers='keys', tablefmt='psql'))


+----+-----------+---------+----------------------------------+-------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------+--------------------+----------------+----------+------------+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|    | show_id   | type    | title                            | director                      | cast                                                                                                                                                

In [6]:
# types des colonnes
print(df.dtypes)

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [7]:
# Convertir la colonne show_id en int en enlevant le préfixe 's' (plus simple pour gérer exactement le meme type de données en Cassandra et MySQL)
df['show_id'] = df['show_id'].str.replace('s', '').astype(int)

In [8]:
print(df.dtypes)

show_id          int64
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [9]:
shows_table = """
CREATE TABLE shows (
    show_id INT PRIMARY KEY,
    title TEXT,
    director TEXT,
    cast TEXT,
    country TEXT,
    date_added TEXT,
    release_year INT,
    rating TEXT,
    duration TEXT,
    listed_in TEXT,
    description TEXT
)
"""

### Cassandra

In [10]:
# Connecte-toi au cluster Cassandra (adresse locale)
cluster = Cluster(['127.0.0.1'])  # Remplace par l'adresse IP si c'est un serveur distant
session = cluster.connect()

# Vérifie la connexion en exécutant une commande simple
row = session.execute("SELECT release_version FROM system.local").one()
print(f"Cassandra version: {row.release_version}")

Cassandra version: 4.0.14


In [11]:
# Lister les keyspaces existants 
keyspaces = session.execute("SELECT keyspace_name FROM system_schema.keyspaces")
for ks in keyspaces:
    print(ks.keyspace_name)

system_auth
system_schema
netflix
system_distributed
system
system_traces


In [12]:
session.execute("DROP KEYSPACE IF EXISTS netflix;")

session.execute("""
CREATE KEYSPACE IF NOT EXISTS netflix
WITH REPLICATION = {
    'class' : 'SimpleStrategy',
    'replication_factor' : 1
}
""")

session.set_keyspace('netflix')

session.execute("DROP TABLE IF EXISTS shows")

session.execute(shows_table)

<cassandra.cluster.ResultSet at 0x7ab8259a26b0>

### MySQL

In [None]:
from sqlalchemy import create_engine, Table, Column, Integer, Text, MetaData, text, String

# Configurer la connexion à la base de données
username = 'user'
password = 'password'
database = 'TDLE'

# Créer un moteur SQLAlchemy pour la connexion
mysql_engine = create_engine(f'mysql+mysqlconnector://{username}:{password}@localhost:3306/{database}')

with mysql_engine.connect() as conn:

    conn.execute(text("DROP TABLE IF EXISTS shows;"))

    conn.execute(text(shows_table))

## CRUD : Create, Read, Update, Delete
- Create (*Insert*) : Insérer des données dans la base de données.
- Read (*Select*) : Récupérer des données.
- Update (*Update*) : Modifier des données existantes.
- Delete (*Delete*) : Effacer des données.

### 1. Create 

# JE ME SUIS ARRETE LA ! LA SUITE C'EST POUR DEMAIN

In [None]:
# 1. Test d'insertion
def test_insert():
    # Cassandra
    start_time = time.time()
    for index, row in df.iterrows():
        cassandra_session.execute("""
        INSERT INTO shows (show_id, title, director, cast, country, date_added, release_year, rating, duration, listed_in, description)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (row['show_id'], row['title'], row['director'], row['cast'], row['country'], row['date_added'],
              row['release_year'], row['rating'], row['duration'], row['listed_in'], row['description']))
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    df.to_sql('shows', con=mysql_engine, if_exists='replace', index=False)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

In [None]:
# 2. Test de lecture
def test_select():
    # Cassandra
    start_time = time.time()
    cassandra_session.execute("SELECT * FROM shows LIMIT 100")  # Limite à 100 pour la performance
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    result = pd.read_sql("SELECT * FROM shows LIMIT 100", con=mysql_engine)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

In [None]:
# 3. Test de mise à jour
def test_update():
    # Exemple de mise à jour pour un champ spécifique
    # Cassandra
    start_time = time.time()
    cassandra_session.execute("""
    UPDATE shows SET rating = 'PG-13' WHERE show_id = '1'  -- Remplace par une condition adéquate
    """)
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    with mysql_engine.connect() as conn:
        conn.execute("""
        UPDATE shows SET rating = 'PG-13' WHERE show_id = '1'  -- Remplace par une condition adéquate
        """)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

In [None]:
# 4. Test de suppression
def test_delete():
    # Cassandra
    start_time = time.time()
    cassandra_session.execute("""
    DELETE FROM shows WHERE show_id = '1'  -- Remplace par une condition adéquate
    """)
    cassandra_time = time.time() - start_time

    # MySQL
    start_time = time.time()
    with mysql_engine.connect() as conn:
        conn.execute("""
        DELETE FROM shows WHERE show_id = '1'  -- Remplace par une condition adéquate
        """)
    mysql_time = time.time() - start_time

    return cassandra_time, mysql_time

# Exécution des tests
insert_times = test_insert()
select_times = test_select()
update_times = test_update()
delete_times = test_delete()

# Affichage des résultats
print(f"Insertion - Cassandra: {insert_times[0]:.5f}s, MySQL: {insert_times[1]:.5f}s")
print(f"Lecture - Cassandra: {select_times[0]:.5f}s, MySQL: {select_times[1]:.5f}s")
print(f"Mise à jour - Cassandra: {update_times[0]:.5f}s, MySQL: {update_times[1]:.5f}s")
print(f"Suppression - Cassandra: {delete_times[0]:.5f}s, MySQL: {delete_times[1]:.5f}s")


In [14]:

# Charger les données depuis le CSV
with open(filename, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        session.execute("""
        INSERT INTO shows (show_id, title, director, cast, country, date_added, release_year, rating, duration, listed_in, description)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (str(uuid.uuid4()), row['title'], row['director'], row['cast'], row['country'], row['date_added'], int(row['release_year']), row['rating'], row['duration'], row['listed_in'], row['description']))


InvalidRequest: Error from server: code=2200 [Invalid query] message="Invalid STRING constant (db53e8df-158c-4236-8d1a-f3c7233830eb) for "show_id" of type int"

In [None]:
with mysql_engine.connect() as conn:
    # Supprimer la table
    conn.execute("DROP TABLE IF EXISTS shows;")
    
    # Créer la table
    conn.execute("""
    CREATE TABLE shows (
        show_id TEXT PRIMARY KEY,
        title TEXT,
        director TEXT,
        cast TEXT,
        country TEXT,
        date_added TEXT,
        release_year INT,
        rating TEXT,
        duration TEXT,
        listed_in TEXT,
        description TEXT
    )
    """)

ObjectNotExecutableError: Not an executable object: 'DROP TABLE IF EXISTS shows;'

In [None]:
# Tout fermer
session.shutdown() # Fermer la connexion Cassandra
cluster.shutdown() # Fermer la connexion Cassandra
mysql_engine.dispose() # Fermer la connexion MySQL