In [1]:
import csv
import os

import numpy as np
import pandas as pd


In [17]:
# Load the three source tables from the local datasets/ folder.
# NOTE(review): later cells write datasets/users.csv and overwrite
# datasets/movies.csv, so what is read here depends on which cells ran before.
movies, ratings, users = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/movies.csv',
        'datasets/ratings.csv',
        'datasets/users.csv',
    )
)

In [18]:
# Non-null value count for each column of `movies`.
movies.count()


movieId         62423
title           62423
genres          62423
actors          62423
realizations    62423
date            62013
dtype: int64

In [50]:
# Non-null value count for each column of `users`.
users.count()

userId        162541
name          162541
gender        162541
age           162541
occupation    162541
zip_code      162541
dtype: int64

In [51]:
# Non-null value count for each column of `ratings`.
ratings.count()


userId       25000095
movieId      25000095
rating       25000095
timestamp    25000095
dtype: int64

In [4]:
# Preview the first rows of `movies`.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [53]:
# Display the `movies` row at positional index 150 (0-based)
movies.iloc[150]

movieId                      152
title      Addiction, The (1995)
genres              Drama|Horror
Name: 150, dtype: object

In [12]:
import re

def clean_movie_titles(df, title_col="title"):
    """Normalise the movie titles stored in ``df[title_col]``.

    For each string title: drop any parenthesised group that contains ", "
    (e.g. ``(Confessionnal, Le)``), remove every remaining comma, then collapse
    whitespace runs into a single space and trim both ends. Values that are not
    strings (such as NaN or None) are returned unchanged instead of raising
    ``TypeError``.

    Args:
        df: DataFrame holding the titles; the title column is overwritten in place.
        title_col: Name of the column that contains the titles.

    Returns:
        The same ``df`` object, with the cleaned title column.
    """
    def process_title(title):
        # Missing titles are not strings; re.sub / str.replace would raise on them.
        if not isinstance(title, str):
            return title
        # Remove parentheses that contain a translation (e.g. (Confessionnal, Le))
        title = re.sub(r"\([^)]*, [^)]+\)", "", title)
        # Remove commas
        title = title.replace(",", "")
        # Collapse repeated whitespace
        title = re.sub(r"\s+", " ", title).strip()
        return title

    df[title_col] = df[title_col].apply(process_title)
    return df

# Apply the cleaning function to the movies DataFrame
movies = clean_movie_titles(movies)


In [13]:
# Show the row at positional index 150 again
movies.iloc[150]


movieId                                              152
title                                      Addiction The
genres                                      Drama|Horror
actors          Diana Barnett|Curtis Hanna|Michele Johns
realizations                Jodi Rodriguez|Michael Perry
date                                                1995
Name: 150, dtype: object

In [19]:

# Save movies to movies.csv without the row index.
# NOTE(review): this is the same path the notebook reads `movies` from, so the source file is overwritten.
movies.to_csv('datasets/movies.csv', index=False)

In [57]:
# Preview the first rows of `ratings`.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
from faker import Faker
import pandas as pd
import random

# Seed both random sources so that re-running the notebook regenerates the same
# fake users, actors and directors (same seed value as the ratings sample).
random.seed(42)
Faker.seed(42)
fake = Faker()

# Example: the ratings DataFrame is already loaded and has a 'userId' column
user_ids = ratings['userId'].unique()

# Generate random user data
def generate_user(uid):
    """Build one fake user record for the given user id.

    Args:
        uid: Identifier stored under the 'userId' key.

    Returns:
        dict with keys 'userId', 'name', 'gender', 'age', 'occupation' and
        'zip_code'. Gender is 'M' or 'F', age is an int in [12, 70] and
        occupation is an int in [1, 30]; the name and zip code come from Faker.
    """
    gender = random.choice(['M', 'F'])  # M = male, F = female
    # Pick the first-name generator that matches the drawn gender.
    pick_first_name = fake.first_name_male if gender == 'M' else fake.first_name_female
    name = pick_first_name() + " " + fake.last_name()
    # Dict values are evaluated top to bottom, which keeps the draw order stable.
    return {
        'userId': uid,
        'name': name,
        'gender': gender,
        'age': random.randint(12, 70),
        'occupation': random.randint(1, 30),
        'zip_code': fake.zipcode(),
    }

# Build one fake profile per distinct userId, then assemble them into a DataFrame
user_data = list(map(generate_user, user_ids))
users = pd.DataFrame(user_data)

# Generate actors and directors as '|'-separated strings
def generate_actors():
    """Return between 1 and 4 fake "First Last" names joined with '|'."""
    cast = []
    for _ in range(random.randint(1, 4)):
        cast.append(fake.first_name() + " " + fake.last_name())
    return "|".join(cast)

def generate_directors():
    """Return 1 or 2 fake "First Last" names joined with '|'."""
    how_many = random.randint(1, 2)
    directors = [
        " ".join((fake.first_name(), fake.last_name()))
        for _ in range(how_many)
    ]
    return "|".join(directors)

# Add the 'actors' and 'realizations' text columns: one generated string per movie row
movies['actors'] = [generate_actors() for _ in movies['movieId']]
movies['realizations'] = [generate_directors() for _ in movies['movieId']]

# Preview
print(movies.head())

# Save the generated users
users.to_csv('datasets/users.csv', index=False)

# Save movies with actors and realizations stored as '|'-separated strings
movies.to_csv('datasets/movies.csv', index=False)


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                              actors  \
0                         Tina Stewart|Jessica Smith   
1  Richard Sanchez|David Walker|Carol Rodriguez|J...   
2  Nicholas Ramsey|Donna Williams|David Huerta|Ch...   
3                                      Curtis Wright   
4                      David Rogers|Stanley Galloway   

                     realizations  
0  Jeremy Hendricks|Sonya

In [11]:
import re

# Function that separates the title from the date
def split_title_and_date(title):
    """Split a title such as "Toy Story (1995)" into ("Toy Story", "1995").

    The last parenthesised 4-digit group in the title is taken as the year and
    only that occurrence is removed, so any other parenthetical is kept.

    Args:
        title: Raw title. Non-string values (e.g. NaN, None) are passed through.

    Returns:
        tuple ``(title_clean, year)`` where ``year`` is the 4-digit string, or
        ``None`` when the title has no "(dddd)" group or is not a string. When
        no year is found the title is returned unchanged.
    """
    # Missing titles are not strings; the regex functions would raise on them.
    if not isinstance(title, str):
        return title, None

    year_groups = list(re.finditer(r'\((\d{4})\)', title))
    if not year_groups:
        return title, None

    # Cut out only the chosen match instead of every "(dddd)" in the title.
    last = year_groups[-1]
    title_clean = (title[:last.start()] + title[last.end():]).strip()
    return title_clean, last.group(1)

# Apply to the DataFrame: split every title once and unpack the
# (clean_title, year) tuples into columns. A plain list of tuples avoids
# building one pd.Series per row, which is very slow on large frames.
title_year_pairs = movies['title'].map(split_title_and_date).tolist()
title_year = pd.DataFrame(
    title_year_pairs, index=movies.index, columns=['clean_title', 'date']
)

# Replace the old title column with the cleaned title and add the year column
movies['title'] = title_year['clean_title']
movies['date'] = title_year['date']

# Preview
print(movies.head())

# Save the result
movies.to_csv('datasets/movies.csv', index=False)


   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                              actors  \
0                         Tina Stewart|Jessica Smith   
1  Richard Sanchez|David Walker|Carol Rodriguez|J...   
2  Nicholas Ramsey|Donna Williams|David Huerta|Ch...   
3                                      Curtis Wright   
4                      David Rogers|Stanley Galloway   

                     realizations  date  
0  Jeremy Hendricks|Sonya Edwards  1995  
1    Edward William

In [15]:
def export_to_dat(df: pd.DataFrame, filename: str):
    """
    Export a DataFrame to a .dat file that uses '::' as the field separator.

    pandas only accepts a single-character separator, so the frame is first
    written to a temporary CSV, which is then parsed with the csv module and
    re-joined with '::'. Parsing (rather than replacing every comma) keeps
    commas that belong to a value, e.g. "Addiction, The", and drops the CSV
    quoting around such values. The temporary CSV is deleted afterwards.

    Note: a value that itself contains '::' cannot be told apart from a
    separator in the output; no escaping is applied.

    Args:
        df (pd.DataFrame): The DataFrame to export (no header, no index written).
        filename (str): Output path without the .dat extension.
    """
    temp_csv = f"{filename}_temp.csv"
    final_dat = f"{filename}.dat"

    try:
        # 1. Temporary CSV export with a comma separator
        df.to_csv(temp_csv, index=False, header=False, lineterminator="\n", encoding='utf-8')

        # 2. Re-join the parsed fields with '::'
        # newline='' lets the csv module handle line endings inside quoted fields.
        with open(temp_csv, 'r', encoding='utf-8', newline='') as infile, \
                open(final_dat, 'w', encoding='utf-8') as outfile:
            for fields in csv.reader(infile):
                outfile.write('::'.join(fields) + '\n')
    finally:
        # Do not leave the intermediate file behind, even when the export fails.
        if os.path.exists(temp_csv):
            os.remove(temp_csv)

    print(f"✅ Fichier .dat généré avec succès : {final_dat}")


In [None]:
# Export the full movies, ratings and users tables to datasets/*.dat.
# NOTE(review): the sample export further down writes to the same three .dat paths.
export_to_dat(movies, 'datasets/movies')
export_to_dat(ratings, 'datasets/ratings')
export_to_dat(users, 'datasets/users')

✅ Fichier .dat généré avec succès : datasets/movies.dat


In [44]:
# Draw a random sample of 10,000 ratings; the fixed random_state makes the draw repeatable
ratings_sample = ratings.sample(n=10000, random_state=42)
ratings_sample.head()


Unnamed: 0,userId,movieId,rating,timestamp
15347762,99476,104374,3.5,1467897440
16647840,107979,2634,4.0,994007728
23915192,155372,1614,3.0,1097887531
10052313,65225,7153,4.0,1201382275
12214125,79161,500,5.0,1488915363


In [45]:
# Distinct user and movie IDs that appear in the sampled ratings
selected_user_ids = ratings_sample['userId'].unique()
selected_movie_ids = ratings_sample['movieId'].unique()

# Keep only the sampled users and movies, ordered by their ID
filtered_users = (
    users.loc[users['userId'].isin(selected_user_ids)]
    .sort_values(by='userId')
)
filtered_movies = (
    movies.loc[movies['movieId'].isin(selected_movie_ids)]
    .sort_values(by='movieId')
)

In [46]:
# Save the sampled ratings and the matching movies/users as separate *_sample.csv files (no row index)
ratings_sample.to_csv('datasets/ratings_sample.csv', index=False)
filtered_movies.to_csv('datasets/movies_sample.csv', index=False)
filtered_users.to_csv('datasets/users_sample.csv', index=False)

In [47]:
# Export the sampled tables to .dat.
# NOTE(review): these are the same datasets/*.dat paths used for the full export above, so those files are overwritten.
export_to_dat(filtered_movies, 'datasets/movies')
export_to_dat(ratings_sample, 'datasets/ratings')
export_to_dat(filtered_users, 'datasets/users')

✅ Fichier .dat généré avec succès : datasets/movies.dat
✅ Fichier .dat généré avec succès : datasets/ratings.dat
✅ Fichier .dat généré avec succès : datasets/users.dat
