In [69]:
#Ici , on va appliquer les algorithmes de machine learning sur le dataset de succès commercial des films.
#On affiche d'abord la tete du dataset
import pandas as pd 
df = pd.read_csv('movie_dataset_critical_success.csv')
print(df.head())

      budget                                    genres original_language  \
0  237000000  Action Adventure Fantasy Science Fiction                en   
1  300000000                  Adventure Fantasy Action                en   
2  245000000                    Action Adventure Crime                en   
3  250000000               Action Crime Drama Thriller                en   
4  260000000          Action Adventure Science Fiction                en   

      revenue  runtime  vote_count           director  Action  Adventure  \
0  2787965087    162.0       11800      James Cameron    True       True   
1   961000000    169.0        4500     Gore Verbinski    True       True   
2   880674609    148.0        4466         Sam Mendes    True       True   
3  1084939099    165.0        9106  Christopher Nolan    True      False   
4   284139100    132.0        2124     Andrew Stanton    True       True   

   Animation  ...  Movie  Music  Mystery  Romance  Science     TV  Thriller  \
0      

In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score

# -----------------------------
# Chargement du dataset
# -----------------------------
df = pd.read_csv('movie_dataset_critical_success.csv')

# -----------------------------
# One-Hot Encoding des colonnes catégorielles
# -----------------------------
categorical_cols = [col for col in ['genres', 'director', 'original_language'] if col in df.columns]
print("Colonnes catégorielles encodées :", categorical_cols)
df_model = pd.get_dummies(df, columns=categorical_cols)

# Conversion des booléens en int
for col in df_model.columns:
    if df_model[col].dtype == 'bool':
        df_model[col] = df_model[col].astype(int)

# -----------------------------
# Séparation features / target
# -----------------------------
X = df_model.drop('succes_critique', axis=1)
y = df_model['succes_critique']

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)







Colonnes catégorielles encodées : ['genres', 'director', 'original_language']


In [86]:
# -----------------------------
# 1) Decision Tree
# -----------------------------
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)
precision_dt = precision_score(y_test, y_pred_dt, pos_label=1)
print(f'Precision Decision Tree: {precision_dt:.4f}')

Precision Decision Tree: 0.6749


In [None]:
# -----------------------------
# 2) Random Forest
# -----------------------------
clf_rf = RandomForestClassifier(n_estimators=300, random_state=42 , max_depth=10) 
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
precision_rf = precision_score(y_test, y_pred_rf, pos_label=1)
print(f'Precision Random Forest: {precision_rf:.4f}')


Precision Random Forest: 0.9130


In [79]:
# -----------------------------
# 3) Naive Bayes
# -----------------------------
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)
precision_nb = precision_score(y_test, y_pred_nb, pos_label=1)
print(f'Precision Naive Bayes: {precision_nb:.4f}')


Precision Naive Bayes: 0.5437


In [80]:
# -----------------------------
# 4) K-Nearest Neighbors
# -----------------------------
clf_knn = KNeighborsClassifier(n_neighbors=2)
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)
precision_knn = precision_score(y_test, y_pred_knn, pos_label=1)
print(f'Precision KNN: {precision_knn:.4f}')


Precision KNN: 0.5536


In [81]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np

# -----------------------------
# Supposons que X et y sont déjà définis
# -----------------------------

# Valeurs de cross-validation
cv_values = range(2, 11)  # de 2 à 10 folds pour l'exemple

# Stockage des résultats
precision_dt = []
precision_rf = []
precision_nb = []
precision_knn_dict = {}  # pour plusieurs k

# Valeurs de k pour KNN
k_values = [1, 3, 5, 7, 9]

# Boucle sur cv
for cv in cv_values:
    # Decision Tree
    clf_dt = DecisionTreeClassifier(random_state=42)
    scores_dt = cross_val_score(clf_dt, X, y, cv=cv, scoring='precision')
    precision_dt.append(scores_dt.mean())
    
    # Random Forest
    clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    scores_rf = cross_val_score(clf_rf, X, y, cv=cv, scoring='precision')
    precision_rf.append(scores_rf.mean())
    
    # Naive Bayes
    clf_nb = GaussianNB()
    scores_nb = cross_val_score(clf_nb, X, y, cv=cv, scoring='precision')
    precision_nb.append(scores_nb.mean())
    
    # KNN pour chaque k
    for k in k_values:
        clf_knn = KNeighborsClassifier(n_neighbors=k)
        scores_knn = cross_val_score(clf_knn, X, y, cv=cv, scoring='precision')
        if k not in precision_knn_dict:
            precision_knn_dict[k] = []
        precision_knn_dict[k].append(scores_knn.mean())

# -----------------------------
# Graphique
# -----------------------------
plt.figure(figsize=(12, 6))
plt.plot(cv_values, precision_dt, marker='o', label='Decision Tree')
plt.plot(cv_values, precision_rf, marker='o', label='Random Forest')
plt.plot(cv_values, precision_nb, marker='o', label='Naive Bayes')

for k in k_values:
    plt.plot(cv_values, precision_knn_dict[k], marker='x', linestyle='--', label=f'KNN k={k}')

plt.title('Précision moyenne en cross-validation selon cv et k (KNN)')
plt.xlabel('Nombre de folds (cv)')
plt.ylabel('Précision moyenne')
plt.grid(True)
plt.legend()
plt.show()



KeyboardInterrupt: 