In [1]:
###Manipulation
import pandas as pd
import numpy as np
from math import sqrt

###Preprocessing
from sklearn.preprocessing import MinMaxScaler,RobustScaler, Binarizer,PolynomialFeatures,LabelEncoder,OrdinalEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.impute import SimpleImputer,KNNImputer

###Modeles
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge,RidgeCV

###MachineLearning
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

###Base de donnée
from sqlalchemy import create_engine, text

###Metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
###Importation des données depuis la base de donnée
def load_data(database_uri=None):
    """Load the ``datanetfloox.predictscore`` table into a DataFrame.

    The connection string (which contains the database password) is not
    stored in the notebook. It is taken from ``database_uri`` or, when that
    is omitted, from the ``NETFLOOX_DATABASE_URI`` environment variable.

    Parameters
    ----------
    database_uri : str, optional
        SQLAlchemy connection URI. Defaults to the value of the
        ``NETFLOOX_DATABASE_URI`` environment variable.

    Returns
    -------
    pandas.DataFrame
        The result of ``SELECT * FROM datanetfloox.predictscore``.

    Raises
    ------
    RuntimeError
        If no URI is passed and ``NETFLOOX_DATABASE_URI`` is unset or empty.
    """
    # Local import: the notebook's import cell does not bring `os` into scope.
    import os

    uri = database_uri or os.environ.get("NETFLOOX_DATABASE_URI")
    if not uri:
        raise RuntimeError(
            "No database URI provided: pass `database_uri` or set the "
            "NETFLOOX_DATABASE_URI environment variable."
        )

    engine = create_engine(uri)
    try:
        # The connection is closed when the `with` block exits.
        with engine.connect() as connection:
            sql_queries = text('SELECT * FROM datanetfloox.predictscore')
            return pd.read_sql(sql_queries, connection)
    finally:
        # Release the pooled connections held by the engine.
        engine.dispose()
# Fetch the dataset; the cells below start from this DataFrame.
df = load_data()
    


#TitreBasic,TitreRating=load_data()
#df = pd.merge(TitreBasic,TitreRating, on='tconst',how='left')

In [None]:
### Light pre-processing to get the data ready for the pipeline:
### drop duplicated rows, turn the '\N' placeholder into NaN,
### then discard the rows that have no target value.
df = (
    df.drop_duplicates()
      .replace('\\N', np.nan)
      .dropna(subset=["averageRating"])
)

In [None]:
### Define the target and the features
y = df["averageRating"]
X = df.drop(["averageRating"], axis=1)
### Split into training and test sets.
### random_state is fixed so that re-running the notebook yields the same
### split, which keeps the reported metrics comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
### Split the feature columns by dtype: 'object' columns are handled as
### categorical, every other column as numeric.
_OBJECT_DTYPES = ['object']
numeric_features = X.select_dtypes(exclude=_OBJECT_DTYPES).columns
categorial_features = X.select_dtypes(include=_OBJECT_DTYPES).columns

In [None]:
# Show the columns selected as categorical (object dtype).
categorial_features

In [None]:
# Show the columns selected as numeric (every non-object dtype).
numeric_features

In [None]:
### Numeric pre-processing pipeline: fill missing values with the median,
### then rescale with a MinMaxScaler.
_numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', MinMaxScaler()),
]
pipe_num = Pipeline(steps=_numeric_steps)

In [None]:
### Categorical pre-processing pipeline: fill missing values with the most
### frequent value, then one-hot encode (categories unseen at fit time are ignored).
_categorical_steps = [
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ("vectorizer", OneHotEncoder(handle_unknown='ignore')),
]
pipe_text = Pipeline(steps=_categorical_steps)

In [None]:
### Route each group of columns to its own pre-processing pipeline
### through a ColumnTransformer.
_column_transformers = [
    ('scaler', pipe_num, numeric_features),
    ('text_encodeur', pipe_text, categorial_features),
]
preprocessor = ColumnTransformer(transformers=_column_transformers)

In [None]:
### Chain pre-processing and model into a single estimator.
### The 'Model' step is the one the grid-search parameter grid overrides.
_pipeline_steps = [
    ('preprocessor', preprocessor),
    ('Model', LinearRegression()),
]
pipeline = Pipeline(steps=_pipeline_steps)

In [None]:
### Scorers evaluated by the grid search
scoring_metrics = [
    'r2',
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_root_mean_squared_error',
]

In [None]:
### Models and associated hyper-parameters explored by the grid search.
### The forest is seeded so that cross-validation scores and the selected
### model are reproducible from one run of the notebook to the next.
params = [
    {
        'Model': [RandomForestRegressor(random_state=42)],
        'Model__max_depth': [2, 9, 50],
        'Model__min_samples_split': [3, 10, 50],
        'Model__criterion': ["poisson"],
    }
]

In [None]:
### Grid search definition: 5-fold cross-validation over `params`, scored
### with every metric in `scoring_metrics`; the final model is refitted
### using the best mean absolute error.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring=scoring_metrics,
    refit='neg_mean_absolute_error',
    cv=5,
    n_jobs=14,
)

In [None]:
### Run the grid search on the training set, then predict on the test set.
### fit() returns the fitted search object, so the two calls can be chained.
y_pred = grid.fit(X_train, y_train).predict(X_test)
    

In [None]:
### Retrieve the best model found by the grid search and print it
best_model = grid.best_estimator_
print('Modele retenu:',best_model)

In [None]:
### Predict on the test set with the best model.
### The grid search is configured with refit='neg_mean_absolute_error', so
### best_estimator_ is already fitted on the full training set. Fitting it
### again would repeat that work and, for a randomised model, could yield a
### different model from the one that was selected.
y_pred = best_model.predict(X_test)

In [None]:
### Metrics: evaluate the predictions against the test targets.
### The MSE is computed once and reused for the RMSE.
_mse_test = mean_squared_error(y_test, y_pred)
print('r2_score', r2_score(y_test, y_pred))
print('mean_absolute_error', mean_absolute_error(y_test, y_pred))
print('mean_squared_error', _mse_test)
print('root_mean_squared_error', sqrt(_mse_test))