# Nettoyage du dataset

In [0]:
# Load the ratings CSV from the bucket with Spark, then bring it into pandas.
# Reader settings: first line is a header, column types are inferred, records
# may span several lines, and a double quote is both the quote and the escape
# character.
csv_reader_options = {
    "header": "true",
    "inferSchema": "true",
    "multiLine": "true",
    "quote": '"',
    "escape": '"',
}
Rdf = spark.read.options(**csv_reader_options).csv("gs://bucket-autoai/ratings.csv")

# toPandas() collects the whole Spark DataFrame into a pandas DataFrame.
df = Rdf.toPandas()
display(df)

In [0]:
# Print a summary of the loaded frame: column dtypes, non-null counts, memory usage.
df.info()

In [0]:
# Count the rows that repeat an earlier row on every column.
df.duplicated().sum()

In [0]:
# Count the missing values in each column.
df.isna().sum()

In [0]:
import pandas as pd

# Standardize the date format: parse the rating update column into pandas
# datetimes so that the `.dt` accessor can be used on it later.
date_col = "rating_updated_on"
df[date_col] = pd.to_datetime(df[date_col])

In [0]:
# Make the column names uniform (snake_case).
def _to_snake_case(name):
    """Trim a column name, lower-case it, and turn hyphens and spaces into underscores."""
    return name.strip().lower().replace("-", "_").replace(" ", "_")


df.columns = df.columns.map(_to_snake_case)


In [0]:
# Enforce the numeric dtype of each measurement / identifier column.
# int casts raise if the column still holds missing values.
numeric_dtypes = {
    "rolloverpossibility": float,
    "rolloverpossibility2": float,
    "modelyear": int,
    "vehicleid": int,
}
for column_name, target_dtype in numeric_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)


In [0]:
# Collect the names of every column currently stored with object dtype.
# NOTE(review): despite the name, this selects all object columns, not only
# rating columns — presumably text columns such as make/model too; confirm.
rating_cols = df.select_dtypes(include="object").columns

# Cast those columns to the categorical dtype.
# NOTE(review): the text-cleaning cell further down rebuilds these columns
# through `.str` methods, which presumably yields object dtype again — confirm
# whether this cast is still in effect afterwards.
df[rating_cols] = df[rating_cols].astype("category")

# Preview the first rows after the cast.
df.head()

## Nettoyage du texte

In [0]:
def _strip_and_lower(col):
    """Return the column with surrounding whitespace removed and text lower-cased."""
    return col.str.strip().str.lower()


# Apply the same text normalization to every column listed in rating_cols.
df[rating_cols] = df[rating_cols].apply(_strip_and_lower)


In [0]:
# Map the sentinel labels meaning "no rating" to missing values so that
# null-aware operations (isna, dropna, ...) treat them as absent.
# The keys are lower-case because the values were stripped and lower-cased by
# the text-normalization cell just before this one.
missing_labels = {
    "not rated": None,
    "nr": None
}

# Cast to category only now, once the labels are final. The earlier
# astype("category") does not survive the `.str` clean-up: string accessor
# methods on a categorical column return an object-dtype result, which is what
# gets assigned back. Re-casting here restores the intended categorical dtype.
df[rating_cols] = df[rating_cols].replace(missing_labels).astype("category")


## Enrichissement des données

In [0]:
# Temporal features derived from the rating update date.
updated_on = df["rating_updated_on"]
df["rating_year"] = updated_on.dt.year
df["rating_month"] = updated_on.dt.month
# Quarterly period of the update date.
df["rating_quarter"] = updated_on.dt.to_period("Q")

In [0]:
# Quality / safety flags.
# These flags are meant to be:
#   - cross-referenced with after-sales (SAV) complaints
#   - used to detect weak quality signals

# Vehicles at risk of rollover: rollover possibility strictly above the cut-off.
# A missing possibility compares as False.
ROLLOVER_RISK_THRESHOLD = 0.2
df["high_rollover_risk"] = df["rolloverpossibility"].gt(ROLLOVER_RISK_THRESHOLD)

# Low overall rating.
# NOTE(review): assumes overallrating holds labels spelled exactly
# "1 star" / "2 stars" — confirm against the actual values in the data.
low_rating_labels = ["1 star", "2 stars"]
df["low_overall_rating"] = df["overallrating"].isin(low_rating_labels)

In [0]:
# Normalized vehicle identifier: "<make>_<model>_<modelyear>", with make and
# model lower-cased. Intended for cross-matching reviews / after-sales (SAV) /
# ratings.
make_part = df["make"].str.lower()
model_part = df["model"].str.lower()
year_part = df["modelyear"].astype(str)

# A missing make or model propagates to a missing key.
df["vehicle_key"] = make_part + "_" + model_part + "_" + year_part

In [0]:
# Re-check dtypes and non-null counts now that the cleaning and enrichment cells have run.
df.info()

In [0]:
# Display the enriched DataFrame (a bare last expression is rendered as the cell output).
df

In [0]:
# Columns retained in the cleaned dataset, grouped by role.
# NOTE(review): high_rollover_risk and low_overall_rating are computed earlier
# but are not listed here — confirm whether they should be kept.
vehicle_id_cols = ["make", "model", "modelyear", "vehicleid", "vehicle_key"]
crash_rating_cols = [
    "overallrating",
    "overallfrontcrashrating",
    "overallsidecrashrating",
    "rolloverrating",
    "rolloverpossibility",
    "sidepolecrashrating",
]
safety_feature_cols = [
    "nhtsaelectronicstabilitycontrol",
    "nhtsaforwardcollisionwarning",
    "nhtsalanedeparturewarning",
]
rating_date_cols = ["rating_updated_on", "rating_year", "rating_month", "rating_quarter"]

cols_to_keep = vehicle_id_cols + crash_rating_cols + safety_feature_cols + rating_date_cols

# Independent copy, so later edits to df_clean do not touch df.
df_clean = df.loc[:, cols_to_keep].copy()


In [0]:
# Display the cleaned DataFrame restricted to the retained columns.
df_clean