In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [6]:
# Read the raw sales CSV and, in the same expression, rename the source
# columns to the shorter names used by the following cells.
df = pd.read_csv("../data/Iphone_Sales.csv").rename(
    columns={
        "NOMBRE_CMD": "sales",
        "BASIC_MODEL_NAME": "model",
        "SEMAINE": "date_str",
    }
)

In [7]:
# Convert the raw date strings to timestamps and report the covered period.
df["date"] = df["date_str"].pipe(pd.to_datetime)
date_col = df["date"]
print("Date range:", date_col.min(), "→", date_col.max())

Date range: 2022-11-12 00:00:00+00:00 → 2025-02-15 00:00:00+00:00


In [8]:
# Parse the launch date column; errors="coerce" turns unparseable values into
# NaT instead of raising.
df["launch_date"] = df["DATE_LANCEMENT"].pipe(pd.to_datetime, errors="coerce")

In [9]:
# Order rows chronologically within each model and renumber the index 0..n-1.
df = df.sort_values(by=["model", "date"], ignore_index=True)

In [11]:
# Per-model continuity check: summarise the covered date span and flag whether
# every 7-day slot inside that span is present exactly once.
continuity_check = (
    df.groupby("model")["date"]
    .agg(
        min_date="min",
        max_date="max",
        num_records="count"
    )
    .reset_index()
)
# Number of 7-day slots between the first and last observation, inclusive.
continuity_check["expected_weeks"] = (
    (continuity_check["max_date"] - continuity_check["min_date"]).dt.days // 7 + 1
)
# Comparing the raw row count alone is not sufficient: a duplicated date would
# offset a missing week and the model would wrongly be reported as continuous.
# Require the number of *distinct* dates to match the expected count as well.
continuity_check["is_continuous"] = (
    (continuity_check["num_records"] == continuity_check["expected_weeks"])
    & (
        continuity_check["model"].map(df.groupby("model")["date"].nunique())
        == continuity_check["expected_weeks"]
    )
)
# Models with gaps or duplicates (False) are listed first.
continuity_check.sort_values("is_continuous", ascending=True).head(10)

Unnamed: 0,model,min_date,max_date,num_records,expected_weeks,is_continuous
0,IPHONE 12,2022-11-12 00:00:00+00:00,2023-12-16 00:00:00+00:00,50,58,False
1,IPHONE 13,2022-11-12 00:00:00+00:00,2025-02-08 00:00:00+00:00,106,118,False
2,IPHONE 14,2022-11-12 00:00:00+00:00,2025-02-15 00:00:00+00:00,119,119,True
3,IPHONE 15,2023-09-09 00:00:00+00:00,2025-02-15 00:00:00+00:00,76,76,True
4,IPHONE 16,2024-09-07 00:00:00+00:00,2025-02-15 00:00:00+00:00,24,24,True


In [12]:
# Whole weeks elapsed between the launch date and the row's date.
# clip(lower=0) floors negative ages (rows dated before the launch) at zero.
time_since_launch = df["date"] - df["launch_date"]
df["product_age_weeks"] = time_since_launch.dt.days.floordiv(7).clip(lower=0)

In [13]:
# Rename the RABAIS column to is_promo.
df = df.rename(columns={"RABAIS": "is_promo"})

In [14]:
# Calendar features derived from the parsed date.
# NOTE: week_of_year is the ISO 8601 week number while year is the calendar
# year, so the two can disagree for dates close to New Year.
date_parts = df["date"].dt
df["week_of_year"] = date_parts.isocalendar().week
df["month"] = date_parts.month
df["year"] = date_parts.year

In [16]:
# Rename the ANCIENNETE_MODELE column to product_age_years.
df = df.rename(columns={"ANCIENNETE_MODELE": "product_age_years"})

In [17]:
# Display the first five rows with every column, original and derived.
df.head(n=5)

Unnamed: 0,date_str,model,DATE_LANCEMENT,sales,product_age_years,PRIX_DE_DETAIL,is_promo,NOMBRE_CLIENT_FIN_CONTRAT,CROIS_BYOD,SEMAINE_NUM,Fct_CROIS,date,launch_date,product_age_weeks,week_of_year,month,year
0,2022-11-12T00:00:00Z,IPHONE 12,2020-10-16T00:00:00Z,85,2,882.0,0,115.0,0.2,1,,2022-11-12 00:00:00+00:00,2020-10-16 00:00:00+00:00,108,45,11,2022
1,2022-11-19T00:00:00Z,IPHONE 12,2020-10-16T00:00:00Z,58,2,882.0,0,140.0,0.2,2,,2022-11-19 00:00:00+00:00,2020-10-16 00:00:00+00:00,109,46,11,2022
2,2022-11-26T00:00:00Z,IPHONE 12,2020-10-16T00:00:00Z,108,2,882.0,0,119.0,0.2,3,,2022-11-26 00:00:00+00:00,2020-10-16 00:00:00+00:00,110,47,11,2022
3,2022-12-03T00:00:00Z,IPHONE 12,2020-10-16T00:00:00Z,257,2,882.0,0,116.0,0.2,4,,2022-12-03 00:00:00+00:00,2020-10-16 00:00:00+00:00,111,48,12,2022
4,2022-12-10T00:00:00Z,IPHONE 12,2020-10-16T00:00:00Z,235,2,882.0,0,128.0,0.2,5,,2022-12-10 00:00:00+00:00,2020-10-16 00:00:00+00:00,112,49,12,2022


In [18]:
# Display the first rows restricted to the selected subset of columns.
preview_columns = [
    "model",
    "date",
    "SEMAINE_NUM",
    "sales",
    "is_promo",
    "product_age_weeks",
    "product_age_years",
    "PRIX_DE_DETAIL",
    "NOMBRE_CLIENT_FIN_CONTRAT",
    "CROIS_BYOD",
    "week_of_year",
    "month",
    "year",
]
df[preview_columns].head()

Unnamed: 0,model,date,SEMAINE_NUM,sales,is_promo,product_age_weeks,product_age_years,PRIX_DE_DETAIL,NOMBRE_CLIENT_FIN_CONTRAT,CROIS_BYOD,week_of_year,month,year
0,IPHONE 12,2022-11-12 00:00:00+00:00,1,85,0,108,2,882.0,115.0,0.2,45,11,2022
1,IPHONE 12,2022-11-19 00:00:00+00:00,2,58,0,109,2,882.0,140.0,0.2,46,11,2022
2,IPHONE 12,2022-11-26 00:00:00+00:00,3,108,0,110,2,882.0,119.0,0.2,47,11,2022
3,IPHONE 12,2022-12-03 00:00:00+00:00,4,257,0,111,2,882.0,116.0,0.2,48,12,2022
4,IPHONE 12,2022-12-10 00:00:00+00:00,5,235,0,112,2,882.0,128.0,0.2,49,12,2022


In [19]:
# Write the selected columns to CSV (without the index column), then print a
# confirmation message.
export_columns = [
    "model",
    "date",
    "SEMAINE_NUM",
    "sales",
    "is_promo",
    "product_age_weeks",
    "product_age_years",
    "PRIX_DE_DETAIL",
    "NOMBRE_CLIENT_FIN_CONTRAT",
    "CROIS_BYOD",
    "week_of_year",
    "month",
    "year",
]
df[export_columns].to_csv("../data/processed_sales.csv", index=False)
print("✅ Cleaned dataset saved to data/processed_sales.csv")

✅ Cleaned dataset saved to data/processed_sales.csv
