# Prédiction de la consommation en carburant de voitures récentes.


### Rouler cette cellule pour installer les packages necessaires

In [45]:
using Pkg

Pkg.add([
    "CSV",                # Pour charger les fichiers CSV
    "DataFrames",         # Pour utiliser la structure DataFrame pour stocker les données
    "Distributions",      # Pour utiliser les lois de probabilités classiques
    "Gadfly",             # Pour tracer des graphiques
    "Statistics",         # Pour les fonctions statistiques de base
    "LinearAlgebra",
    "Random"
])

[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m ManualMemory ───────────────────── v0.1.8
[32m[1m   Installed[22m[39m CloseOpenIntervals ─────────────── v0.1.13
[32m[1m   Installed[22m[39m SLEEFPirates ───────────────────── v0.6.43
[32m[1m   Installed[22m[39m CPUSummary ─────────────────────── v0.2.6
[32m[1m   Installed[22m[39m VectorizationBase ──────────────── v0.21.71
[32m[1m   Installed[22m[39m ForceImport ────────────────────── v0.0.3
[32m[1m   Installed[22m[39m ChainRules ─────────────────────── v1.72.1
[32m[1m   Installed[22m[39m IfElse ─────────────────────────── v0.1.1
[32m[1m   Installed[22m[39m StructArrays ───────────────────── v0.6.18
[32m[1m   Installed[22m[39m LayoutPointers ─────────────────── v0.1.17
[32m[1m   Installed[22m[39m HostCPUFeatures ────────────────── v0.1.17
[32m[1m   Installed[22m[39m BitTwiddlingConvenienceFunctions ─ v0.1.6
[32m[1m   Installed[22m[39m PolyesterWeave ─────

### Importation de librairies

In [467]:
using CSV              # Pour charger les fichiers CSV
using DataFrames       # Pour utiliser la structure DataFrame pour stocker les données
using Distributions    # Pour utiliser les lois de probabilités classiques
using Gadfly           # Pour tracer des graphiques
using Statistics       # Pour les functions statistiques de base
using LinearAlgebra
using Random

### Chargement des données train et test

In [468]:
train_data = CSV.read("train.csv", DataFrame);
test_data = CSV.read("test.csv", DataFrame);

In [469]:
first(train_data, 10)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite,consommation
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15,String31
1,2023,voiture_moyenne,8,44,integrale,automatique,138358823529412
2,2020,VUS_petit,4,2,integrale,automatique,980041666666667
3,2021,voiture_compacte,6,33,propulsion,automatique,117605
4,2023,voiture_deux_places,8,5,integrale,automatique,130672222222222
5,2022,voiture_moyenne,8,44,integrale,automatique,138358823529412
6,2022,voiture_moyenne,8,44,integrale,automatique,138358823529412
7,2022,voiture_minicompacte,3,15,traction,automatique,73503125
8,2024,voiture_minicompacte,3,15,traction,manuelle,758741935483871
9,2020,VUS_standard,6,38,integrale,automatique,112004761904762
10,2019,voiture_compacte,6,33,propulsion,automatique,117605


In [470]:
first(test_data, 10)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15
1,2014,voiture_moyenne,4,25,traction,manuelle
2,2014,voiture_moyenne,4,25,traction,automatique
3,2014,VUS_petit,4,25,traction,automatique
4,2014,VUS_petit,4,2,4x4,automatique
5,2014,voiture_sous_compacte,8,58,propulsion,manuelle
6,2014,voiture_sous_compacte,8,5,propulsion,automatique
7,2014,voiture_sous_compacte,8,5,propulsion,manuelle
8,2014,VUS_petit,4,24,4x4,automatique
9,2014,VUS_petit,6,35,integrale,automatique
10,2014,voiture_deux_places,10,52,integrale,manuelle


In [471]:
# Transformer les colonnes supposer etre numeric de String en Float pour les données train et test
train_data.cylindree = parse.(Float64, replace.(train_data.cylindree, "," => "."));
train_data.consommation = parse.(Float64, replace.(train_data.consommation, "," => "."));
test_data.cylindree = parse.(Float64, replace.(test_data.cylindree, "," => "."));

In [472]:
categorical_columns = [:type, :transmission, :boite]
train_data = select(train_data, Not(categorical_columns))
test_data = select(test_data, Not(categorical_columns))

Row,annee,nombre_cylindres,cylindree
Unnamed: 0_level_1,Int64,Int64,Float64
1,2014,4,2.5
2,2014,4,2.5
3,2014,4,2.5
4,2014,4,2.0
5,2014,8,5.8
6,2014,8,5.0
7,2014,8,5.0
8,2014,4,2.4
9,2014,6,3.5
10,2014,10,5.2


In [473]:
function feature_engineering!(df)
    df[!, :cylindree_squared] = df[!, :cylindree] .^ 2
    df[!, :annee_squared] = df[!, :annee] .^ 2
    df[!, :log_cylindree] = log.(df[!, :cylindree] .+ 1e-6)
    df[!, :cylindree_x_nombre_cylindres] = df[!, :cylindree] .* df[!, :nombre_cylindres]
end

feature_engineering! (generic function with 1 method)

In [474]:
feature_engineering!(train_data)
feature_engineering!(test_data)

150-element Vector{Float64}:
 10.0
 10.0
 10.0
  8.0
 46.4
 40.0
 40.0
  9.6
 21.0
 52.0
 52.0
 70.80000000000001
  8.0
  ⋮
  6.4
 19.799999999999997
 40.0
  8.0
  8.0
  6.4
  8.0
  8.0
 18.0
 18.0
  4.5
  8.0

In [475]:
println("\nSummary statistics of the training dataset:")
describe(train_data)


Summary statistics of the training dataset:


Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Real,Float64,Real,Int64,DataType
1,annee,2019.19,2014.0,2019.0,2024.0,0,Int64
2,nombre_cylindres,5.28535,3.0,4.0,12.0,0,Int64
3,cylindree,2.90303,1.2,2.5,6.8,0,Float64
4,consommation,10.3568,4.52327,10.2265,16.8007,0,Float64
5,cylindree_squared,9.88081,1.44,6.25,46.24,0,Float64
6,annee_squared,4077160.0,4056196.0,4076360.0,4096576.0,0,Int64
7,log_cylindree,0.98687,0.182322,0.916291,1.91692,0,Float64
8,cylindree_x_nombre_cylindres,17.3306,3.6,10.0,72.0,0,Float64


In [476]:
println("\nChecking for missing values in training data:")
println(any.(ismissing, eachcol(train_data)))


Checking for missing values in training data:
Bool[0, 0, 0, 0, 0, 0, 0, 0]


In [477]:
println("\nChecking for missing values in test data:")
println(any.(ismissing, eachcol(test_data)))


Checking for missing values in test data:
Bool[0, 0, 0, 0, 0, 0, 0]


In [478]:
function normalize_col!(df, col)
    df[!, col] = (df[!, col] .- mean(df[!, col])) ./ std(df[!, col])
end

normalize_col! (generic function with 1 method)

In [479]:
train_columns_to_normalize = [:annee, :nombre_cylindres, :cylindree, :cylindree_squared, :annee_squared, :log_cylindree, :cylindree_x_nombre_cylindres]

test_columns_to_normalize = names(test_data)

for col in train_columns_to_normalize
    println("Normalizing train column: $col")
    normalize_col!(train_data, col)
end

for col in test_columns_to_normalize
    println("Normalizing test column: $col")
    normalize_col!(test_data, col)
end

Normalizing train column: annee
Normalizing train column: nombre_cylindres
Normalizing train column: cylindree
Normalizing train column: cylindree_squared
Normalizing train column: annee_squared
Normalizing train column: log_cylindree
Normalizing train column: cylindree_x_nombre_cylindres
Normalizing test column: annee
Normalizing test column: nombre_cylindres
Normalizing test column: cylindree
Normalizing test column: cylindree_squared
Normalizing test column: annee_squared
Normalizing test column: log_cylindree
Normalizing test column: cylindree_x_nombre_cylindres


In [480]:
first(train_data, 5)

Row,annee,nombre_cylindres,cylindree,consommation,cylindree_squared,annee_squared,log_cylindree,cylindree_x_nombre_cylindres
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.18235,1.55182,1.24022,13.8359,1.10244,1.18275,1.26218,1.35681
2,0.250279,-0.734771,-0.748147,9.80042,-0.683947,0.249551,-0.749349,-0.708458
3,0.560971,0.408527,0.328884,11.7605,0.117371,0.560465,0.528238,0.187502
4,1.18235,1.55182,1.73731,13.0672,1.75839,1.18275,1.58831,1.72126
5,0.871662,1.55182,1.24022,13.8359,1.10244,0.871532,1.26218,1.35681


### Ne pas rouler les deux prochaines cellules (pour plutard)

In [481]:
function train_val_split(data, val_ratio=0.2, seed=42)
    Random.seed!(seed)  # Set seed for reproducibility
    n = size(data, 1)  # Total number of rows
    indices = shuffle(1:n)  # Shuffle row indices
    
    # Compute split index
    split_idx = round(Int, (1 - val_ratio) * n)
    
    # Split data into training and validation sets
    train_indices = indices[1:split_idx]
    val_indices = indices[split_idx+1:end]
    
    train_set = data[train_indices, :]
    val_set = data[val_indices, :]
    
    return train_set, val_set
end

train_val_split (generic function with 3 methods)

In [482]:
train_split, val_split = train_val_split(train_data, val_ratio=0.2)

LoadError: MethodError: no method matching train_val_split(::DataFrame; val_ratio=0.2)
[0mClosest candidates are:
[0m  train_val_split(::Any) at In[481]:1[91m got unsupported keyword argument "val_ratio"[39m
[0m  train_val_split(::Any, [91m::Any[39m) at In[481]:1[91m got unsupported keyword argument "val_ratio"[39m
[0m  train_val_split(::Any, [91m::Any[39m, [91m::Any[39m) at In[481]:1[91m got unsupported keyword argument "val_ratio"[39m

### --------------------------------------------------------------------------------------------------------------------------

In [483]:
X_train_full = Matrix(select(train_data, Not(:consommation)))
y_train_full = train_data[:, :consommation]
X_test = Matrix(test_data)

150×7 Matrix{Float64}:
 -1.53728  -0.769722  -0.372358    …  -1.53609  -0.222061  -0.576919
 -1.53728  -0.769722  -0.372358       -1.53609  -0.222061  -0.576919
 -1.53728  -0.769722  -0.372358       -1.53609  -0.222061  -0.576919
 -1.53728  -0.769722  -0.739816       -1.53609  -0.753875  -0.713017
 -1.53728   1.45063    2.05287        -1.53609   1.78363    1.90006
 -1.53728   1.45063    1.46493     …  -1.53609   1.42991    1.46455
 -1.53728   1.45063    1.46493        -1.53609   1.42991    1.46455
 -1.53728  -0.769722  -0.44585        -1.53609  -0.319351  -0.604138
 -1.53728   0.340454   0.362559       -1.53609   0.579848   0.171619
 -1.53728   2.56081    1.61192        -1.53609   1.52338    2.28114
 -1.53728   2.56081    1.61192     …  -1.53609   1.52338    2.28114
 -1.53728   3.67098    2.12636        -1.53609   1.82437    3.56046
 -1.53728  -0.769722  -0.739816       -1.53609  -0.753875  -0.713017
  ⋮                                ⋱             ⋮         
  1.48798  -0.769722  -1.0

In [484]:
function bayesian_ridge_regression(X, y, alpha, lambda)
    n, p = size(X)
    I_matrix = Matrix(I, p, p)
    Sigma = inv(X' * X + (alpha / lambda) * I_matrix)
    mu = Sigma * X' * y
    return mu, Sigma
end

bayesian_ridge_regression (generic function with 3 methods)

In [485]:
function add_intercept(X)
    return hcat(ones(size(X, 1)), X)
end

add_intercept (generic function with 1 method)

In [486]:
X_train_full = add_intercept(X_train_full)
X_test = add_intercept(X_test)

150×8 Matrix{Float64}:
 1.0  -1.53728  -0.769722  -0.372358    …  -1.53609  -0.222061  -0.576919
 1.0  -1.53728  -0.769722  -0.372358       -1.53609  -0.222061  -0.576919
 1.0  -1.53728  -0.769722  -0.372358       -1.53609  -0.222061  -0.576919
 1.0  -1.53728  -0.769722  -0.739816       -1.53609  -0.753875  -0.713017
 1.0  -1.53728   1.45063    2.05287        -1.53609   1.78363    1.90006
 1.0  -1.53728   1.45063    1.46493     …  -1.53609   1.42991    1.46455
 1.0  -1.53728   1.45063    1.46493        -1.53609   1.42991    1.46455
 1.0  -1.53728  -0.769722  -0.44585        -1.53609  -0.319351  -0.604138
 1.0  -1.53728   0.340454   0.362559       -1.53609   0.579848   0.171619
 1.0  -1.53728   2.56081    1.61192        -1.53609   1.52338    2.28114
 1.0  -1.53728   2.56081    1.61192     …  -1.53609   1.52338    2.28114
 1.0  -1.53728   3.67098    2.12636        -1.53609   1.82437    3.56046
 1.0  -1.53728  -0.769722  -0.739816       -1.53609  -0.753875  -0.713017
 ⋮                   

In [487]:
function rmse(y_true, y_pred)
    return sqrt(mean((y_true .- y_pred).^2))
end

rmse (generic function with 1 method)

In [488]:
function grid_search_lambda(X, y, lambdas, alpha=1.0)
    Random.seed!(42)
    n = size(X, 1)
    shuffle_indices = shuffle(1:n)
    train_indices = shuffle_indices[1:round(Int, 0.8 * n)]
    val_indices = shuffle_indices[round(Int, 0.8 * n)+1:end]
    
    X_train, X_val = X[train_indices, :], X[val_indices, :]
    y_train, y_val = y[train_indices], y[val_indices]
    
    best_lambda = nothing
    best_rmse = Inf
    
    for lambda in lambdas
        weights, _ = bayesian_ridge_regression(X_train, y_train, alpha, lambda)
        y_val_pred = X_val * weights
        current_rmse = rmse(y_val, y_val_pred)
        
        println("Lambda: $lambda, Validation RMSE: $current_rmse")
        
        if current_rmse < best_rmse
            best_rmse = current_rmse
            best_lambda = lambda
        end
    end
    
    return best_lambda, best_rmse
end

grid_search_lambda (generic function with 2 methods)

In [489]:
lambda_values = 10.0 .^ (-3:3)  # 0.001 à 1000
best_lambda, best_rmse = grid_search_lambda(X_train_full, y_train, lambda_values)

Lambda: 0.001, Validation RMSE: 7.937504325040606
Lambda: 0.01, Validation RMSE: 2.6235362347281965
Lambda: 0.1, Validation RMSE: 0.904471676100136
Lambda: 1.0, Validation RMSE: 0.853522053104985
Lambda: 10.0, Validation RMSE: 0.8577469274709668
Lambda: 100.0, Validation RMSE: 0.8680024584975009
Lambda: 1000.0, Validation RMSE: 0.8694264565852825


(1.0, 0.853522053104985)

In [490]:
println("Best Lambda: $best_lambda, Best Validation RMSE: $best_rmse")

Best Lambda: 1.0, Best Validation RMSE: 0.853522053104985


In [491]:
weights, _ = bayesian_ridge_regression(X_train_full, y_train, 1.0, best_lambda)

([10.330713243212639, -0.03733339564915882, 0.5003837042462584, -0.3402720426078439, 0.13697892615428597, -0.010830104443310382, 1.2724296802188235, 0.32585206973898045], [0.0025188916876574307 -2.3291916895483433e-14 … -5.337265300495235e-17 7.674454250901745e-18; -2.329191689548344e-14 0.5006308879002168 … 0.0013446657220853452 0.0007385451335859085; … ; -5.3372653004938805e-17 0.0013446657220850412 … 0.16257729416520716 0.06113710282925697; 7.674454250828926e-18 0.0007385451335874677 … 0.061137102829258175 0.2786957340061925])

In [492]:
function predict(X, weights)
    return X * weights
end

predict (generic function with 1 method)

In [493]:
y_test_pred = predict(X_test, weights)

150-element Vector{Float64}:
  9.613132669680718
  9.613132669680718
  9.613132669680718
  8.986694097962474
 13.628597287744025
 13.119796092522813
 13.119796092522813
  9.498848057404205
 11.264008046993158
 14.037911932852102
 14.037911932852102
 15.32332479515742
  8.986694097962474
  ⋮
  8.209370561809433
 10.944880431908869
 12.95789552469071
  8.824793530130368
  8.824793530130368
  8.19317239036207
  8.824793530130368
  8.824793530130368
 10.649195317095266
 10.649195317095266
  7.698382586653902
  8.824793530130368

In [494]:
output = DataFrame(id = 1:size(y_test_pred, 1), consommation = y_test_pred)
CSV.write("final_predictions.csv", output)
println("Predictions have been saved to 'final_predictions.csv'")

Predictions have been saved to 'final_predictions.csv'
