# 1. Importation de packages

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# 2. Chargement des données

In [19]:
# Read the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
csv_path = 'Data/studentscores.csv'
data = pd.read_csv(csv_path)

In [20]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
data.head()

Unnamed: 0,Hours,Scores
0,2.5,21
1,5.1,47
2,3.2,27
3,8.5,75
4,3.5,30


# 3. EDA : Exploratory Data Analysis

In [21]:
# (rows, columns) of the loaded frame.
data.shape

(25, 2)

In [22]:
# Summary of column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Hours   25 non-null     float64
 1   Scores  25 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 528.0 bytes


In [23]:
# Count missing values per column; all zeros means no imputation is needed.
data.isnull().sum()

Hours     0
Scores    0
dtype: int64

# 4. Division des données en X et y

In [24]:
# Quick reminder of the column layout before splitting into X and y.
data.head(2)

Unnamed: 0,Hours,Scores
0,2.5,21
1,5.1,47


In [25]:
# Select feature and target by column name rather than by position, so an
# extra or reordered column in the CSV (e.g. a saved index column) cannot
# silently change which data the models are trained on.
# X keeps a 2-D shape (n_samples, 1) via the double brackets, as scikit-learn
# estimators expect; y is a 1-D array of targets.
X = data[['Hours']].values
y = data['Scores'].values
X

array([[2.5],
       [5.1],
       [3.2],
       [8.5],
       [3.5],
       [1.5],
       [9.2],
       [5.5],
       [8.3],
       [2.7],
       [7.7],
       [5.9],
       [4.5],
       [3.3],
       [1.1],
       [8.9],
       [2.5],
       [1.9],
       [6.1],
       [7.4],
       [2.7],
       [4.8],
       [3.8],
       [6.9],
       [7.8]])

In [26]:
# Display the target vector built from the second column of `data`.
y

array([21, 47, 27, 75, 30, 20, 88, 60, 81, 25, 85, 62, 41, 42, 17, 95, 30,
       24, 67, 69, 30, 54, 35, 76, 86])

In [28]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=52,
)

In [34]:
# Full dataset shape, for comparison with the train/test split sizes.
data.shape

(25, 2)

In [35]:
# Report the shape of each piece of the train/test split, one line per array.
for label, part in (
    ("Shape de X-train", X_train),
    ("Shape de X-test", X_test),
    ("Shape de y-train", y_train),
    ("Shape de y-test", y_test),
):
    print(label, part.shape)

Shape de X-train (18, 1)
Shape de X-test (7, 1)
Shape de y-train (18,)
Shape de y-test (7,)


# 5. Construction des modèles

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## 5.1 Linear Regression

In [38]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
modele_lineaire_lr = LinearRegression()

In [40]:
# Fit the linear model on the training split only; the test split stays unseen.
modele_lineaire_lr.fit(X_train, y_train)

In [42]:
# True scores of the held-out rows, shown for comparison with the predictions.
y_test

array([60, 25, 67, 17, 30, 41, 27])

In [44]:
# Predict scores for the held-out rows with the fitted linear model.
y_scores_predits = modele_lineaire_lr.predict(X_test)

In [51]:
# Display the linear model's predictions for the test rows.
y_scores_predits

array([56.82526316, 30.18105263, 62.53473684, 14.95578947, 37.79368421,
       47.30947368, 34.93894737])

### Prédiction simple

In [56]:
# Predict the score for a single value of 2.5; predict() expects a 2-D input,
# hence the nested list (one sample, one feature).
single_sample = [[2.5]]
single_prediction = modele_lineaire_lr.predict(single_sample)
print("Simple Prediction:", single_prediction)

Simple Prediction: [28.27789474]


### Proof: score = b0 + b1 × hours

In [66]:
# Rebuild the prediction by hand from the fitted line: score = b0 + b1 * hours.
b0 = modele_lineaire_lr.intercept_  # intercept of the fitted line
b1 = modele_lineaire_lr.coef_       # coefficient array, so Score is an array too
Score = b0 + b1 * 2.5
# Make the "proof" executable instead of relying on comparing two printed
# numbers by eye: the manual value must match predict() for the same input.
assert np.allclose(Score, modele_lineaire_lr.predict([[2.5]]))
print("Score:", Score)

Score: [28.27789474]


In [43]:
from sklearn.metrics import r2_score

In [45]:
# R² of the linear model on the held-out test rows.
r2_score(y_test, y_scores_predits)

0.8944891676032258

## 5.2 Decision Tree

In [47]:
# Decision-tree regressor trained and evaluated on the same split as the
# linear model. The variable keeps the notebook's `modele_lineaire_` prefix
# for consistency, although a tree is not a linear model.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn's tree builder uses randomness when choosing splits; fixing it
# makes the fit deterministic across re-runs.
modele_lineaire_dt = DecisionTreeRegressor(random_state=52)
modele_lineaire_dt.fit(X_train, y_train)
y_scores_predits_dt = modele_lineaire_dt.predict(X_test)

In [48]:
# R² of the decision tree on the held-out test rows.
r2_score(y_test, y_scores_predits_dt)

0.6401825258354583

## 5.3 Random Forest

In [49]:
# Random-forest regressor trained and evaluated on the same split as the
# other models. The variable keeps the notebook's `modele_lineaire_` prefix
# for consistency, although a forest is not a linear model.
# A random forest is stochastic (bootstrap sampling of the training rows), so
# without a fixed random_state its R² changes on every run. The seed matches
# the one used for the train/test split.
# NOTE: any score recorded from an unseeded run must be refreshed by re-running.
modele_lineaire_rf = RandomForestRegressor(random_state=52)
modele_lineaire_rf.fit(X_train, y_train)
y_scores_predits_rf = modele_lineaire_rf.predict(X_test)

In [50]:
# R² of the random forest on the held-out test rows.
r2_score(y_test, y_scores_predits_rf)

0.8228036777781507