## REGRESSION

### Librairies utilisées

In [76]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet


### Fonctions utilisés dans le notebook

In [4]:
def alerteCombinaison(dictionary_param):
    """
    Cette fonction est utilisée pour calculer le nombre de combinaison
    défini des GridSearch pour optimiser les paramtètres des modèles.
    Elle prend en entrée un dictionnaire et retourne le nombre de combinaison
    """
    from numpy import prod
    lengths = [len(v) for v in dictionary_param.values()]
    count = prod(lengths)
    if count < 500:
        print("C'est bon tu peux envoyer l'apprentissage !")
    elif count < 1000:
        print("Tu peux lire tes mails pendant l'apprentissage !")
    elif count < 2000:
        print("Tu peux répondre à tes mails pendant l'apprentissage !")
    else:
        print("Tu peux mettre ton PC en veille et revenir demain !")

    print("Nombre de combinaison :")
    return(count)

### Importation des données

In [17]:
with open('dataframe2.pkl', 'rb') as file:
    df = pickle.load(file)

# Maintenant, df contient la DataFrame importée depuis le fichier
df

Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,Type de voie,Voie,Code postal,Commune,Code commune,Section,...,1er lot,Surface Carrez du 1er lot,Nombre de lots,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,col_concat,Moyenne Taux Chomage
0,1,04/01/2021,Vente,204332.0,ALL,DES ECUREUILS,01,BUELLAS,65,B,...,,,0,Maison,88.0,4.0,S,866.0,04/01/2021ALL7.00276DES ECUREUILS1310.0BUELLAS,6.10000
1,2,04/01/2021,Vente,226700.0,CHE,DU MOULIN DE POLAIZE,01,POLLIAT,301,AA,...,,,0,Maison,96.0,3.0,,,04/01/2021CHE173.00164DU MOULIN DE POLAIZE1310...,6.10000
2,1,08/01/2021,Vente,185000.0,RUE,DES GRANGES BONNET,01,PERONNAS,289,AD,...,,,0,Maison,100.0,4.0,S,703.0,08/01/2021RUE46.00161DES GRANGES BONNET1960.0P...,6.10000
3,1,07/01/2021,Vente,114500.0,RUE,DE LA MAIRIE,01,FOISSIAT,163,AB,...,,,0,Maison,85.0,2.0,S,87.0,07/01/2021RUE179.00110DE LA MAIRIE1340.0FOISSIAT,6.10000
4,1,08/01/2021,Vente,145000.0,IMP,DE CHAMANDRE,01,FOISSIAT,163,WC,...,,,0,Maison,92.0,1.0,S,2480.0,08/01/2021IMP8.00255DE CHAMANDRE1340.0FOISSIAT,6.10000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2765379,1,27/12/2018,Vente,1800.0,PL,DES VOSGES,75,PARIS 04,104,AO,...,109.0,,1,Appartement,20.0,2.0,,,27/12/2018PL9.09917DES VOSGES75004.0PARIS 04,6.63125
2765380,1,28/12/2018,Vente,405000.0,RUE,BEAUTREILLIS,75,PARIS 04,104,AQ,...,16.0,33.87,2,Appartement,34.0,1.0,,,28/12/2018RUE13.00797BEAUTREILLIS75004.0PARIS 04,6.63125
2765381,1,26/12/2018,Vente,220000.0,RUE,DES LIONS SAINT PAUL,75,PARIS 04,104,AQ,...,126.0,,2,Appartement,29.0,1.0,,,26/12/2018RUE14.05702DES LIONS SAINT PAUL75004...,6.63125
2765382,1,03/12/2018,Vente,383000.0,RUE,POISSONNIERE,75,PARIS 02,102,AO,...,9.0,34.78,1,Appartement,34.0,1.0,,,03/12/2018RUE12.07561POISSONNIERE75002.0PARIS 02,6.63125


### Echantillonage

In [18]:
# variables_explicatives = ['Type local', 'Nombre pieces principales', 'Surface reelle bati', 'Surface terrain',
#                           'Nombre de lots', 'Code commune', 'Nature mutation']
variables_explicatives = ['Type local', 'Nombre pieces principales', 'Surface reelle bati', 
                          'Surface terrain', 'Nombre de lots', 'Moyenne Taux Chomage']


X = df[variables_explicatives]
X = pd.get_dummies(data=X, columns=['Type local'])
X

Unnamed: 0,Nombre pieces principales,Surface reelle bati,Surface terrain,Nombre de lots,Moyenne Taux Chomage,Type local_Appartement,Type local_Dépendance,Type local_Local industriel. commercial ou assimilé,Type local_Maison
0,4.0,88.0,866.0,0,6.10000,False,False,False,True
1,3.0,96.0,,0,6.10000,False,False,False,True
2,4.0,100.0,703.0,0,6.10000,False,False,False,True
3,2.0,85.0,87.0,0,6.10000,False,False,False,True
4,1.0,92.0,2480.0,0,6.10000,False,False,False,True
...,...,...,...,...,...,...,...,...,...
2765379,2.0,20.0,,1,6.63125,True,False,False,False
2765380,1.0,34.0,,2,6.63125,True,False,False,False
2765381,1.0,29.0,,2,6.63125,True,False,False,False
2765382,1.0,34.0,,1,6.63125,True,False,False,False


In [19]:
# notre target
Y = df['Valeur fonciere']
Y

0          204332.0
1          226700.0
2          185000.0
3          114500.0
4          145000.0
             ...   
2765379      1800.0
2765380    405000.0
2765381    220000.0
2765382    383000.0
2765383     45000.0
Name: Valeur fonciere, Length: 2765384, dtype: float64

In [20]:
# on remplace les NA dans la colonne 'Nombre pieces principales' et dans 'Surface reelle bati'
# et dans 'Surface terrain' par leurs moyennes
X['Nombre pieces principales'].fillna(X['Nombre pieces principales'].mean(), inplace=True)
X['Surface reelle bati'].fillna(X['Surface reelle bati'].mean(), inplace=True)
X['Surface terrain'].fillna(X['Surface terrain'].mean(), inplace=True)
X['Nombre de lots'].fillna(X['Nombre de lots'].mean(), inplace=True)
X['Moyenne Taux Chomage'].fillna(X['Moyenne Taux Chomage'].mean(), inplace=True)

In [21]:
pourcentage_manquant = (X.isna().sum() / len(X)) * 100
pourcentage_manquant

Nombre pieces principales                              0.0
Surface reelle bati                                    0.0
Surface terrain                                        0.0
Nombre de lots                                         0.0
Moyenne Taux Chomage                                   0.0
Type local_Appartement                                 0.0
Type local_Dépendance                                  0.0
Type local_Local industriel. commercial ou assimilé    0.0
Type local_Maison                                      0.0
dtype: float64

In [22]:
from sklearn.model_selection import train_test_split
#70% des données pour l’apprentissage
#30% des données pour l'échantillon test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30, random_state = 42)

Pour ne pas donner plus d'importance aux variables explicatives à forte variance, il est essentiel de centrer et réduire les données en amont. On centre et réduit également afin de les ramener à la même échelle 

On centre et réduit les données d'apprentissage

In [51]:
# Appliquez la standardisation aux données de X
scaler = StandardScaler()
X_train_CR = scaler.fit_transform(X_train)
X_test_CR = scaler.fit_transform(X_test)
pd.DataFrame(X_train_CR, columns=X_test.columns).head(3)

Unnamed: 0,Nombre pieces principales,Surface reelle bati,Surface terrain,Nombre de lots,Moyenne Taux Chomage,Type local_Appartement,Type local_Dépendance,Type local_Local industriel. commercial ou assimilé,Type local_Maison
0,-0.449198,-0.120224,0.000918,0.426909,-0.627619,1.645697,-0.403605,-0.229434,-1.084017
1,0.57841,0.085773,0.194354,-0.695669,0.300481,-0.607645,-0.403605,-0.229434,0.922494
2,0.064606,0.24027,-0.119086,-0.695669,-0.518431,-0.607645,-0.403605,-0.229434,0.922494


### Régression linéaire multiple

#### Apprentissage

On lance l'apprentissage du modèle sur l'échantillon d'entrainement

In [48]:
y_train

1483835     80000.0
1211934    239100.0
2529944    125000.0
2760936     23000.0
1712318    177800.0
             ...   
110268     100000.0
1692743     81000.0
2356330    107000.0
2229084      8500.0
2219110     55000.0
Name: Valeur fonciere, Length: 1935768, dtype: float64

In [10]:
lm = LinearRegression()
model_LinearRegression = lm.fit(X_train_CR,y_train)

### Test

In [11]:
y_pred = model_LinearRegression.predict(X_test_CR)
y_pred

array([ 2.87033067e+10,  2.87033295e+10,  2.87033304e+10, ...,
       -2.76069744e+10,  2.87033533e+10, -2.76070289e+10])

In [12]:
#on multiplie par 0.3 pour avoir un résultat en metre
mean_squared_error(y_test, y_pred, squared=False)

60598593738.452576

On s'interesse aux coefficients de la régression

In [13]:
coef = pd.DataFrame(lm.coef_ ,index = X_train.columns, columns=['Coef'])
coef.loc['Constante'] = lm.intercept_
coef

Unnamed: 0,Coef
Nombre pieces principales,44674.7
Nombre de lots,7175.25
Surface reelle bati,3514.173
Surface terrain,2146.642
Type local_Appartement,59048790000000.0
Type local_Dépendance,46168030000000.0
Type local_Local industriel. commercial ou assimilé,28967320000000.0
Type local_Maison,66290500000000.0
Constante,164101.3


### Régression Ridge

#### Sans GridSearch

In [30]:
ridge_model = Ridge(alpha=10)
ridge_model = ridge_model.fit(X_train_CR,y_train)

y_pred = ridge_model.predict(X_test_CR)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

RMSE : 100692.43715790247


#### Avec GridSearch

In [62]:
parameters = {'alpha': np.arange(start = 0, stop = 10, step = 0.5)}

ridge_model = Ridge()
grid_ridge = GridSearchCV(ridge_model, parameters, scoring = 'neg_mean_squared_error')
grid_ridge.fit(pd.DataFrame(X_train_CR, columns=X_train.columns), y_train)
# grid_ridge.fit(X_train_CR, y_train)

In [64]:
print(pd.DataFrame(grid_ridge.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", grid_ridge.best_params_)
print("Meilleur score :", grid_ridge.best_score_)
y_pred = grid_ridge.best_estimator_.predict(X_test_CR)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

            params  mean_test_score
0   {'alpha': 0.0}    -1.013721e+10
1   {'alpha': 0.5}    -1.013721e+10
2   {'alpha': 1.0}    -1.013721e+10
3   {'alpha': 1.5}    -1.013721e+10
4   {'alpha': 2.0}    -1.013721e+10
5   {'alpha': 2.5}    -1.013721e+10
6   {'alpha': 3.0}    -1.013721e+10
7   {'alpha': 3.5}    -1.013721e+10
8   {'alpha': 4.0}    -1.013721e+10
9   {'alpha': 4.5}    -1.013721e+10
10  {'alpha': 5.0}    -1.013721e+10
11  {'alpha': 5.5}    -1.013721e+10
12  {'alpha': 6.0}    -1.013721e+10
13  {'alpha': 6.5}    -1.013721e+10
14  {'alpha': 7.0}    -1.013721e+10
15  {'alpha': 7.5}    -1.013721e+10
16  {'alpha': 8.0}    -1.013721e+10
17  {'alpha': 8.5}    -1.013721e+10
18  {'alpha': 9.0}    -1.013721e+10
19  {'alpha': 9.5}    -1.013721e+10
Meilleur paramètre : {'alpha': 9.5}
Meilleur score : -10137213138.263279
RMSE : 100692.43711051517




Coefficients du modèle

In [67]:
coef = pd.DataFrame(ridge_model.best_estimator_.coef_ ,
                    index = X_train.columns, columns=['Coef'])
coef.loc['Constante'] = ridge_model.best_estimator_.intercept_
coef

Unnamed: 0,Coef
Nombre pieces principales,44685.529907
Nombre de lots,7259.775261
Surface reelle bati,3547.56684
Surface terrain,1939.908889
Type local_Appartement,-2941.588973
Type local_Dépendance,2948.759634
Type local_Local industriel. commercial ou assimilé,10912.957287
Type local_Maison,-4202.114504
Constante,164101.304014


### Régression Lasso

In [31]:
lasso_model = Lasso(alpha=5)
lasso_model = lasso_model.fit(X_train_CR,y_train)

y_pred = lasso_model.predict(X_test_CR)

print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

RMSE : 100692.4351826682


In [16]:
coef = pd.DataFrame(lasso_model.coef_ ,
                    index = X_train.columns, columns=['Coef'])
coef.loc['Constante'] = lasso_model.intercept_
coef

Unnamed: 0,Coef
Nombre pieces principales,44632.33924
Nombre de lots,7128.452419
Surface reelle bati,3518.184656
Surface terrain,2140.86676
Type local_Appartement,-0.0
Type local_Dépendance,5221.526143
Type local_Local industriel. commercial ou assimilé,12364.257885
Type local_Maison,-641.661277
Constante,164101.304014


### Avec GridSearch

In [19]:
parameters = {'alpha': np.arange(start = 0, stop = 10, step = 1)}
print(alerteCombinaison(parameters))
lasso_model = Lasso()
lasso_model = GridSearchCV(lasso_model, parameters, scoring = 'r2', verbose = 2)
lasso_model.fit(X_train_CR, y_train)

C'est bon tu peux envoyer l'apprentissage !
Nombre de combinaison :
10
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=0; total time= 1.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=0; total time= 1.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=0; total time= 1.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(


Lasso était beaucoup trop long, et faisait même planter mon PC.

### Régression Elasticnet

#### Sans GridsearchCV

In [19]:
elastic_model = ElasticNet(alpha=5, l1_ratio=0.2)
elastic_model = elastic_model.fit(X_train_CR,y_train)

y_pred = elastic_model.predict(X_test_CR)

print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

RMSE : 104905.91176840823


#### Avec GridSearch

In [67]:
parameters = {'alpha' : np.arange(0,10,1),
             'l1_ratio' : np.arange(0,1,0.2)}

elastic_model = ElasticNet()
grid_elasticnet = GridSearchCV(elastic_model, parameters, scoring = 'neg_mean_squared_error', verbose=1, cv=2)
grid_elasticnet.fit(pd.DataFrame(X_train_CR, columns=X_train.columns), y_train)
# grid_ridge.fit(X_train_CR, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_f

In [69]:
print(pd.DataFrame(grid_elasticnet.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", grid_elasticnet.best_params_)
print("Meilleur score :", grid_elasticnet.best_score_)
y_pred = grid_elasticnet.best_estimator_.predict(X_test_CR)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

                                          params  mean_test_score
0                  {'alpha': 0, 'l1_ratio': 0.0}    -1.013666e+10
1                  {'alpha': 0, 'l1_ratio': 0.2}    -1.013666e+10
2                  {'alpha': 0, 'l1_ratio': 0.4}    -1.013666e+10
3   {'alpha': 0, 'l1_ratio': 0.6000000000000001}    -1.013666e+10
4                  {'alpha': 0, 'l1_ratio': 0.8}    -1.013666e+10
5                  {'alpha': 1, 'l1_ratio': 0.0}    -1.054198e+10
6                  {'alpha': 1, 'l1_ratio': 0.2}    -1.048324e+10
7                  {'alpha': 1, 'l1_ratio': 0.4}    -1.041489e+10
8   {'alpha': 1, 'l1_ratio': 0.6000000000000001}    -1.033274e+10
9                  {'alpha': 1, 'l1_ratio': 0.8}    -1.023094e+10
10                 {'alpha': 2, 'l1_ratio': 0.0}    -1.075370e+10
11                 {'alpha': 2, 'l1_ratio': 0.2}    -1.068124e+10
12                 {'alpha': 2, 'l1_ratio': 0.4}    -1.059362e+10
13  {'alpha': 2, 'l1_ratio': 0.6000000000000001}    -1.048328e+10
14        



### Arbre de décision

In [46]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(random_state=42)
tree_regressor.fit(X_train_CR, y_train)
y_pred = tree_regressor.predict(X_test_CR)
mse = mean_squared_error(y_test, y_pred)
# Affichez le MSE pour évaluer la performance du modèle
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 12777387809.924845


#### Avec GridSearch

In [71]:
from sklearn.tree import DecisionTreeRegressor

tree_regressor = DecisionTreeRegressor()
param_grid = {
    'max_depth': [None, 10, 20, 30],  # Profondeur maximale de l'arbre
    'min_samples_split': [2, 5, 10],  # Nombre minimal d'échantillons requis pour diviser un nœud
    'min_samples_leaf': [1, 2, 4]  # Nombre minimal d'échantillons requis dans une feuille
}
grid_tree_regressor = GridSearchCV(tree_regressor, param_grid=param_grid, scoring="neg_mean_squared_error")

grid_tree_regressor.fit(pd.DataFrame(X_train_CR, columns=X_train.columns), y_train)

In [75]:
# print(pd.DataFrame(tree_regressor.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", grid_tree_regressor.best_params_)
print("Meilleur score :", grid_tree_regressor.best_score_)

y_pred = grid_tree_regressor.predict(pd.DataFrame(X_test_CR, columns=X_test.columns))
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))


Meilleur paramètre : {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}
Meilleur score : -6680644535.062864
RMSE : 84197.18561422486


### Random forest

In [56]:
random_forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_regressor.fit(X_train, y_train)
y_pred = random_forest_regressor.predict(X_test)

                                               params  mean_test_score
0   {'max_depth': None, 'min_samples_leaf': 1, 'mi...        -0.054894
1   {'max_depth': None, 'min_samples_leaf': 1, 'mi...        -0.009273
2   {'max_depth': None, 'min_samples_leaf': 1, 'mi...         0.046871
3   {'max_depth': None, 'min_samples_leaf': 2, 'mi...         0.022037
4   {'max_depth': None, 'min_samples_leaf': 2, 'mi...         0.028152
5   {'max_depth': None, 'min_samples_leaf': 2, 'mi...         0.064082
6   {'max_depth': None, 'min_samples_leaf': 4, 'mi...         0.083782
7   {'max_depth': None, 'min_samples_leaf': 4, 'mi...         0.083764
8   {'max_depth': None, 'min_samples_leaf': 4, 'mi...         0.088451
9   {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.187218
10  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.187263
11  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.187304
12  {'max_depth': 10, 'min_samples_leaf': 2, 'min_...         0.187289
13  {'

In [77]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
random_forest_regressor = RandomForestRegressor()
grid_random_forest = GridSearchCV(random_forest_regressor, param_grid=param_grid, scoring="neg_mean_squared_error")
grid_random_forest.fit(pd.DataFrame(X_train_CR, columns=X_train.columns), y_train)

KeyboardInterrupt: 

Le modèle a tourné pendant plus de 2h sans résultat.

In [None]:
# print(pd.DataFrame(tree_regressor.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", grid_random_forest.best_params_)
print("Meilleur score :", grid_random_forest.best_score_)

y_pred = grid_random_forest.predict(pd.DataFrame(X_test_CR, columns=X_test.columns))
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))


# Kaggle

In [20]:
!kaggle datasets list

ref                                                        title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
iamsouravbanerjee/customer-shopping-trends-dataset         Customer Shopping Trends Dataset                  146KB  2023-10-05 06:45:37           4688        116  1.0              
nelgiriyewithana/top-spotify-songs-2023                    Most Streamed Spotify Songs 2023                   47KB  2023-08-26 11:04:57          31838        961  1.0              
nelgiriyewithana/credit-card-fraud-detection-dataset-2023  Credit Card Fraud Detection Dataset 2023          143MB  2023-09-18 10:00:19           5456        185  1.0              
nelgiriyewithana/billionaires-statistics-dataset           Billionaires Statistics Dataset (202

In [21]:
!kaggle competitions download -c m2-sise-2023

Downloading m2-sise-2023.zip to c:\Users\bourh\Cours\CoursM2SISE\Machine_Learning_Python\ProjetPython




  0%|          | 0.00/295M [00:00<?, ?B/s]
  0%|          | 1.00M/295M [00:00<03:25, 1.50MB/s]
  1%|          | 3.00M/295M [00:00<01:05, 4.69MB/s]
  2%|▏         | 6.00M/295M [00:00<00:32, 9.33MB/s]
  3%|▎         | 10.0M/295M [00:01<00:19, 15.7MB/s]
  5%|▍         | 14.0M/295M [00:01<00:15, 19.0MB/s]
  6%|▌         | 18.0M/295M [00:01<00:12, 23.7MB/s]
  7%|▋         | 21.0M/295M [00:01<00:12, 22.8MB/s]
  8%|▊         | 24.0M/295M [00:01<00:11, 24.5MB/s]
  9%|▉         | 27.0M/295M [00:01<00:12, 22.0MB/s]
 11%|█         | 31.0M/295M [00:01<00:13, 20.6MB/s]
 12%|█▏        | 35.0M/295M [00:02<00:11, 24.1MB/s]
 13%|█▎        | 38.0M/295M [00:02<00:12, 21.4MB/s]
 14%|█▍        | 41.0M/295M [00:02<00:16, 16.2MB/s]
 15%|█▍        | 43.0M/295M [00:02<00:21, 12.6MB/s]
 15%|█▌        | 45.0M/295M [00:03<00:23, 11.0MB/s]
 17%|█▋        | 49.0M/295M [00:03<00:16, 15.5MB/s]
 18%|█▊        | 52.0M/295M [00:03<00:15, 16.5MB/s]
 19%|█▊        | 55.0M/295M [00:03<00:13, 19.0MB/s]
 20%|█▉        | 59.

In [28]:
import zipfile
with zipfile.ZipFile("./content/m2-sise-2023.zip", 'r') as zip_ref:
    zip_ref.extractall("./content")
    