In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
cols = {
    'longitude': str,
    'latitude': str,
    'housing_median_age': str,
    'total_rooms': str,
    'total_bedrooms': str,
    'population': str,
    'households': str,
    'median_income': str,
    'ocean_proximity': str,
    'median_house_value' : str
}

In [3]:
df = pd.read_csv('../data/1553768847-housing.csv',dtype=cols )

In [4]:
df.shape

(20640, 10)

In [5]:
df.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
13224,-117.68,34.03,16,2859,668,1946,591,3.0396,INLAND,124300
16486,-121.05,38.14,19,3326,561,1544,511,2.9875,INLAND,166300
7783,-118.06,33.91,36,1360,271,909,275,4.6731,<1H OCEAN,173300
9400,-122.56,37.91,52,1972,327,755,345,7.1924,NEAR BAY,500001
6894,-118.11,34.05,23,3436,565,1729,529,5.9941,<1H OCEAN,266700


In [5]:
label_encoder = LabelEncoder()
df['ocean_proximity'] = label_encoder.fit_transform(df['ocean_proximity'])

df['total_bedrooms'].fillna(0, inplace=True)

In [7]:
df.isnull().mean(axis=0).sort_values(ascending=False)

longitude             0.0
latitude              0.0
housing_median_age    0.0
total_rooms           0.0
total_bedrooms        0.0
population            0.0
households            0.0
median_income         0.0
ocean_proximity       0.0
median_house_value    0.0
dtype: float64

In [6]:
cols = [
    'longitude', 
    'latitude', 
    'housing_median_age', 
    'total_rooms', 
    'total_bedrooms', 
    'population', 
    'households', 
    'median_income', 
    'ocean_proximity', 
    'median_house_value'
]

In [9]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'median_house_value'],
      dtype='object')

In [10]:
df.describe(include=['int64', 'float64'])

Unnamed: 0,ocean_proximity
count,20640.0
mean,1.165843
std,1.420662
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,4.0


In [7]:
df.select_dtypes(include='number').columns

Index(['ocean_proximity'], dtype='object')

In [9]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [10]:

cols = ['total_rooms', 'housing_median_age']
df = df.dropna(subset=cols).reset_index(drop=True)

In [11]:
X, y = df[cols].values, df['median_house_value'].values

In [61]:

# Split the data into training and testing sets
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure valid column names
X_train.columns = [str(col) for col in X_train.columns]

# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)

# Hyperparameters grid search
param_grid = {
    'eta': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 6, 9],
    'lambda': [1, 5]
}

cv_results = []

for eta in param_grid['eta']:
    for max_depth in param_grid['max_depth']:
        for lam in param_grid['lambda']:
            params = {
                'objective': 'reg:squarederror',
                'eval_metric': 'rmse',
                'eta': eta,
                'max_depth': max_depth,
                'alpha': 0.1,
                'lambda': lam
            }

            cv_model = xgb.cv(
                params=params,
                dtrain=dtrain,
                num_boost_round=100,
                nfold=5,
                early_stopping_rounds=10,
                verbose_eval=False
            )

            rmse = cv_model['test-rmse-mean'].iloc[-1]
            cv_results.append({'eta': eta, 'max_depth': max_depth, 'lambda': lam, 'rmse': rmse})

# View the results
cv_results_df = pd.DataFrame(cv_results)
print(cv_results_df)

# Choose the best hyperparameters
best_params = cv_results_df.loc[cv_results_df['rmse'].idxmin()]

# Train the final model
final_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': best_params['eta'],
    'max_depth': int(best_params['max_depth']),
    'alpha': 0.1,
    'lambda': best_params['lambda']
}

final_model = xgb.train(final_params, dtrain, num_boost_round=100)

# Ensure valid column names for the test set
X_test.columns = [str(col) for col in X_test.columns]

# Evaluate the model on the test set
dtest = xgb.DMatrix(X_test)

predictions = final_model.predict(dtest)
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f'Root Mean Squared Error on test set: {rmse}')


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:longitude: object, latitude: object, housing_median_age: object, total_rooms: object, total_bedrooms: object, population: object, households: object, median_income: object

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Définir les hyperparamètres
params = {
    'eta': 0.2,
    'max_depth': 9,
    'lambda': 5,
    'objective': 'reg:squarederror'
}

# Entraîner le modèle XGBoost
num_boost_round = 100
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)

# Évaluer le modèle sur l'ensemble de test
dtest = xgb.DMatrix(data=X_test, label=y_test)
predictions = model.predict(dtest)
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f'Root Mean Squared Error on test set: {rmse}')

Root Mean Squared Error on test set: 117507.77326384487


In [14]:
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)


In [22]:
from sklearn.linear_model import LinearRegression

In [31]:
model = LinearRegression()

In [24]:
model.fit(X_train, y_train)

In [25]:
model.fit(X, y)

In [26]:
s = joblib.dump(model, '../../src/models/linear_regression.joblib')

In [27]:
np.array([[1, 2]])

array([[1, 2]])

## Save the model

##### Train on all data

In [57]:
from xgboost import XGBRegressor
import joblib

modelbis = XGBRegressor(colsample_bytree = 0.7, learning_rate= 0.03, max_depth= 10, min_child_weight = 5, n_estimators = 300, nthread= 4, subsample= 0.7)

X, y = df[cols].values, df['median_house_value'].values

modelbis.fit(X, y)

s2 = joblib.dump(modelbis, '../../src/models/xgboost.joblib')


In [53]:
model.predict(X)

array([170459.69, 246850.42, 265261.56, ..., 177779.94, 178927.89,
       235284.55], dtype=float32)

In [15]:

# Créer la matrice DMatrix pour XGBoost avec toutes les données
dtrain_full = xgb.DMatrix(data=df.drop('median_house_value', axis=1), label=df['median_house_value'])

# Définir les hyperparamètres
params = {
    'objective': 'reg:squarederror',
    'eta': 0.2,
    'max_depth': 9,
    'lambda': 5
}

# Nombre d'itérations (boosting rounds)
num_boost_round = 100

# Entraîner le modèle XGBoost sur toutes les données
xgb_model = xgb.train(params=params, dtrain=dtrain_full, num_boost_round=num_boost_round)



ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:longitude: object, latitude: object, housing_median_age: object, total_rooms: object, total_bedrooms: object, population: object, households: object, median_income: object

In [17]:
import pandas as pd
import xgboost as xgb

# Assuming df is your DataFrame

# Convert object columns to numeric
df_numeric = df.apply(pd.to_numeric, errors='coerce')

# Handle missing values if needed
df_numeric.fillna(0, inplace=True)

# Create DMatrix
dtrain_full = xgb.DMatrix(data=df_numeric.drop('median_house_value', axis=1), label=df_numeric['median_house_value'])

# Define hyperparameters
params = {
    'objective': 'reg:squarederror',
    'eta': 0.2,
    'max_depth': 9,
    'lambda': 5
}

# Number of boosting rounds
num_boost_round = 100

# Train the XGBoost model
xgb_model2 = xgb.train(params=params, dtrain=dtrain_full, num_boost_round=num_boost_round)


In [19]:
import joblib

In [20]:
s = joblib.dump(xgb_model2, '../../src/models/xgboost2.joblib')

In [None]:
np.array([[1, 2]])

In [None]:
import os
# Specify the desired working directory path
new_working_directory = '/Users/coraline/Desktop/amse-myapp/amse-myapp-main/src'

# Change the working directory
os.chdir(new_working_directory)

In [None]:
# Entraîner le modèle
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# Vous devez définir vos propres hyperparamètres
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.1,
    'max_depth': 3,
    'alpha': 0.1,
    'lambda': 1
}

model = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=100)

# Sauvegarder le modèle
model.save_model('../src/models/xgboost_mo.model')


In [None]:
from dash import html, dcc, Input, Output, State, callback
import dash

PreventUpdate = dash.no_update
