In [24]:
# Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report

from sklearn.metrics import roc_auc_score, plot_roc_curve, roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve,plot_precision_recall_curve

from sklearn.model_selection import learning_curve

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
#import warnings
#warnings.simplefilter(action="ignore")

# Baseline model

In [28]:
# Load immo.csv; this frame supplies X and y for the baseline model below.
immo = pd.read_csv("immo.csv")

In [3]:
# Separate the features (X) from the target column (y).
# NOTE(review): the regression runs later in this notebook predict
# "median_house_value", so a baseline built on "ocean_proximity_number"
# is not directly comparable with them - confirm this is intended.
X = immo.drop(columns=["ocean_proximity_number"])
y = immo["ocean_proximity_number"]

In [4]:
# Baseline: a classifier that always predicts the most frequent class.
# fit() returns the estimator itself, so creation and fitting are chained.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Accuracy of the baseline, measured on the same data it was fitted on.
dummy_clf.score(X, y)

0.43564292201894383

---The accuracy of my baseline is about 0.44 (0.4356).---

In [5]:
# Re-create and re-fit the most-frequent-class baseline.
# NOTE(review): X and y are not reassigned between the previous cell and this
# one, so this repeats the same computation and yields the same score.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Accuracy of the baseline on the data it was fitted on.
dummy_clf.score(X, y)

0.43564292201894383

---We can see a small improvement with the datasets immo_with_outliers_and_imput and immo_with_outliers.---

---For now, I'll keep the outliers.---

# First iteration

In [6]:
# Load immo_with_imput.csv, the dataset evaluated in the first iteration.
immo_with_imput = pd.read_csv("immo_with_imput.csv")

In [4]:
def Lineareg(data):
    """Fit a linear regression on ``data`` and print hold-out and CV scores.

    The column ``median_house_value`` is the target; every other column of
    ``data`` is used as a feature.

    Steps:
      1. Fit on a 70/30 train/test split (``random_state=1``) and print the
         score of the fitted model on the test part.
      2. Cross-validate with every fold count k in 2..19 and keep the k whose
         mean test score is highest.
      3. Cross-validate once more with that k and print the mean r2 and the
         mean ``neg_root_mean_squared_error``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``median_house_value`` column.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    * The printed ``rmse`` is the mean of the ``neg_root_mean_squared_error``
      scores, i.e. the RMSE carrying a negative sign.
    * The fold count is chosen by maximising the same kind of score that is
      then reported, so the printed r2 is an optimistic estimate.
    """
    y = data["median_house_value"]
    X = data.drop("median_house_value", axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)
    model = LinearRegression()
    model.fit(X_train, y_train)
    print(f'model score: {model.score(X_test, y_test)}')

    # Mean cross-validated score for each candidate number of folds.
    fold_counts = list(range(2, 20))
    mean_scores = [
        cross_validate(model, X, y, cv=k)['test_score'].mean()
        for k in fold_counts
    ]
    # Select the best fold count by position instead of filtering a DataFrame
    # on float equality: ties resolve to the smallest k (no ValueError from
    # .item()) and NaN scores are ignored.
    cross = fold_counts[int(np.nanargmax(mean_scores))]

    cv_results = cross_validate(
        model, X, y, cv=cross,
        scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
    )
    r2 = cv_results['test_r2'].mean()
    rmse = cv_results['test_neg_root_mean_squared_error'].mean()
    print(f'r2: {r2}')
    print(f'rmse: {rmse}')

In [8]:
# First iteration: print the hold-out and cross-validated scores for immo_with_imput.
Lineareg(immo_with_imput)

model score: 0.6058961194280346
r2: 0.6016233887270823
rmse: -56747.678579654465


# Second iteration

In [9]:
# Load the two datasets compared in the second iteration.
immo_with_outliers = pd.read_csv("immo_with_outliers.csv")
immo_with_outliers_and_imput = pd.read_csv("immo_with_outliers_and_imput.csv")

In [10]:
# Second iteration: scores for the dataset read from immo_with_outliers.csv.
Lineareg(immo_with_outliers)

model score: 0.6324224669228723
r2: 0.6353530950356457
rmse: -69539.75736337178


In [11]:
# Second iteration: scores for the dataset read from immo_with_outliers_and_imput.csv.
# NOTE(review): the recorded output is identical, digit for digit, to the run on
# immo_with_outliers - verify that the two CSV files actually differ.
Lineareg(immo_with_outliers_and_imput)

model score: 0.6324224669228723
r2: 0.6353530950356457
rmse: -69539.75736337178


# Fourth iteration

In [15]:
# Load one_hot_iteration.csv, the dataset evaluated in the fourth iteration.
one_hot_iteration = pd.read_csv("one_hot_iteration.csv")

In [17]:
# Fourth iteration: print the hold-out and cross-validated scores for one_hot_iteration.
Lineareg(one_hot_iteration)

model score: 0.6449267551867998
r2: 0.6465210485347069
rmse: -68484.3730358486


# Fifth iteration 

In [18]:
# Load by_mean_imput.csv, the dataset evaluated in the fifth iteration.
by_mean_imput = pd.read_csv("by_mean_imput.csv")

In [19]:
# Fifth iteration: print the hold-out and cross-validated scores for by_mean_imput.
Lineareg(by_mean_imput)

model score: 0.6449807236042429
r2: 0.6466023555655033
rmse: -68476.61129675338


# Sixth iteration

In [8]:
# Load data_scale.csv, the dataset evaluated in the sixth iteration.
data_scale = pd.read_csv("data_scale.csv")

In [9]:
# Sixth iteration: print the hold-out and cross-validated scores for data_scale.
Lineareg(data_scale)

model score: 0.6449807236042486
r2: 0.6466023555655057
rmse: -68476.61129675315


# Seventh iteration

In [10]:
# Load data_0.csv, the dataset evaluated in the seventh iteration.
data_0 = pd.read_csv("data_0.csv")

In [11]:
# Seventh iteration: print the hold-out and cross-validated scores for data_0.
Lineareg(data_0)

model score: 0.6442060028166736
r2: 0.6457534140213687
rmse: -68533.39384940882


---I can see that my model gets worse with the imputation by 0. I'm running out of time, so I'm going to choose my 6th iteration.---

# Pipeline

In [25]:
def Iteration_scaler(data):
    """Impute, one-hot encode and standardise a raw housing frame.

    Processing steps:
      1. Missing ``total_bedrooms`` values are replaced by the column mean.
      2. ``ocean_proximity`` is one-hot encoded and the original column dropped.
      3. Every remaining column except the target, ``Unnamed: 0``,
         ``longitude`` and ``latitude`` is standardised with ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``total_bedrooms``, ``ocean_proximity`` and
        ``median_house_value``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns in this order: the pass-through columns (``Unnamed: 0``,
        ``longitude``, ``latitude`` - those that exist in ``data``), the
        one-hot columns, ``median_house_value``, then the standardised columns
        under their original names. The index of ``data`` is preserved.

    Notes
    -----
    ``Unnamed: 0`` is passed through unchanged, so it stays in the frame as an
    ordinary column. NOTE(review): presumably this is a row index saved into
    the CSV - confirm it should reach the model as a feature.
    """
    target_col = 'median_house_value'
    category_col = 'ocean_proximity'
    # Columns copied through without scaling (only those actually present).
    unscaled_cols = [c for c in ('Unnamed: 0', 'longitude', 'latitude') if c in data.columns]

    # imputation: fill only total_bedrooms; a frame-wide fillna would write the
    # bedrooms mean into missing values of every other column as well.
    bedrooms_mean = data['total_bedrooms'].mean()
    imputed = data.assign(total_bedrooms=data['total_bedrooms'].fillna(bedrooms_mean))

    # encoding: densify the default (sparse) encoder output rather than passing
    # the version-dependent ``sparse`` / ``sparse_output`` keyword.
    enc = OneHotEncoder()
    encoded = enc.fit_transform(imputed[[category_col]]).toarray()
    # Reuse the input index so the concat below aligns rows by label even when
    # ``data`` does not carry a default RangeIndex.
    encoded_df = pd.DataFrame(encoded, columns=enc.get_feature_names_out(), index=imputed.index)

    # scaler: standardise every column that is not target, category or pass-through.
    skipped = set(unscaled_cols) | {target_col, category_col}
    scaled_cols = [c for c in imputed.columns if c not in skipped]
    scaled = StandardScaler().fit_transform(imputed[scaled_cols])
    # Keep the real column names instead of renaming by hard-coded position.
    scaled_df = pd.DataFrame(scaled, columns=scaled_cols, index=imputed.index)

    return pd.concat(
        [imputed[unscaled_cols], encoded_df, imputed[[target_col]], scaled_df],
        axis=1,
    )

In [26]:
def Pipeline(data):
    """Preprocess ``data`` with ``Iteration_scaler`` and evaluate it with ``Lineareg``.

    Returns whatever ``Lineareg`` returns for the preprocessed frame.
    """
    prepared = Iteration_scaler(data)
    return Lineareg(prepared)