### Concept:

Logarithmic regression is a type of regression model that uses a logarithmic transformation of the independent variable(s) to model a non-linear relationship. It is useful when the rate of change of the dependent variable decreases as the independent variable increases, following a logarithmic pattern.

The model equation for logarithmic regression with one or more independent variables can be expressed as:

$$
y = \beta_0 + \beta_1 \log(x_1) + \beta_2 \log(x_2) + \dots + \beta_n \log(x_n) + \epsilon
$$

Where:
$$
\begin{array}{ll}
y & \text{is the dependent variable.} \\
x_1, x_2, \dots, x_n & \text{are the independent variables (each strictly positive, so that the logarithm is defined).} \\
\beta_0 & \text{is the intercept.} \\
\beta_1, \beta_2, \dots, \beta_n & \text{are the coefficients corresponding to each independent variable.} \\
\epsilon & \text{is the error term.}
\end{array}
$$


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the California Housing dataset
california = fetch_california_housing()

# Offset added to Longitude so that every value is strictly positive before the log transform.
LONGITUDE_SHIFT = 150

# Create a DataFrame with feature names
df = pd.DataFrame(california.data, columns=california.feature_names)
# Raw longitudes are negative, where np.log is undefined, so shift them into the positive range.
# Subtract LONGITUDE_SHIFT again to recover the original values.
df["Longitude"] = df["Longitude"] + LONGITUDE_SHIFT

X = df[california.feature_names].values
y = california.target

# np.log silently yields NaN / -inf for non-positive inputs; fail fast with a clear message instead.
if (X <= 0).any():
    raise ValueError("All features must be strictly positive before applying the log transform.")

# Apply logarithmic transformation to the independent variables
X_log = np.log(X)

# Train a Random Forest model. The seed is fixed so the importance ranking is
# reproducible across kernel restarts (it matches the seed used for train_test_split).
model = RandomForestRegressor(random_state=42)
model.fit(X_log, y)

# Get feature importances, most important feature first
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': california.feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

      Feature  Importance
0      MedInc    0.519042
5    AveOccup    0.136687
6    Latitude    0.094198
7   Longitude    0.092967
1    HouseAge    0.053350
2    AveRooms    0.043326
4  Population    0.031051
3   AveBedrms    0.029379


In [11]:
# Summary statistics per feature; Longitude here already includes the +150 shift applied when df was built.
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,30.430296
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,25.65
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,28.2
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,31.51
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,31.99
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,35.69


In [18]:
# Inspect the feature frame; Longitude here already includes the +150 shift applied when df was built.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,27.77
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,27.78
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,27.76
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,27.75
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,27.75
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,28.91
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,28.79
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,28.78
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,28.68


In [17]:
# Features ordered from most to least important, following the Random Forest
# importance ranking printed earlier in the notebook.
features_order = ['MedInc', 'AveOccup', 'Latitude', 'Longitude', 'HouseAge', 'AveRooms', 'Population', 'AveBedrms']

# NOTE(review): the scaler is instantiated but never applied to the features in this cell.
# It is kept so the name stays defined for any later cell — confirm whether scaling was intended.
scaler = StandardScaler()

# Create a Linear Regression model (re-fitted from scratch for every feature subset below)
model = LinearRegression()

# np.log silently yields NaN / -inf for non-positive inputs; fail fast with a clear message instead.
non_positive_features = [name for name in features_order if (df[name] <= 0).any()]
if non_positive_features:
    raise ValueError(f"Cannot log-transform non-positive values in: {non_positive_features}")

# Initialize an empty list to collect one result record per feature subset
results = []

for i in range(1, len(features_order) + 1):
    # Select top i features and log-transform them
    selected_features = features_order[:i]
    X_selected = df[selected_features].values
    X_log_selected = np.log(X_selected)

    # Split the data into training and testing sets; the fixed seed gives every
    # subset the same train/test rows, so the scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(X_log_selected, y, test_size=0.3, random_state=42)

    # Train the model on the training set with selected features
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Collect the results
    results.append({'num_features': i, 'features': selected_features, 'R2': r2, 'MSE': mse})

# Convert the results to a DataFrame, best R2 first (assignment instead of inplace mutation)
df_results = pd.DataFrame(results).sort_values(by='R2', ascending=False, ignore_index=True)
df_results

Unnamed: 0,num_features,features,R2,MSE
0,8,"[MedInc, AveOccup, Latitude, Longitude, HouseA...",0.601976,0.522423
1,7,"[MedInc, AveOccup, Latitude, Longitude, HouseA...",0.596749,0.529283
2,6,"[MedInc, AveOccup, Latitude, Longitude, HouseA...",0.596722,0.529319
3,5,"[MedInc, AveOccup, Latitude, Longitude, HouseAge]",0.596537,0.529562
4,4,"[MedInc, AveOccup, Latitude, Longitude]",0.586681,0.542498
5,3,"[MedInc, AveOccup, Latitude]",0.494681,0.663252
6,2,"[MedInc, AveOccup]",0.479374,0.683342
7,1,[MedInc],0.427508,0.751419
