In [54]:
#implement forward stepwise selection algorithm for a given 
#california housing dataset
#AIC akaike information criteria

In [55]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Generate some sample data
np.random.seed(42)
x = np.random.rand(100)
y = 2 * x + 1 + np.random.randn(100)

# Create a DataFrame
data = pd.DataFrame({'x': x, 'y': y})

# Fit a linear regression model
X = sm.add_constant(data['x'])
model = sm.OLS(data['y'], X).fit()

# Calculate AIC
aic = model.aic

print(f'Akaike Information Criterion (AIC): {aic}')


Akaike Information Criterion (AIC): 266.2930532523742


In [56]:
import pandas as pd
df = pd.read_csv("housing.csv")
df.head
print(df.columns)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')


In [57]:
df = df.drop(["ocean_proximity"], axis = 1)

In [58]:
has_null = df.isnull().any().any()
print(has_null)

True


In [59]:
for column in df.columns:
    if df[column].isnull().any():
        mean_value = df[column].mean()
        df[column].fillna(mean_value, inplace=True)
has_null = df.isnull().any().any()
print(has_null)

False


In [60]:
from sklearn.model_selection import train_test_split
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [61]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [62]:
import math
import numpy as np
rss = np.sum((y_test - y_pred)**2)
num_params = len(model.coef_) + 1
n = len(y)
aic = n * math.log(rss/n) + 2 * num_params
aic

428392.49115555326

In [63]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
mse

5159038037.897361

In [64]:
9909813751.230467
5159038037.897361

9909813751.230467