In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Load the housing data into `df`; 'housing.csv' is a relative path, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('housing.csv')

In [11]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [12]:
# Keep only the rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
kept_proximity = ['<1H OCEAN', 'INLAND']
df = df[df['ocean_proximity'].isin(kept_proximity)]

In [13]:
# Sanity check: list the distinct ocean_proximity values left after filtering.
df['ocean_proximity'].unique()

array(['<1H OCEAN', 'INLAND'], dtype=object)

In [14]:
# Display the column labels of df.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [15]:
# Q1: count the missing values in every column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [16]:
# Q2: summary statistics for population (the 50% row is the median).
df['population'].describe()

count    15687.000000
mean      1466.317205
std       1180.389908
min          3.000000
25%        802.000000
50%       1195.000000
75%       1777.000000
max      35682.000000
Name: population, dtype: float64

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [27]:
# Display the dtype of each column of df.
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Columns used for modelling: every column except 'ocean_proximity'.
columns_to_keep = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']

# Narrow df to those columns. NOTE: this rebinds df, and the later question
# cells start from this narrowed frame.
df = df[columns_to_keep]

# Shuffle the dataset with seed 42
seed = 42
df_shuffled = df.sample(frac=1, random_state=seed)

# Split the data into train/val/test sets (60%/20%/20%).
# train_size is kept for documentation only; the train share is whatever is
# left after val_size + test_size is held out.
train_size = 0.6
val_size = 0.2
test_size = 0.2

train, test = train_test_split(df_shuffled, test_size=(val_size + test_size), random_state=seed)
val, test = train_test_split(test, test_size=test_size/(val_size + test_size), random_state=seed)

# The split results are derived from df_shuffled. Take explicit copies so the
# column assignment below operates on independent frames and does not trigger
# pandas' SettingWithCopyWarning.
train, val, test = train.copy(), val.copy(), test.copy()

# Apply log1p to the target in every split; the model is fit and scored on
# log1p(median_house_value).
target_column = 'median_house_value'
for split in (train, val, test):
    split[target_column] = np.log1p(split[target_column])

def train_and_evaluate(X_train, y_train, X_val, y_val):
    """Fit an ordinary linear regression and return its validation RMSE.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_val, y_val : validation features and target used for scoring.

    Returns
    -------
    The square root of the mean squared error between ``y_val`` and the
    model's predictions for ``X_val``.
    """
    fitted = LinearRegression()
    fitted.fit(X_train, y_train)
    val_mse = mean_squared_error(y_val, fitted.predict(X_val))
    return np.sqrt(val_mse)

# Case 1: Filling missing values with 0.
# DataFrame.fillna with a {column: value} mapping returns a new frame. This
# replaces the chained `frame['col'].fillna(..., inplace=True)` pattern, which
# is deprecated and silently does nothing under pandas Copy-on-Write.
train_0 = train.fillna({'total_bedrooms': 0})
val_0 = val.fillna({'total_bedrooms': 0})

X_train_0 = train_0.drop(target_column, axis=1)
y_train_0 = train_0[target_column]
X_val_0 = val_0.drop(target_column, axis=1)
y_val_0 = val_0[target_column]

rmse_0 = train_and_evaluate(X_train_0, y_train_0, X_val_0, y_val_0)

# Case 2: Filling missing values with the mean. The mean is computed on the
# training split only and reused for validation, so no validation statistics
# leak into the imputation.
mean_value = train['total_bedrooms'].mean()
train_mean = train.fillna({'total_bedrooms': mean_value})
val_mean = val.fillna({'total_bedrooms': mean_value})

X_train_mean = train_mean.drop(target_column, axis=1)
y_train_mean = train_mean[target_column]
X_val_mean = val_mean.drop(target_column, axis=1)
y_val_mean = val_mean[target_column]

rmse_mean = train_and_evaluate(X_train_mean, y_train_mean, X_val_mean, y_val_mean)

print("RMSE for filling with 0:", round(rmse_0, 2))
print("RMSE for filling with the mean:", round(rmse_mean, 2))

# The two printed scores are rounded to 2 decimals and may coincide; inspect
# the unrounded rmse_0 / rmse_mean if a tie needs to be broken.


RMSE for filling with 0: 0.35
RMSE for filling with the mean: 0.35


In [29]:
#q4

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# df must already be restricted to the numeric modelling columns here (an
# earlier cell narrows it); 'median_house_value' is the target column.

# Fill NAs with 0
df_filled = df.fillna(0)

# Shuffle the dataset with seed 42
seed = 42
df_shuffled = df_filled.sample(frac=1, random_state=seed)

# Split the data into train/val/test sets (60%/20%/20%).
# train_size is kept for documentation only; the train share is whatever is
# left after val_size + test_size is held out.
train_size = 0.6
val_size = 0.2
test_size = 0.2

train, test = train_test_split(df_shuffled, test_size=(val_size + test_size), random_state=seed)
val, test = train_test_split(test, test_size=test_size/(val_size + test_size), random_state=seed)

# The split results are derived from df_shuffled. Take explicit copies so the
# column assignment below operates on independent frames and does not trigger
# pandas' SettingWithCopyWarning.
train, val, test = train.copy(), val.copy(), test.copy()

# Apply log1p to the target in every split.
target_column = 'median_house_value'
for split in (train, val, test):
    split[target_column] = np.log1p(split[target_column])

def train_and_evaluate_ridge(X_train, y_train, X_val, y_val, alpha):
    """Fit a linear model with regularization strength ``alpha`` and return its validation RMSE.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_val, y_val : validation features and target used for scoring.
    alpha : regularization strength handed to ``Ridge``. ``alpha == 0`` means
        no regularization and is fitted with ``LinearRegression`` instead.

    Returns
    -------
    The square root of the mean squared error between ``y_val`` and the
    model's predictions for ``X_val``.
    """
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons and
    # recommends LinearRegression for the unregularized case.
    # LinearRegression is imported in an earlier cell of this notebook.
    model = LinearRegression() if alpha == 0 else Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, predictions))

# Separate the feature matrix from the (already log-transformed) target for
# the training and validation splits.
X_train_ridge = train.drop(columns=target_column)
y_train_ridge = train[target_column]
X_val_ridge = val.drop(columns=target_column)
y_val_ridge = val[target_column]

# Regularization strengths to compare.
alpha_values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# Score every candidate, then report the validation RMSE in list order.
ridge_scores = {
    candidate: train_and_evaluate_ridge(X_train_ridge, y_train_ridge, X_val_ridge, y_val_ridge, candidate)
    for candidate in alpha_values
}
for alpha, rmse_ridge in ridge_scores.items():
    print(f"RMSE for alpha={alpha}: {round(rmse_ridge, 2)}")


RMSE for alpha=0: 0.35
RMSE for alpha=1e-06: 0.35
RMSE for alpha=0.0001: 0.35
RMSE for alpha=0.001: 0.35
RMSE for alpha=0.01: 0.35
RMSE for alpha=0.1: 0.35
RMSE for alpha=1: 0.35
RMSE for alpha=5: 0.35
RMSE for alpha=10: 0.35


In [31]:
#q5

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# df must already be restricted to the numeric modelling columns here (an
# earlier cell narrows it); 'median_house_value' is the target column.

# Fill NAs with 0
df_filled = df.fillna(0)

# List of different seed values to try
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Split proportions (60%/20%/20%) and the target name do not depend on the
# seed, so they are set once instead of on every iteration.
# train_size is kept for documentation only; the train share is whatever is
# left after val_size + test_size is held out.
train_size = 0.6
val_size = 0.2
test_size = 0.2
target_column = 'median_house_value'

# Validation RMSE obtained for each seed, in seed_values order
rmse_scores = []

for seed in seed_values:
    # Shuffle and split with the current seed
    df_shuffled = df_filled.sample(frac=1, random_state=seed)

    train, test = train_test_split(df_shuffled, test_size=(val_size + test_size), random_state=seed)
    val, test = train_test_split(test, test_size=test_size/(val_size + test_size), random_state=seed)

    # Build features and log1p targets without writing back into the split
    # frames; this avoids pandas' SettingWithCopyWarning. The test split is
    # not used in this experiment, so it is left untouched.
    X_train = train.drop(target_column, axis=1)
    y_train = np.log1p(train[target_column])
    X_val = val.drop(target_column, axis=1)
    y_val = np.log1p(val[target_column])

    # Train a model without regularization
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score on the validation set and record the RMSE for this seed
    predictions = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    rmse_scores.append(rmse)

# Spread of the validation RMSE across seeds (population standard deviation,
# np.std's default), rounded to 3 decimal digits
std_dev = np.std(rmse_scores)
std_dev_rounded = round(std_dev, 3)

print(f"Standard Deviation of RMSE scores: {std_dev_rounded}")

Standard Deviation of RMSE scores: 0.006


In [32]:
#q6

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# df must already be restricted to the numeric modelling columns here (an
# earlier cell narrows it); 'median_house_value' is the target column.

# Fill NAs with 0
df_filled = df.fillna(0)

# Seed for reproducibility
seed = 9

# Shuffle the dataset with seed 9
df_shuffled = df_filled.sample(frac=1, random_state=seed)

# Split the data into train/val/test sets (60%/20%/20%).
# train_size is kept for documentation only; the train share is whatever is
# left after val_size + test_size is held out.
train_size = 0.6
val_size = 0.2
test_size = 0.2

train, test = train_test_split(df_shuffled, test_size=(val_size + test_size), random_state=seed)
val, test = train_test_split(test, test_size=test_size/(val_size + test_size), random_state=seed)

# The final model is trained on train + validation together.
# pd.concat builds a new frame, so assigning into it below is safe.
train_combined = pd.concat([train, val])

# test is derived from df_shuffled; copy it so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
test = test.copy()

# Apply log transformation to the 'median_house_value' variable
target_column = 'median_house_value'
train_combined[target_column] = np.log1p(train_combined[target_column])
test[target_column] = np.log1p(test[target_column])

# Separate features and target for the combined training set and the test set
X_train_combined = train_combined.drop(target_column, axis=1)
y_train_combined = train_combined[target_column]
X_test = test.drop(target_column, axis=1)
y_test = test[target_column]

# Train Ridge regression model with alpha=0.001
ridge_alpha = 0.001
model = Ridge(alpha=ridge_alpha)
model.fit(X_train_combined, y_train_combined)

# Make predictions on the test set
predictions_test = model.predict(X_test)

# Calculate RMSE on the test dataset (in log1p units of the target)
rmse_test = np.sqrt(mean_squared_error(y_test, predictions_test))

print(f"RMSE on the test dataset: {round(rmse_test, 2)}")

RMSE on the test dataset: 0.33
