In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
df = pd.read_csv('housing.csv')

# Display the first few rows of the dataframe
print(df.head())

# Handle missing values
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)

# Identify categorical columns (assuming 'ocean_proximity' is a categorical column in the dataset)
categorical_columns = ['ocean_proximity']
numerical_columns = df.columns.drop(categorical_columns + ['median_house_value'])

# Create a column transformer to apply transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Apply transformations to the dataframe
df_preprocessed = preprocessor.fit_transform(df)

# Convert to DataFrame for better handling
df_scaled = pd.DataFrame(df_preprocessed, columns=numerical_columns.tolist() + preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns).tolist())

# Separate features and target
X = df_scaled
y = df['median_house_value']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Save the model
joblib.dump(model, 'housing_price_model.pkl')
print("Model saved successfully!")

# Load the model
loaded_model = joblib.load('housing_price_model.pkl')

# Use the loaded model to make predictions
y_pred_loaded = loaded_model.predict(X_test)

# Evaluate the loaded model
mae_loaded = mean_absolute_error(y_test, y_pred_loaded)
mse_loaded = mean_squared_error(y_test, y_pred_loaded)
r2_loaded = r2_score(y_test, y_pred_loaded)

print(f'Mean Absolute Error (Loaded Model): {mae_loaded}')
print(f'Mean Squared Error (Loaded Model): {mse_loaded}')
print(f'R-squared (Loaded Model): {r2_loaded}')


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
Mean Absolute Error: 50670.73824097189
Mean Squared Error: 4908476721.15661

In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('housing.csv')

# Handle missing values
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)

# Identify categorical columns
categorical_columns = ['ocean_proximity']
numerical_columns = df.columns.drop(categorical_columns + ['median_house_value'])

# Create a column transformer to apply transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Apply transformations to the dataframe
df_preprocessed = preprocessor.fit_transform(df)

# Convert to DataFrame for better handling
df_scaled = pd.DataFrame(df_preprocessed, columns=numerical_columns.tolist() + preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns).tolist())

# Separate features and target
X = df_scaled
y = df['median_house_value']

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'housing_price_model.pkl')
print("Model saved successfully!")

# Load the model
loaded_model = joblib.load('housing_price_model.pkl')

# Function to preprocess and predict user input
def preprocess_and_predict(user_input):
    # Convert user input to DataFrame
    user_df = pd.DataFrame([user_input], columns=numerical_columns.tolist() + categorical_columns)
    
    # Handle categorical data (one-hot encoding)
    user_preprocessed = preprocessor.transform(user_df)
    
    # Make predictions
    prediction = loaded_model.predict(user_preprocessed)
    
    return prediction

# Take user input
user_input = {}
for column in numerical_columns:
    user_input[column] = float(input(f"Enter {column}: "))

for column in categorical_columns:
    user_input[column] = input(f"Enter {column} (e.g., NEAR BAY, INLAND, etc.): ")

# Predict based on user input
prediction = preprocess_and_predict(user_input)
print(f"Predicted Median House Value: ${prediction[0]:.2f}")


Model saved successfully!


Enter longitude:  -122.23
Enter latitude:  37.88
Enter housing_median_age:  41
Enter total_rooms:  880
Enter total_bedrooms:  129
Enter population:  322
Enter households:  126
Enter median_income:  8.3252
Enter ocean_proximity (e.g., NEAR BAY, INLAND, etc.):  NEAR BAY


Predicted Median House Value: $410584.36




In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the dataset
df = pd.read_csv('housing.csv')

# Handle missing values
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)

# Identify categorical columns
categorical_columns = ['ocean_proximity']
numerical_columns = df.columns.drop(categorical_columns + ['median_house_value'])

# Create a column transformer to apply transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Apply transformations to the dataframe
df_preprocessed = preprocessor.fit_transform(df)

# Convert to DataFrame for better handling
feature_names = numerical_columns.tolist() + preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns).tolist()
df_scaled = pd.DataFrame(df_preprocessed, columns=feature_names)

# Separate features and target
X = df_scaled
y = df['median_house_value']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'housing_price_model.pkl')
print("Model saved successfully!")

# Load the model
loaded_model = joblib.load('housing_price_model.pkl')

# Function to preprocess and predict user input
def preprocess_and_predict(user_input):
    # Convert user input to DataFrame
    user_df = pd.DataFrame([user_input], columns=numerical_columns.tolist() + categorical_columns)
    
    # Handle categorical data (one-hot encoding)
    user_preprocessed = preprocessor.transform(user_df)
    
    # Convert to DataFrame with correct feature names
    user_preprocessed_df = pd.DataFrame(user_preprocessed, columns=feature_names)
    
    # Make predictions
    prediction = loaded_model.predict(user_preprocessed_df)
    
    return prediction

# Take user input
user_input = {}
for column in numerical_columns:
    user_input[column] = float(input(f"Enter {column}: "))

for column in categorical_columns:
    user_input[column] = input(f"Enter {column} (e.g., NEAR BAY, INLAND, etc.): ")

# Predict based on user input
prediction = preprocess_and_predict(user_input)
print(f"Predicted Median House Value: ${prediction[0]:.2f}")


Model saved successfully!


Enter longitude:  -122.23
