In [37]:
# Import necessary libraries for preprocessing
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge, ElasticNet

# Load the dataset
#Atlanta_Distance = pd.read_pickle(r'~/Documents/NYCDSA/Capstone/Pickle Files/Atlanta_Nearest_POI.pkl')
Atlanta_Distance = pd.read_csv('~/Documents/NYCDSA/Capstone/Nearest_Distances_Atlanta/Atlanta_Nearest_Distances.csv')
Atlanta_Distance['Total Bathrooms'] = Atlanta_Distance['number_of_full_baths'] + (Atlanta_Distance['number_of_half_baths']*0.5)
Atlanta_Distance = Atlanta_Distance.drop(columns=['latitude',
                                                  'longitude', 'lot_size_in_acres', 'number_of_full_baths', 'number_of_half_baths',
                                                  	'street_address', 'has_pool', 'has_garage',
                                                 'Supercenter_POI', 'Convenience_POI', 'Supermarket_POI', 'Wholesale_POI', 'Variety_Store_POI', 'story_indicator'])
Atlanta_Distance = Atlanta_Distance.rename(columns={'Distance_SM': 'Supermarket', 'Distance_C': 'Convenience',
'Distance_WS': 'Wholesale', 'Distance_VS': 'Variety Store', 'Distance_SC': 'Supercenter', 'number_of_bedrooms': 'Total Bedrooms', 'living_area_square_feet': 'Living Area(SQF)',
                                                   'lot_size_in_square_feet': 'Lot Size(SQF)', 'number_of_garage_spaces': 'Garage Spaces', 'number_of_stories': 'Stories', 'year_built': 'Year Built'})
Atlanta_Distance['Year Built'] = Atlanta_Distance['Year Built'].astype('Int64')
Atlanta_Distance

Unnamed: 0,current_listing_price,listing_date,Year Built,Total Bedrooms,Living Area(SQF),Lot Size(SQF),Garage Spaces,Stories,Wholesale,Variety Store,Supermarket,Supercenter,Convenience,Total Bathrooms
0,2865,3/12/2024,2002,4,3259.0,14810.400390,,2.0,10917.45,6405.99,1654.08,2539.28,2715.74,2.5
1,2200,3/2/2024,,3,1993.0,,,,32739.68,405.11,3065.55,492.63,1407.26,2.0
2,2000,2/15/2024,,3,1910.0,,,,29267.11,4738.04,3780.13,25413.80,3213.19,2.0
3,1995,3/17/2024,,3,1134.0,,,,9020.53,4356.55,2320.59,7857.71,2385.60,1.0
4,2800,3/3/2024,,4,2532.0,,,,14750.79,6721.22,5625.35,11676.91,14263.22,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21010,1600,3/9/2023,,3,1367.0,,,,47285.12,6409.21,2097.35,19852.16,4983.93,2.0
21011,2199,3/4/2024,,3,1696.0,871.199981,,,2009.36,6142.54,888.22,4864.86,1477.04,2.5
21012,2380,3/16/2024,2015,3,1944.0,4356.000065,,,1507.23,714.63,1158.08,1359.41,3952.51,3.5
21013,1900,3/4/2024,,4,1204.0,,,,40632.51,1402.52,8779.81,10787.35,1242.13,2.0


In [38]:
# Exclude 'current_listing_price' from features
numeric_features = Atlanta_Distance.select_dtypes(include=['int64', 'float64']).drop(columns=['current_listing_price']).columns
#categorical_features = list(Atlanta_Distance.columns[Atlanta_Distance.dtypes == 'object'])

In [39]:
# Helper function to fill 'None' for missing categorical data
def fill_none(X):
    return X.fillna("None")

# Pipeline for numeric features: Impute missing values using mean
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Pipeline for nominal categorical features: Fill missing values with 'None' then apply one-hot encoding
#categorical_transformer = Pipeline(steps=[
    #('impute_none', SimpleImputer(strategy='constant', fill_value='None')),  # Using SimpleImputer
    #('onehot', OneHotEncoder(handle_unknown='ignore'))
#])

# Combined preprocessor for numeric and nominal
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        #('nominal', categorical_transformer, categorical_features)
])

transformed_data = preprocessor.fit_transform(Atlanta_Distance)

In [40]:
# Generate column names for the one-hot encoded features
#onehot_features = preprocessor.named_transformers_['nominal'].named_steps['onehot'].get_feature_names_out()
# Combine all feature names
all_feature_names = list(numeric_features)
all_feature_names

['Year Built',
 'Total Bedrooms',
 'Living Area(SQF)',
 'Lot Size(SQF)',
 'Garage Spaces',
 'Stories',
 'Wholesale',
 'Variety Store',
 'Supermarket',
 'Supercenter',
 'Convenience',
 'Total Bathrooms']

In [41]:
transformed_df = pd.DataFrame(transformed_data, columns=all_feature_names)
#transformed_df

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
model = LinearRegression()

X = Atlanta_Distance.drop(columns='current_listing_price')
y = Atlanta_Distance['current_listing_price']
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

mlr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model),
])
# Initialize the linear regression model
# Fit the model on the training data
mlr_pipeline.fit(X_train, y_train)

In [43]:
val_score = mlr_pipeline.score(X_val, y_val) 
print('Validation R^2:', val_score)

test_score = mlr_pipeline.score(X_test, y_test)
print('Test R^2:', test_score)

Validation R^2: 0.3290665928415044
Test R^2: 0.25738146438896525


In [44]:
from sklearn.metrics import mean_squared_error 
y_val_pred = mlr_pipeline.predict(X_val)

val_mse = mean_squared_error(y_val, y_val_pred)
print('Validation Mean Squared Error:', val_mse)
val_rmse = np.sqrt(val_mse)
print("Validation Root Mean Squared Error:", val_rmse)

y_test_pred = mlr_pipeline.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
print('Test Mean Squared Error:', test_mse)
test_rmse = np.sqrt(test_mse)
print("Test Root Mean Squared Error:", test_rmse)

Validation Mean Squared Error: 559709.70597127
Validation Root Mean Squared Error: 748.1374913552121
Test Mean Squared Error: 634274.5192236298
Test Root Mean Squared Error: 796.4135353091569


In [45]:
def calculate_mape(actual, predicted):
    actual, predicted = np.array(actual), np.array(predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

val_mape = calculate_mape(y_val, y_val_pred)
test_mape = calculate_mape(y_test, y_test_pred)

print("Validation Mean Absolute Percentage Error (MAPE):", val_mape)
print("Test Mean Absolute Percentage Error (MAPE):", test_mape)

Validation Mean Absolute Percentage Error (MAPE): 19.83050660514431
Test Mean Absolute Percentage Error (MAPE): 20.402755204346636


In [46]:
coefficients = mlr_pipeline.named_steps['regressor'].coef_
intercept = mlr_pipeline.named_steps['regressor'].intercept_

# Display the coefficients
print("Coefficients (MLE):", coefficients)
print("Intercept (MLE):", intercept)

Coefficients (MLE): [ -67.65205147   71.11270349  226.72635004   -4.58153361    5.27171034
   41.08192536   37.63590249   31.30777317  -41.94421226 -118.60248965
  -77.24639081  302.2973761 ]
Intercept (MLE): 2270.442382425251


In [50]:
coef_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient (MLE)': coefficients
})
intercept_row = pd.DataFrame({'Feature': ['Intercept'], 'Coefficient (MLE)': [intercept]})
coef_df = pd.concat([coef_df, intercept_row], ignore_index=True)

coef_df['Absolute Coefficient'] = coef_df['Coefficient (MLE)']  # Create a new column for absolute values
coef_df = coef_df.sort_values(by='Absolute Coefficient', ascending=False).drop(columns='Absolute Coefficient')  # Sort and drop the temporary column

# Display the coefficients with feature names sorted
print(coef_df)

             Feature  Coefficient (MLE)
12         Intercept        2270.442382
11   Total Bathrooms         302.297376
2   Living Area(SQF)         226.726350
1     Total Bedrooms          71.112703
5            Stories          41.081925
6          Wholesale          37.635902
7      Variety Store          31.307773
4      Garage Spaces           5.271710
3      Lot Size(SQF)          -4.581534
8        Supermarket         -41.944212
0         Year Built         -67.652051
10       Convenience         -77.246391
9        Supercenter        -118.602490


In [69]:
from sklearn.linear_model import Ridge
ridge_model = Ridge(random_state=42)
param = {
    'regressor__alpha': [100]
}
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge_model),
])
ridge_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_percentage_error', 'neg_root_mean_squared_error'],
    refit='r2', 
    cv=10,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available cores
    verbose=2,
)

# Fit RandomizedSearchCV
ridge_search.fit(X_train, y_train)
ridge_pipeline.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [70]:
best_model = ridge_search.best_estimator_
results = ridge_search.cv_results_
mean_r2_scores = results['mean_test_r2']
mean_mse_scores = -results['mean_test_neg_mean_squared_error']
mean_mape_scores = -results['mean_test_neg_mean_absolute_percentage_error']
mean_RMSE_scores = -results['mean_test_neg_root_mean_squared_error']

print("Mean MSE scores:", mean_mse_scores)
print("Mean MAPE scores:", mean_mape_scores)
print("Mean R^2 scores:", mean_r2_scores)
print("Mean RMSE scores:", mean_RMSE_scores)

Mean MSE scores: [547716.68125222]
Mean MAPE scores: [0.19798088]
Mean R^2 scores: [0.33157401]
Mean RMSE scores: [739.37765513]


In [71]:
print("Best parameters:", ridge_search.best_params_)
print("Best R^2:", ridge_search.best_score_)

Best parameters: {'regressor__alpha': 100}
Best R^2: 0.3315740091363374


In [72]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score, mean_squared_error
def calculate_scores(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    return r2, mse, mape, rmse

In [73]:
y_val_pred_tuned = best_model.predict(X_val)
val_scores = calculate_scores(y_val, y_val_pred_tuned)

y_test_pred_tuned = best_model.predict(X_test)
test_scores = calculate_scores(y_test, y_test_pred_tuned)

print("Tuned Validation Set Scores:")
print(f"R²: {val_scores[0]:.4f}")
print(f"MSE: {val_scores[1]:.4f}")
print(f"MAPE: {val_scores[2]:.4f}")
print(f"RMSE: {val_scores[3]:.4f}")

print("\nTuned Test Set Scores:")
print(f"R²: {test_scores[0]:.4f}")
print(f"MSE: {test_scores[1]:.4f}")
print(f"MAPE: {test_scores[2]:.4f}")
print(f"RMSE: {test_scores[3]:.4f}")

Tuned Validation Set Scores:
R²: 0.3292
MSE: 559639.9084
MAPE: 0.1981
RMSE: 748.0908

Tuned Test Set Scores:
R²: 0.2582
MSE: 633533.7671
MAPE: 0.2038
RMSE: 795.9483


In [74]:
coefficients = ridge_pipeline.named_steps['regressor'].coef_
intercept = ridge_pipeline.named_steps['regressor'].intercept_

In [76]:
coef_df_ridge = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient (MLE)': coefficients
})

#Add intercept row
intercept_row = pd.DataFrame({'Feature': ['Intercept'], 'Coefficient (MLE)': [intercept]})
coef_df_ridge = pd.concat([coef_df_ridge, intercept_row], ignore_index=True)

# Sort the coefficients by absolute value in descending order
coef_df_ridge['Absolute Coefficient'] = coef_df_ridge['Coefficient (MLE)']  # Create a new column for absolute values
coef_df_ridge = coef_df_ridge.sort_values(by='Absolute Coefficient', ascending=False).drop(columns='Absolute Coefficient')  # Sort and drop the temporary column

# Display the coefficients with feature names sorted
print(coef_df_ridge)

             Feature  Coefficient (MLE)
12         Intercept        2270.442382
11   Total Bathrooms         299.046354
2   Living Area(SQF)         225.205321
1     Total Bedrooms          72.870122
5            Stories          41.427176
6          Wholesale          36.664429
7      Variety Store          30.093242
4      Garage Spaces           5.430233
3      Lot Size(SQF)          -4.504124
8        Supermarket         -41.862591
0         Year Built         -66.370092
10       Convenience         -76.229169
9        Supercenter        -117.110609
