In [1]:
# Baseline model: predict SalePrice from the single numeric feature GrLivArea,
# scoring it once on a hold-out split and once with 5-fold cross-validation.
import pandas as pd
# Import Linear Regression, Train-Test, Cross-Validation from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Load the Ames dataset
Ames = pd.read_csv('~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv')

# Select features and target
X = Ames[['GrLivArea']]  # Feature: GrLivArea (double brackets keep it a one-column DataFrame)
y = Ames['SalePrice']    # Target: SalePrice

# Split data into training and testing sets (20% held out; fixed seed makes the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression model using Train-Test: fit on the training rows, score on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
train_test_score = round(model.score(X_test, y_test), 4)
print(f"Train-Test R^2 Score: {train_test_score}")

# Perform 5-Fold Cross-Validation on the full X / y
cv_scores = cross_val_score(model, X, y, cv=5)
# Convert each score to a built-in float before rounding. Rounding a NumPy
# scalar leaves it a NumPy scalar, and NumPy >= 2 prints those inside a list
# as "np.float64(0.4884)" instead of "0.4884".
cv_scores_rounded = [round(float(score), 4) for score in cv_scores]
print(f"Cross-Validation R^2 Scores: {cv_scores_rounded}")

Train-Test R^2 Score: 0.4855
Cross-Validation R^2 Scores: [0.4884, 0.5412, 0.5214, 0.5458, 0.4672]


In [2]:
# One-hot encode every categorical (object-dtype) column of the Ames dataset
# that has no missing values, and report how much the column count grows.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load only categorical columns without missing values from the Ames dataset
Ames = pd.read_csv('~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv').select_dtypes(include=['object']).dropna(axis=1)
print(f"The shape of the DataFrame before One Hot Encoding is: {Ames.shape}")

# Build an encoder that returns a dense array. scikit-learn 1.2 renamed the
# `sparse` argument to `sparse_output` and 1.4 removed the old name, so try the
# current spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
Ames_One_Hot = encoder.fit_transform(Ames)

# Convert the encoded result back to a DataFrame, naming each new column
# after its source column and category
Ames_encoded_df = pd.DataFrame(Ames_One_Hot, columns=encoder.get_feature_names_out(Ames.columns))

# Display the new DataFrame and its expanded shape
print(Ames_encoded_df.head())
print(f"The shape of the DataFrame after One Hot Encoding is: {Ames_encoded_df.shape}")

The shape of the DataFrame before One Hot Encoding is: (2580, 27)
   MSZoning_A (agr)  MSZoning_C (all)  MSZoning_FV  MSZoning_I (all)  \
0               0.0               0.0          0.0               0.0   
1               0.0               0.0          0.0               0.0   
2               0.0               1.0          0.0               0.0   
3               0.0               0.0          0.0               0.0   
4               0.0               0.0          0.0               0.0   

   MSZoning_RH  MSZoning_RL  MSZoning_RM  Street_Grvl  Street_Pave  \
0          0.0          1.0          0.0          0.0          1.0   
1          0.0          1.0          0.0          0.0          1.0   
2          0.0          0.0          0.0          0.0          1.0   
3          0.0          1.0          0.0          0.0          1.0   
4          0.0          1.0          0.0          0.0          1.0   

   LotShape_IR1  ...  SaleType_New  SaleType_Oth  SaleType_VWD  SaleType_WD   \




In [5]:
# One-hot encode the complete categorical columns of the Ames dataset, then
# rank each categorical feature by how well it alone predicts SalePrice
# (mean R^2 of a linear regression under 5-fold cross-validation).
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

# Read the CSV once; both the categorical frame and the target come from it,
# so their rows stay aligned.
ames_full = pd.read_csv('~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv')

# Keep only categorical columns without missing values
Ames = ames_full.select_dtypes(include=['object']).dropna(axis=1)
print(f"The shape of the DataFrame before One Hot Encoding is: {Ames.shape}")

# Build an encoder that returns a dense array. scikit-learn 1.2 renamed the
# `sparse` argument to `sparse_output` and 1.4 removed the old name, so try the
# current spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
Ames_One_Hot = encoder.fit_transform(Ames)

# Convert the encoded result back to a DataFrame
Ames_encoded_df = pd.DataFrame(Ames_One_Hot, columns=encoder.get_feature_names_out(Ames.columns))

# Display the new DataFrame and its expanded shape
print(Ames_encoded_df.head())
print(f"The shape of the DataFrame after One Hot Encoding is: {Ames_encoded_df.shape}")

# Building on the code above to identify top categorical feature
# Set 'SalePrice' as the target variable
y = ames_full['SalePrice']

# Dictionary to store feature names and their corresponding mean CV R² scores
feature_scores = {}
for feature in Ames.columns:
    # Encode this one feature on its own; drop='first' omits one level per
    # feature so the dummy columns are not redundant with the intercept.
    encoder = OneHotEncoder(drop='first')
    X_encoded = encoder.fit_transform(Ames[[feature]])
    # Initialize the linear regression model
    model = LinearRegression()
    # Perform 5-fold cross-validation and calculate R^2 scores
    scores = cross_val_score(model, X_encoded, y, cv=5)
    mean_score = scores.mean()
    # Store the mean R^2 score
    feature_scores[feature] = mean_score

# Sort features based on their mean CV R² scores in descending order
sorted_features = sorted(feature_scores.items(), key=lambda item: item[1], reverse=True)
print("Feature selected for highest predictability:", sorted_features[0][0])

The shape of the DataFrame before One Hot Encoding is: (2580, 27)
   MSZoning_A (agr)  MSZoning_C (all)  MSZoning_FV  MSZoning_I (all)  \
0               0.0               0.0          0.0               0.0   
1               0.0               0.0          0.0               0.0   
2               0.0               1.0          0.0               0.0   
3               0.0               0.0          0.0               0.0   
4               0.0               0.0          0.0               0.0   

   MSZoning_RH  MSZoning_RL  MSZoning_RM  Street_Grvl  Street_Pave  \
0          0.0          1.0          0.0          0.0          1.0   
1          0.0          1.0          0.0          0.0          1.0   
2          0.0          0.0          0.0          0.0          1.0   
3          0.0          1.0          0.0          0.0          1.0   
4          0.0          1.0          0.0          0.0          1.0   

   LotShape_IR1  ...  SaleType_New  SaleType_Oth  SaleType_VWD  SaleType_WD   \




Feature selected for highest predictability: Neighborhood


In [6]:
# Report the five best-scoring categorical features.
# sorted_features holds (feature, mean CV R²) pairs, highest score first,
# so the leading slice is the top five.
top_five = sorted_features[:5]
print("Top 5 Categorical Features:")
for feature, score in top_five:
    print(f"{feature}: Mean CV R² = {score:.4f}")

Top 5 Categorical Features:
Neighborhood: Mean CV R² = 0.5406
ExterQual: Mean CV R² = 0.4649
KitchenQual: Mean CV R² = 0.4369
Foundation: Mean CV R² = 0.2545
HeatingQC: Mean CV R² = 0.1889
