In [12]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge, ElasticNet
# Load the dataset
Ames = pd.read_csv('~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv')
# Numeric predictors: int64/float64 columns, leaving out 'PID' and the target 'SalePrice'
numeric_features = (
    Ames
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
    .columns
)
# Pipeline for numeric features: impute missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Preprocessor: apply the numeric pipeline to the numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])
# Target variable
y = Ames['SalePrice']
# Feature matrix (numeric columns only)
X = Ames[numeric_features.tolist()]
# Models to evaluate; each one is wrapped in a pipeline together with the preprocessor
models = {
    'Lasso': Lasso(max_iter=20000),
}
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # Cross-validate the full pipeline (preprocessing + regressor) with the default CV splitter
    scores = cross_val_score(pipeline, X, y)
    # Store a built-in float: NumPy scalars are printed as np.float64(...) under NumPy >= 2,
    # which would clutter the dict printed below
    results[name] = float(round(scores.mean(), 4))
# Output the cross-validation scores
print("Cross-validation scores with Simple Imputer:", results)

Cross-validation scores with Simple Imputer: {'Lasso': 0.8636}


In [13]:
# Read the Ames housing data
Ames = pd.read_csv(
    '~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv'
)
# Numeric predictors: int64/float64 columns, leaving out 'PID' and 'SalePrice'
numeric_features = (
    Ames
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
    .columns
)
# Categorical predictors: every object-dtype column
categorical_features = (
    Ames
    .select_dtypes(include=['object'])
    .columns
)
# Helper for the categorical pipeline: replace missing entries with the string "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string ``"None"``."""
    placeholder = "None"
    return X.fillna(placeholder)
# Pipeline for numeric features: impute missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Pipeline for categorical features: fill missing values with 'None', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combined preprocessor for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Target variable
y = Ames['SalePrice']
# All features (numeric columns followed by categorical columns)
X = Ames[numeric_features.tolist() + categorical_features.tolist()]
# Models to evaluate; each one is wrapped in a pipeline together with the preprocessor
models = {
    'Lasso': Lasso(max_iter=20000),
}
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no score was recorded
# for this run.
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # Cross-validate the full pipeline (preprocessing + regressor) with the default CV splitter
    scores = cross_val_score(pipeline, X, y)
    # Store a built-in float: NumPy scalars are printed as np.float64(...) under NumPy >= 2,
    # which would clutter the dict printed below
    results[name] = float(round(scores.mean(), 4))
# Output the cross-validation scores
print("Cross-validation scores with Simple Imputer:", results)

KeyboardInterrupt: 

In [14]:
# Read the Ames housing data
Ames = pd.read_csv(
    '~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv'
)
# Numeric predictors: int64/float64 columns, leaving out 'PID' and 'SalePrice'
numeric_features = (
    Ames
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
    .columns
)
# Categorical predictors: every object-dtype column
categorical_features = (
    Ames
    .select_dtypes(include=['object'])
    .columns
)
# Helper for the categorical pipeline: replace missing entries with the string "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string ``"None"``."""
    placeholder = "None"
    return X.fillna(placeholder)
# Pipeline for numeric features: impute missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Pipeline for categorical features: fill missing values with 'None', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combined preprocessor for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Target variable
y = Ames['SalePrice']
# All features (numeric columns followed by categorical columns)
X = Ames[numeric_features.tolist() + categorical_features.tolist()]
# Models to evaluate; each one is wrapped in a pipeline together with the preprocessor.
# Ridge and ElasticNet use their default hyperparameters here.
models = {
    'Lasso': Lasso(max_iter=20000),
    'Ridge': Ridge(),
    'Elastic Net': ElasticNet(),
}
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # Cross-validate the full pipeline (preprocessing + regressor) with the default CV splitter
    scores = cross_val_score(pipeline, X, y)
    # Store a built-in float: NumPy scalars are printed as np.float64(...) under NumPy >= 2,
    # which would clutter the dict printed below
    results[name] = float(round(scores.mean(), 4))
# Output the cross-validation scores
print("Cross-validation scores with Simple Imputer:", results)

Cross-validation scores with Simple Imputer: {'Lasso': 0.9094, 'Ridge': 0.913, 'Elastic Net': 0.8742}


In [16]:
# Import the necessary libraries
from sklearn.linear_model import Ridge, ElasticNet
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score
# Read the Ames housing data
Ames = pd.read_csv(
    '~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv'
)
# Numeric predictors: int64/float64 columns, leaving out 'PID' and 'SalePrice'
numeric_features = (
    Ames
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
    .columns
)
# Categorical predictors: object-dtype columns other than 'Electrical'
categorical_features = (
    Ames
    .select_dtypes(include=['object'])
    .columns
    .difference(['Electrical'])
)
# 'Electrical' is kept in its own single-item list so it can be routed to a separate transformer
electrical_feature = ['Electrical']
# Helper for the categorical pipeline: replace missing entries with the string "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string ``"None"``."""
    placeholder = "None"
    return X.fillna(placeholder)
# Pipeline for numeric features: impute missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Pipeline for general categorical features: fill missing values with 'None', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Specific transformer for 'Electrical': impute with the most frequent value, then one-hot encode
electrical_transformer = Pipeline(steps=[
    ('impute_electrical', SimpleImputer(strategy='most_frequent')),
    ('onehot_electrical', OneHotEncoder(handle_unknown='ignore'))
])
# Combined preprocessor for numeric, general categorical, and electrical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('electrical', electrical_transformer, electrical_feature)
    ])
# Target variable
y = Ames['SalePrice']
# All features (numeric, then general categorical, then 'Electrical')
X = Ames[numeric_features.tolist() + categorical_features.tolist() + electrical_feature]
# Models to evaluate; each one is wrapped in a pipeline together with the preprocessor.
# Ridge and ElasticNet use their default hyperparameters here.
models = {
    'Lasso': Lasso(max_iter=20000),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # Cross-validate the full pipeline (preprocessing + regressor) with the default CV splitter
    scores = cross_val_score(pipeline, X, y)
    # Store a built-in float: NumPy scalars are printed as np.float64(...) under NumPy >= 2,
    # which would clutter the dict printed below
    results[name] = float(round(scores.mean(), 4))
# Output the cross-validation scores
print("Cross-validation scores with Simple Imputer:", results)

Cross-validation scores with Simple Imputer: {'Lasso': 0.9093, 'Ridge': 0.913, 'ElasticNet': 0.8742}


In [18]:
# Import the necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  # This line is needed for IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score
# Read the Ames housing data
Ames = pd.read_csv(
    '~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv'
)
# Numeric predictors: int64/float64 columns, leaving out 'PID' and 'SalePrice'
numeric_features = (
    Ames
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
    .columns
)
# Categorical predictors: object-dtype columns other than 'Electrical'
categorical_features = (
    Ames
    .select_dtypes(include=['object'])
    .columns
    .difference(['Electrical'])
)
# 'Electrical' is kept in its own single-item list so it can be routed to a separate transformer
electrical_feature = ['Electrical']
# Helper for the categorical pipeline: replace missing entries with the string "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string ``"None"``."""
    placeholder = "None"
    return X.fillna(placeholder)
# Pipeline for numeric features: iterative imputation (seeded for repeatability), then standardize
numeric_transformer_advanced = Pipeline(steps=[
    ('impute_iterative', IterativeImputer(random_state=42)),
    ('scaler', StandardScaler())
])
# Pipeline for general categorical features: fill missing values with 'None', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Specific transformer for 'Electrical': impute with the most frequent value, then one-hot encode
electrical_transformer = Pipeline(steps=[
    ('impute_electrical', SimpleImputer(strategy='most_frequent')),
    ('onehot_electrical', OneHotEncoder(handle_unknown='ignore'))
])
# Combined preprocessor for numeric, general categorical, and electrical data
preprocessor_advanced = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_advanced, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('electrical', electrical_transformer, electrical_feature)
    ])
# Target variable
y = Ames['SalePrice']
# All features (numeric, then general categorical, then 'Electrical')
X = Ames[numeric_features.tolist() + categorical_features.tolist() + electrical_feature]
# Models to evaluate; each one is wrapped in a pipeline together with the preprocessor.
# Ridge and ElasticNet use their default hyperparameters here.
models = {
    'Lasso': Lasso(max_iter=20000),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no scores were recorded
# for the iterative imputer.
results_advanced = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor_advanced),
        ('regressor', model)
    ])
    # Cross-validate the full pipeline (preprocessing + regressor) with the default CV splitter
    scores = cross_val_score(pipeline, X, y)
    # Store a built-in float: NumPy scalars are printed as np.float64(...) under NumPy >= 2,
    # which would clutter the dict printed below
    results_advanced[name] = float(round(scores.mean(), 4))
# Output the cross-validation scores for advanced imputation
print("Cross-validation scores with Iterative Imputer:", results_advanced)

KeyboardInterrupt: 

In [24]:
# Import the necessary libraries
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score
# Read the Ames housing data
Ames = pd.read_csv(
    '~/Documents/NYCDSA/Third Project/ML_Ames_Housing/data/Ames_Housing_Price_Data.csv'
)
# Numeric predictors: int64/float64 columns, leaving out 'PID' and 'SalePrice'
numeric_features = (
    Ames
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
    .columns
)
# Categorical predictors: object-dtype columns other than 'Electrical'
categorical_features = (
    Ames
    .select_dtypes(include=['object'])
    .columns
    .difference(['Electrical'])
)
# 'Electrical' is kept in its own single-item list so it can be routed to a separate transformer
electrical_feature = ['Electrical']
# Helper for the categorical pipeline: replace missing entries with the string "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string ``"None"``."""
    placeholder = "None"
    return X.fillna(placeholder)
# Pipeline for numeric features: impute missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Pipeline for general categorical features: fill missing values with 'None', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Specific transformer for 'Electrical': impute with the most frequent value, then one-hot encode
electrical_transformer = Pipeline(steps=[
    ('impute_electrical', SimpleImputer(strategy='most_frequent')),
    ('onehot_electrical', OneHotEncoder(handle_unknown='ignore'))
])
# Combined preprocessor for numeric, general categorical, and electrical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('electrical', electrical_transformer, electrical_feature)
    ])
# Target variable
y = Ames['SalePrice']
# All features (numeric, then general categorical, then 'Electrical')
X = Ames[numeric_features.tolist() + categorical_features.tolist() + electrical_feature]
# Models to tune; each one is wrapped in a pipeline together with the preprocessor
models = {
    'Lasso': Lasso(max_iter=20000),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(max_iter=20000)
}
# Parameter grids, keyed by model name. The 'regressor__' prefix addresses the 'regressor'
# step of the pipeline built in the loop below.
# NOTE(review): the recorded output of this cell reports alpha=35 for Lasso and Ridge, which is
# not in the grids below; the saved output appears to come from a different grid. Re-run to refresh.
param_grids = {
    'Lasso': {'regressor__alpha': [10, 20, 30, 40]},
    'Ridge': {'regressor__alpha': [10, 20, 30, 40]},
    'ElasticNet': {'regressor__alpha': [0.01, 0.1, 1, 10], 'regressor__l1_ratio': [0.2, 0.5, 0.8]}
}
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    # Search this model's grid with 5-fold cross-validation, using all available cores
    grid_search = GridSearchCV(pipeline, param_grid=param_grids[name], cv=5, n_jobs=-1)
    grid_search.fit(X, y)
    # Store a built-in float: NumPy scalars are printed as np.float64(...) under NumPy >= 2,
    # which would clutter the dict printed below
    best_score = float(round(grid_search.best_score_, 4))
    best_params = grid_search.best_params_
    results[name] = {'Best Score': best_score, 'Best Params': best_params}
# Output the grid search results
print("Grid Search Results:", results)

Grid Search Results: {'Lasso': {'Best Score': 0.9135, 'Best Params': {'regressor__alpha': 35}}, 'Ridge': {'Best Score': 0.9112, 'Best Params': {'regressor__alpha': 35}}, 'ElasticNet': {'Best Score': 0.9135, 'Best Params': {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.8}}}
