In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error



In [2]:
# Read the raw ocean measurements from disk.
data = pd.read_csv("ocean_data.csv")

# Discard the four non-numerical columns so that only numeric columns remain.
non_numeric_cols = ['Wea', 'Cloud_Typ', 'Cloud_Amt', 'Visibility']
df = data.drop(columns=non_numeric_cols)

# T_degC is the response; every other remaining column is a predictor.
numerical_features = df.drop(columns=['T_degC'])
target_variable = df['T_degC']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    numerical_features,
    target_variable,
    test_size=0.25,
    random_state=307,
)


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression

In [4]:
# Mean imputation -> degree-2 polynomial expansion -> standardization -> least squares.
# The step names are looked up later through pipe.named_steps, so they are kept as-is.
linear_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('standardize', StandardScaler()),
    ('model', LinearRegression()),
]
pipe = Pipeline(linear_steps)

# fit() returns the fitted pipeline, so the test-set predictions can be chained onto it.
yhat = pipe.fit(X_train, y_train).predict(X_test)

In [5]:

# (c) Training error: predict on the rows the pipeline was fitted to
ytrain_predict = pipe.predict(X_train)
training_mse = mean_squared_error(y_true=y_train, y_pred=ytrain_predict)
print("Training MSE:", training_mse)

# (d) Test error: predict on the held-out rows
ytest_pred = pipe.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=ytest_pred)
print("Test MSE:", test_mse)

# Put the test MSE in context by expressing it as a fraction of the spread of
# the held-out target values.
variance_ytest = y_test.var()
mse_to_variance = test_mse / variance_ytest
print("Test MSE compared to the variance of ytest:", mse_to_variance)

Training MSE: 1.535095636242086
Test MSE: 1.7543308610948567
Test MSE compared to the variance of ytest: 0.11833893287694301


In [6]:
# Pull the fitted slopes and the matching polynomial term names out of the pipeline.
linear_model = pipe.named_steps['model']
coefficients = linear_model.coef_
feature_names = pipe.named_steps['poly_features'].get_feature_names_out()

# One row per polynomial term, sorted by signed coefficient value from the most
# positive to the most negative (not by absolute magnitude).
coefficients_df = pd.DataFrame(
    {'Feature': feature_names, 'Coefficient': coefficients}
).sort_values(by='Coefficient', ascending=False)

# (b) The three most positive coefficients sit at the top of the sorted table
top_positive = coefficients_df.iloc[:3]

# (c) The three most negative coefficients sit at the bottom of the sorted table
top_negative = coefficients_df.iloc[-3:]

# (d) Estimated y-intercept of the fitted model
y_intercept = linear_model.intercept_

# Print the results
for heading, table in (
    ("\nFeatures and coefficients with the three largest positive coefficients:", top_positive),
    ("\nFeatures and coefficients with the three largest negative coefficients:", top_negative),
):
    print(heading)
    print(table)
print("\nEstimated y-intercept of the fitted model:", y_intercept)



Features and coefficients with the three largest positive coefficients:
   Feature  Coefficient
9    x0 x2   147.569766
12   x0 x5    35.153778
8    x0 x1    32.217666

Features and coefficients with the three largest negative coefficients:
  Feature  Coefficient
5      x5   -34.383804
1      x1   -37.244657
2      x2  -146.550613

Estimated y-intercept of the fitted model: 11.607865265528682


In [7]:
# Only the last expression of a cell is displayed, so this cell shows
# feature_names; the bare coefficients_df line above it produces no output.
coefficients_df
feature_names

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x0^2', 'x0 x1', 'x0 x2',
       'x0 x3', 'x0 x4', 'x0 x5', 'x0 x6', 'x1^2', 'x1 x2', 'x1 x3',
       'x1 x4', 'x1 x5', 'x1 x6', 'x2^2', 'x2 x3', 'x2 x4', 'x2 x5',
       'x2 x6', 'x3^2', 'x3 x4', 'x3 x5', 'x3 x6', 'x4^2', 'x4 x5',
       'x4 x6', 'x5^2', 'x5 x6', 'x6^2'], dtype=object)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
import time

# (a) KNN pipeline; the imputation strategy and polynomial degree are left at
# their defaults here because the search below sets them.
knn_steps = [
    ('impute', SimpleImputer()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('model', KNeighborsRegressor()),
]
knn_pipe = Pipeline(knn_steps)

# (b) Hyperparameter grid: 2 x 3 x 20 x 2 = 240 combinations
params = dict(
    impute__strategy=['mean', 'median'],
    poly__degree=[1, 2, 3],
    model__n_neighbors=[5 * k for k in range(1, 21)],  # 5, 10, ..., 100
    model__weights=['uniform', 'distance'],
)

# (c) Exhaustive 10-fold search scored by negative MSE; the fit is timed so it
# can be compared against other search strategies.
knn_gs = GridSearchCV(
    estimator=knn_pipe,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=10,
)
start_time = time.time()
knn_gs.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# (d) Best hyperparameter combination and its cross-validated score
# (the score is the *negative* MSE, so values closer to zero are better)
print("Best Hyperparameters:", knn_gs.best_params_)
print("Best Negative MSE:", knn_gs.best_score_)

# (e) Predict the test data with the selected pipeline
ytest_pred_optimized = knn_gs.predict(X_test)

# (f) Test MSE of the selected pipeline
test_mse_optimized = mean_squared_error(y_true=y_test, y_pred=ytest_pred_optimized)
print("Test MSE with Optimized Pipeline:", test_mse_optimized)

# (g) Reference point: test MSE of the linear pipeline from part (2), using its
# stored predictions yhat
test_mse_part2 = mean_squared_error(y_true=y_test, y_pred=yhat)
print("Time taken to fit GridSearchCV:", elapsed_time)
print("Test MSE in part (2):", test_mse_part2)



Best Hyperparameters: {'impute__strategy': 'mean', 'model__n_neighbors': 15, 'model__weights': 'distance', 'poly__degree': 1}
Best Negative MSE: -2.009963445544637
Test MSE with Optimized Pipeline: 2.087222401430895
Time taken to fit GridSearchCV: 111.91906237602234
Test MSE in part (2): 1.7543308610948567


In [9]:
from sklearn.model_selection import RandomizedSearchCV

# (a) Same KNN pipeline layout as the grid search, built fresh for this search
knn_random_steps = [
    ('impute', SimpleImputer()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('model', KNeighborsRegressor()),
]
knn_pipe_random = Pipeline(knn_random_steps)

# (b) Candidate values to sample from (same 240-combination space as the grid search)
params_random = dict(
    impute__strategy=['mean', 'median'],
    poly__degree=[1, 2, 3],
    model__n_neighbors=[5 * k for k in range(1, 21)],  # 5, 10, ..., 100
    model__weights=['uniform', 'distance'],
)

# (c) Randomized 10-fold search: only n_iter=10 sampled combinations are
# evaluated; the fixed random_state makes the sample reproducible.
knn_rs = RandomizedSearchCV(
    estimator=knn_pipe_random,
    param_distributions=params_random,
    scoring='neg_mean_squared_error',
    cv=10,
    n_iter=10,
    random_state=42,
)
start_time_random = time.time()
knn_rs.fit(X_train, y_train)
elapsed_time_random = time.time() - start_time_random

# (d) Best sampled combination and its cross-validated (negative) MSE
print("Best Hyperparameters (RandomizedSearchCV):", knn_rs.best_params_)
print("Best Negative MSE (RandomizedSearchCV):", knn_rs.best_score_)

# (e) Predict the test data with the selected pipeline
ytest_pred_optimized_random = knn_rs.predict(X_test)

# (f) Test MSE of the selected pipeline
test_mse_optimized_random = mean_squared_error(
    y_true=y_test, y_pred=ytest_pred_optimized_random
)
print("Test MSE with Optimized Pipeline (RandomizedSearchCV):", test_mse_optimized_random)

# (g) Reference point: test MSE of the linear pipeline from part (2)
test_mse_part2 = mean_squared_error(y_true=y_test, y_pred=yhat)
print("Test MSE in part (2):", test_mse_part2)

# (h) Side-by-side comparison with the exhaustive grid search (relies on
# knn_gs and elapsed_time from the grid-search cell)
print("\nComparison of Results:")
print("Time taken to fit RandomizedSearchCV:", elapsed_time_random)
print("Time taken to fit GridSearchCV:", elapsed_time)
print("Best Hyperparameters (GridSearchCV):", knn_gs.best_params_)
print("Best Negative MSE (GridSearchCV):", knn_gs.best_score_)
print("Best Hyperparameters (RandomizedSearchCV):", knn_rs.best_params_)
print("Best Negative MSE (RandomizedSearchCV):", knn_rs.best_score_)


Best Hyperparameters (RandomizedSearchCV): {'poly__degree': 1, 'model__weights': 'distance', 'model__n_neighbors': 10, 'impute__strategy': 'mean'}
Best Negative MSE (RandomizedSearchCV): -2.051993277466038
Test MSE with Optimized Pipeline (RandomizedSearchCV): 2.080069382923031
Test MSE in part (2): 1.7543308610948567

Comparison of Results:
Time taken to fit RandomizedSearchCV: 4.119412899017334
Time taken to fit GridSearchCV: 111.91906237602234
Best Hyperparameters (GridSearchCV): {'impute__strategy': 'mean', 'model__n_neighbors': 15, 'model__weights': 'distance', 'poly__degree': 1}
Best Negative MSE (GridSearchCV): -2.009963445544637
Best Hyperparameters (RandomizedSearchCV): {'poly__degree': 1, 'model__weights': 'distance', 'model__n_neighbors': 10, 'impute__strategy': 'mean'}
Best Negative MSE (RandomizedSearchCV): -2.051993277466038


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Reload the full dataset so the categorical columns are available again.
data = pd.read_csv("ocean_data.csv")

# Split the columns by type: float64/int64 columns vs. the four categorical ones.
numeric_with_target = data.select_dtypes(include=['float64', 'int64'])
categorical_features = data[['Wea', 'Cloud_Typ', 'Cloud_Amt', 'Visibility']]

# T_degC is the response; the remaining numeric columns are predictors.
target_variable = numeric_with_target['T_degC']
numeric_features = numeric_with_target.drop(columns=['T_degC'])

# Same 75/25 split and seed as the numeric-only analysis, now over all predictors.
# NOTE: this rebinds X_train / X_test / y_train / y_test for every later cell.
all_features = pd.concat([numeric_features, categorical_features], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    target_variable,
    test_size=0.25,
    random_state=307,
)

# Numeric branch: mean-impute, expand to degree-2 polynomial terms, standardize.
numeric_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, one-hot encode (categories not
# seen during fit are ignored), then keep the top 50% of the encoded columns
# as ranked by f_regression.
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('select', SelectPercentile(f_regression, percentile=50)),
])

# Route each group of columns, selected by name, through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features.columns),
        ('cat', categorical_transformer, categorical_features.columns),
    ]
)

# Full model: preprocessing followed by a distance-weighted 20-nearest-neighbour regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=20, weights='distance')),
])

# (a) Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# (b) Training and test MSE
# NOTE(review): with weights='distance' each training row is presumably its own
# zero-distance neighbour, so a near-zero training MSE is expected here and says
# little about generalisation -- judge the model on the test MSE.
y_train_pred = pipeline.predict(X_train)
training_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
print("Training MSE:", training_mse)

y_test_pred = pipeline.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print("Test MSE:", test_mse)

# (c) Compare with the linear model from part (2); test_mse_part2 comes from an earlier cell
print("Test MSE from part (2):", test_mse_part2)


Training MSE: 7.932936716529455e-13
Test MSE: 1.4623776711365377
Test MSE from part (2): 1.7543308610948567


In [24]:
# Load the data to score.
# NOTE(review): this reads the same file the model was trained on, although the
# step is described as scoring *new* data -- confirm the intended input file.
new_data = pd.read_csv("ocean_data.csv")

# Keep exactly the columns the fitted pipeline saw during training, in the same
# order. This keeps the target (T_degC) out of the model input and avoids
# rebinding numeric_features / categorical_features, which the preprocessor
# definition was built from. A missing column fails loudly with a KeyError.
X_new = new_data[list(pipeline.feature_names_in_)]

# Predict water temperature using the selected model
y_new_pred = pipeline.predict(X_new)

# Save one prediction per row, with no header row and no index column
pd.DataFrame(y_new_pred).to_csv("predictions.csv", header=False, index=False)

In [15]:
# SECTION 2
# Load icecreamcone2.csv; logBV is the response and every other column is a predictor.
icecream_df = pd.read_csv("icecreamcone2.csv")

y = icecream_df['logBV']
X = icecream_df.drop(columns=['logBV'])

In [16]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Standardize the features first.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# A degree-1 polynomial expansion adds no new terms; it is used here only to
# add the intercept (bias) column to the standardized features.
poly = PolynomialFeatures(degree=1)
X_processed = poly.fit_transform(X_standardized)

# X_processed now holds the standardized features plus the intercept column.


In [17]:
# Turn the target into an (n, 1) column vector for the matrix algebra that follows.
# It is rebuilt from icecream_df rather than from y itself so the cell can be
# re-run safely: once y is an ndarray it no longer has a .values attribute.
y = icecream_df['logBV'].to_numpy().reshape(-1, 1)


In [18]:
import numpy as np

# Ordinary least squares via the normal equations (X'X) beta = X'y, where
# X_processed is the standardized design matrix with its intercept column and
# y is the (n, 1) target. Solving the linear system directly avoids forming the
# explicit inverse, which costs more and is less numerically stable.
betas = np.linalg.solve(X_processed.T @ X_processed, X_processed.T @ y)


In [19]:
# Display the coefficient vector estimated from the normal equations.
betas

array([[ 4.84548985],
       [-0.25818952],
       [-0.08677703],
       [-0.29603492],
       [ 0.28041044]])

In [20]:
import numpy as np

def linear_regression_gradient_descent(X, y, learning_rate, epochs):
    """
    Fit linear-regression coefficients by batch gradient descent on the MSE.

    Every epoch takes one full-batch step
    ``betas -= learning_rate * (2 / m) * X.T @ (X @ betas - y)``,
    starting from an all-zero coefficient vector.

    Parameters:
    - X: 2D array-like of shape (m, p) with the features; include a column of
      ones if an intercept is wanted
    - y: array-like with the m target values, either 1D or an (m, 1) column
    - learning_rate: step size for gradient descent
    - epochs: number of iterations

    Returns:
    - betas: (p, 1) numpy array with the estimated coefficients

    Raises:
    - ValueError: if X is not 2D, if there are no samples, or if X and y
      disagree on the number of samples
    """
    # Accept lists / pandas objects as well as ndarrays, and shape the target
    # into a column once instead of on every iteration.
    X = np.asarray(X, dtype=float)
    y_col = np.asarray(y, dtype=float).reshape(-1, 1)

    # Number of samples
    m = y_col.shape[0]

    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features)")
    if m == 0:
        raise ValueError("at least one sample is required")
    if X.shape[0] != m:
        # Without this check a length-1 y would silently broadcast.
        raise ValueError(
            f"X has {X.shape[0]} samples but y has {m} target values"
        )

    # Initialize coefficients
    betas = np.zeros((X.shape[1], 1))

    # (2 / m) * X.T does not change between epochs, so compute it once.
    scaled_Xt = 2 / m * X.T

    for _ in range(epochs):
        # Residuals of the current fit
        error = X @ betas - y_col

        # Gradient of the mean squared error with respect to betas
        gradients = scaled_Xt @ error

        # Step against the gradient
        betas -= learning_rate * gradients

    return betas


In [21]:
# Gradient-descent settings
learning_rate = 0.01
epochs = 1000

# Fit on X_processed (the standardized and processed feature matrix) and y
# (the target vector with an added dimension).
estimated_betas = linear_regression_gradient_descent(
    X_processed, y, learning_rate=learning_rate, epochs=epochs
)
print("Estimated Coefficients:", estimated_betas)


Estimated Coefficients: [[ 4.84548984]
 [-0.25795717]
 [-0.08664055]
 [-0.29577712]
 [ 0.28076943]]


In [None]:
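# (d) Your answer: The gradient descent betas are very close to the normal equation betas: the intercepts agree to about eight decimal places and every slope differs by less than 0.001. The normal equation betas are the exact least-squares solution, so the small differences mean that gradient descent had not fully converged after 1000 epochs, not that it is more accurate; running more epochs would shrink the gap further.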