### Problem Statement

Your task is to build a model to forecast the vector Y using the variables in X and Z.   
You can assume that the conditional expectation of Y given X is linear in X. 

1.	Examine and present the main characteristics of the data.
2.	Propose a forecasting model for Y only using the variables in X without Z and explain its properties.
3.	Further improve the modeling from (2) with both X and Z.
4.	Evaluate the quality of your models and of their parameter estimates. Which one produces the best forecast? Interpret why.

## Assumptions:
  1. I'm assuming that the index is significant and will key off of it; that is to say, index _n_ in df X corresponds to index _n_ in df Y.
  2. From analyzing the data, I don't think it's a time series, as it doesn't look Brownian.

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

: 

In [None]:
# Load the three datasets; the first CSV column becomes the row index.
data_pth = 'interview_problem_sets/Aquatic/data/'
X, Y, Z = (
    pd.read_csv(data_pth + file_name, index_col=0)
    for file_name in ('X.csv', 'Y.csv', 'Z.csv')
)

# Column labels come back from read_csv as strings; cast them to int.
for frame in (X, Y, Z):
    frame.columns = frame.columns.astype(int)



### Exploratory data analysis

In [None]:
# Check for missing values and describe the data.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after each
# report.
for label, frame in (('X', X), ('Y', Y), ('Z', Z)):
    print(f'========{label}========')
    frame.info()
    display(frame.describe())

## Notes
 - There seem to be some missing data points in X that have no counterpart in Y and Z
 - The first task is cleaning the dataset so that some relationships between Y and Z can be shown
 - Since the nulls make up such a small part of the dataset, we will drop them rather than trying to force them to some value
   - it is also important here to drop the corresponding index in the other datasets

## Cleaning

In [None]:
# Show every row that contains at least one null, for X, Y and Z in turn.
# (Author's note from the original run: Y and Z are fine.)
for frame in (X, Y, Z):
    row_has_null = frame.isnull().any(axis=1)
    display(frame[row_has_null])


In [None]:
# Drop every row of X that has a null in any column.
X = X.dropna(axis=0, how='any')

In [None]:
# Count fully duplicated rows in each dataset.
for label, frame in (
    ('X duplicates:', X),
    ('Y duplicates:', Y),
    ('Z duplicates:', Z),
):
    print(label, frame.duplicated().sum())

## Some exploratory plotting showing the main characteristics of the data 

In [None]:
#helper function to generate plots
def plot_data(data: pd.DataFrame, title: str) -> None:
    """Render a fixed set of exploratory plots for ``data``.

    Draws, in order: per-column histograms, a boxplot, a scatterplot, an
    annotated correlation heatmap and a seaborn pairplot. Every figure gets a
    ``"<title> <plot kind>"`` super-title and is shown immediately.

    Args:
        data: DataFrame whose columns are plotted.
        title: Label prefixed to each figure's super-title; also used as the
            legend title of the scatterplot.
    """
    # histogram
    # DataFrame.hist creates its own figure, so no plt.figure() call precedes
    # it; an extra call would only leave an empty figure in the output.
    data.hist(bins=50, figsize=(10, 10))
    plt.suptitle(f"{title} Histogram", fontsize=16)
    plt.show()

    # boxplot
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=data)
    plt.suptitle(f"{title} Boxplot", fontsize=16)
    plt.show()

    # scatterplot
    plt.figure(figsize=(10, 5))
    sns.scatterplot(data=data)
    plt.suptitle(f"{title} Scatterplot", fontsize=16)
    # add key for the columns
    plt.legend(title=title)
    plt.show()

    # corr matrix
    plt.figure(figsize=(10, 5))
    sns.heatmap(data.corr(), annot=True)
    plt.suptitle(f"{title} Correlation Matrix", fontsize=16)
    plt.show()

    # pair plot (sns.pairplot builds its own figure)
    sns.pairplot(data)
    plt.suptitle(f"{title} Pairplot", fontsize=16)
    plt.show()
    

In [None]:
# First look at X before any outlier handling.
# Author's observation: there seem to be some outliers that can be cleaned up.
plot_data(data=X, title='X [pre cleaning]')

In [None]:
# Winsorize X: clip each column to its own 1st / 95th percentile.
# NOTE(review): the bounds are asymmetric (0.01 vs 0.95) — confirm intended.
lower_bounds = X.quantile(0.01)
upper_bounds = X.quantile(0.95)
X_cleaned = X.clip(lower=lower_bounds, upper=upper_bounds, axis=1)

# Select from Y and Z the rows whose index labels appear in X_cleaned, so the
# three frames stay aligned row-for-row.
kept_index = X_cleaned.index
Y_cleaned = Y.loc[kept_index]
Z_cleaned = Z.loc[kept_index]

plot_data(X_cleaned, 'X [post cleaning]')

### Notes: 
- After removing outliers, the plots seem to have improved significantly, as we can now see the more nuanced variance in the dataset
- Of note: in the scatterplot there seems to be some clear separation between the columns of the dataset, possibly allowing for some explanation of the variance

In [None]:
# Take a look at Y.
# The correlation matrix and pairplot are not necessary here but are included
# for completeness.
# Author's observation: Y seems cleaner, with no outliers.
plot_data(data=Y, title='Y')

## Forecasting Model of Y based on X 

In [None]:
# linear
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# The data doesn't look Brownian, so it is assumed not to be a time series and
# is split at random rather than chronologically.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_cleaned, Y_cleaned, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the features the models are trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Scaled copy of every row, kept for later cells that re-split it. Splitting
# it with the same test_size / random_state yields the same row partition as
# above, because the shuffle depends only on the row count and the seed.
X_scaled = scaler.transform(X_cleaned)

# Baseline: always predict the training mean of Y (per column). The prediction
# is built as an explicit array with y_test's shape instead of a Python list of
# whatever np.mean returns for a DataFrame.
mean_prediction = np.asarray(y_train, dtype=float).mean(axis=0)
baseline_prediction = np.full(np.shape(y_test), mean_prediction)
mse_baseline = mean_squared_error(y_test, baseline_prediction)
print(f'MSE of baseline (mean predictor): {mse_baseline}')
# this needs a lot of work

## Model Selection
- As this data is purely numeric with no hints as to the context, I'm doing a broad selection of models to find the best MSE before I optimize 

In [None]:
# lets try a few different models
from sklearn.ensemble import RandomForestRegressor # random forest
from sklearn.neighbors import KNeighborsRegressor # knn (don't think this will work well but lets try it)
from sklearn.svm import SVR 
from sklearn.linear_model import Lasso # l1
from sklearn.linear_model import Ridge # l2 
from sklearn.linear_model import ElasticNet # mixture
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# create a pipeline for each model
# Each entry wraps a single estimator, except 'pca', which chains a PCA
# projection into a random forest. Step names double as the prefixes of the
# matching keys in param_grids.
pipelines = {
    'rf': Pipeline([('rf', RandomForestRegressor(random_state=42))]),
    'knn': Pipeline([('knn', KNeighborsRegressor())]),
    'svr': Pipeline([('svr', SVR())]),
    'lasso': Pipeline([('lasso', Lasso())]),
    'ridge': Pipeline([('ridge', Ridge())]),
    'elastic': Pipeline([('elastic', ElasticNet())]),
    # Seeded like the random forests so that repeated runs are comparable.
    'gb': Pipeline([('gb', GradientBoostingRegressor(random_state=42))]),
    'pca': Pipeline([('pca', PCA()), ('rf', RandomForestRegressor(random_state=42))]),
}

# create a parameter grid for each model
param_grids = {
    'rf': {
        'rf__n_estimators': [10, 100, 1000],
        'rf__max_depth': [None, 5, 10, 15, 20],
        'rf__min_samples_split': [2, 5, 10],
    },
    'knn': {
        'knn__n_neighbors': [3, 5, 7, 9],
        'knn__weights': ['uniform', 'distance'],
    },
    'svr': {
        'svr__C': [0.1, 1, 10],
        'svr__kernel': ['linear', 'poly', 'rbf'],
    },
    'lasso': {'lasso__alpha': [0.1, 1, 10]},
    'ridge': {'ridge__alpha': [0.1, 1, 10]},
    'elastic': {
        'elastic__alpha': [0.1, 1, 10],
        'elastic__l1_ratio': [0.1, 0.5, 0.9],
    },
    'gb': {
        'gb__n_estimators': [10, 100, 1000],
        'gb__learning_rate': [0.001, 0.01, 0.1],
        'gb__max_depth': [3, 5, 7],
    },
    # PCA cannot keep more components than it has input columns. The feature
    # subsets searched in this notebook hold at most four columns, so the grid
    # stops at 4 (the previous values 5-20 could never be fitted).
    'pca': {'pca__n_components': [1, 2, 3, 4]},
}

In [None]:
# grid search on features and models
# Every non-empty subset of the four X columns, largest subsets first.
feature_combinations = [
    [0, 1, 2, 3],
    [0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3],
    [0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3],
    [0], [1], [2], [3],
]

best_model = None            # name of the winning pipeline
best_mse = float('inf')      # held-out MSE of the winning configuration
best_features = None         # column subset of the winning configuration
best_cv_mse = float('inf')   # cross-validated MSE used to pick the winner
best_params = None           # hyperparameters of the winning configuration
best_estimator = None        # refitted pipeline of the winning configuration

for features in feature_combinations:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled[:, features], Y_cleaned, test_size=0.2, random_state=42
    )
    y_train_flat = y_train.values.ravel()  # fits dimensionality better

    for model_name, pipeline in pipelines.items():
        param_grid = dict(param_grids[model_name])

        # PCA cannot keep more components than there are input columns. Drop
        # such values here, because a grid in which every candidate fails
        # makes GridSearchCV raise instead of returning a result.
        if 'pca__n_components' in param_grid:
            usable = [
                k for k in param_grid['pca__n_components']
                if not isinstance(k, int) or k <= len(features)
            ]
            if not usable:
                print(f"Skipping {model_name} with features {features}: no valid pca__n_components")
                print('=====================')
                continue
            param_grid['pca__n_components'] = usable

        print(f"Running GridSearchCV for {model_name} with features {features}")
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=5,
            n_jobs=-1,
            verbose=1,
            scoring='neg_mean_squared_error',
        )
        grid_search.fit(X_train, y_train_flat)
        print(f"Best parameters: {grid_search.best_params_}")

        # The score is a negated MSE, so flip the sign back.
        cv_mse = -grid_search.best_score_
        print(f"Cross-validated Mean Squared Error: {cv_mse}")

        y_pred = grid_search.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        print(f"Mean Squared Error: {mse}")
        print('=====================')

        # Pick the winner on the cross-validated score. Choosing on the test
        # MSE would turn the test set into a tuning set and make the reported
        # best_mse optimistically biased.
        if cv_mse < best_cv_mse:
            best_cv_mse = cv_mse
            best_mse = mse
            best_model = model_name
            best_features = features
            best_params = grid_search.best_params_
            best_estimator = grid_search.best_estimator_

In [None]:
# train the models
# Fit every candidate pipeline with its default hyperparameters on the feature
# subset chosen by the grid search, and report each one's held-out MSE.
# NOTE(review): this cell previously used `models`,
# `X_train_scaled_feature_clip` and `X_test_scaled`, none of which is defined
# in the visible notebook; it now uses `pipelines`, `X_scaled` and
# `best_features` instead — confirm this matches the intended comparison.
if best_features is None:
    raise RuntimeError('best_features is not set; run the grid search cell first')

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled[:, best_features], Y_cleaned, test_size=0.2, random_state=42
)
for name, model in pipelines.items():
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'{name} MSE: {mse}')