In [212]:
import pandas as pd
from tqdm import tqdm
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from datetime import datetime

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt
import matplotlib as mpl

# **Preprocessing**

In [213]:
# Step 0 : filter on small caps and micro caps
#def filter_data(df, small_cap_value=1, micro_cap_value=1):
#    """Filters rows based on small_cap and micro_cap values and returns a copy of the filtered DataFrame."""
#    filtered_df = df[(df['small_cap'] == small_cap_value) & (df['micro_cap'] == micro_cap_value)].copy()
#    return filtered_df

In [214]:
# Step 1 : target creation + train_test_split
# Creating target variables to automate creation of quarterly, yearly and 2-yearly targets, because well, DON'T REPEAT YOURSELF!
def create_target_variable(df, frequency:int, threshold):
    if frequency == 1:
        col = 'mc_qtr_growth_pct'
    if frequency == 4:
        col = 'mc_yr_growth_pct'
    if frequency == 8:
        col = 'mc_2yr_growth_pct'
   #else:
   #    raise ValueError("Invalid frequency. Use 1 (quarterly), 4 (yearly), or 8 (2-year).")
    df[col] = df[col].shift(-frequency)
    df.dropna(subset=col, inplace=True)
    target_func = lambda x: 1 if ((x[col] > threshold) & (x.small_cap == 1)) else 0
    df['target'] = df.apply(target_func, axis=1)
    return df

In [215]:
def drop_columns(df, cols_to_drop=None):
    """Drops specified columns from the DataFrame."""
    if cols_to_drop is None:
        # Default columns to drop if none are specified
        cols_to_drop = ['cik', 'CIK', 'date', 'stprba', 'quarter', 'year']
    return df.drop(cols_to_drop, axis=1, errors='ignore')


In [216]:
# Creating a custom function for the group split
def group_train_test_split(data, test_size=0.2, random_state=None):
    # We split by groups (company ticker) while keeping the data structure intact.
    unique_groups = data['TICKER'].unique()
    train_groups, test_groups = train_test_split(unique_groups, test_size=test_size, random_state=random_state)

    # Split into train and test sets
    X_train = data[data['TICKER'].isin(train_groups)]
    X_test = data[data['TICKER'].isin(test_groups)]

    # Define columns to drop: Ticker, cik, date, quarter, year + growth columns
    cols_to_drop = ['mc_qtr_growth', 'mc_qtr_growth_pct', 'mc_yr_growth', 'mc_yr_growth_pct', 'mc_2yr_growth', 'mc_2yr_growth_pct']

    # Drop unwanted columns
    X_train = drop_columns(X_train, cols_to_drop + ['cik', 'CIK', 'date', 'stprba', 'quarter', 'year'])
    X_test = drop_columns(X_test, cols_to_drop + ['cik', 'CIK', 'date', 'stprba', 'quarter', 'year'])

    # Extract the target variable from the dataset
    y_train = data[data['TICKER'].isin(train_groups)]['target']
    y_test = data[data['TICKER'].isin(test_groups)]['target']

    return X_train, X_test, y_train, y_test


In [217]:
# Step 2: Identify numerical and categorical features
def identify_feature_types(df):
    """Identifies the numerical and categorical columns in the DataFrame."""
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()

    # Exclude 'Ticker' from categorical features as it's not needed for transformation
    if 'TICKER' in categorical_features:
        categorical_features.remove('TICKER')

    return numerical_features, categorical_features

In [218]:
# Step 3: Create preprocessing pipeline for numerical and categorical features
def create_preprocessing_pipeline(numerical_features, categorical_features):
    """Creates the preprocessing pipeline for numerical and categorical features."""
    # Preprocessing for numerical data: RobustScaler to make our numbers más robusto.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Handle NaNs
        ('scaler', RobustScaler())  # Scale the data
    ])

    # Preprocessing for categorical data: OneHotEncoder to give each category its own columm...
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categories
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Encode categories
    ])

    # Combine the transformers into one big ColumnTransformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        #remainder='passthrough'  # Columns not specified in 'num' or 'cat' will be passed through unmodified
    )
    return preprocessor

In [219]:
# Step 4: Function to preprocess data in training mode (fitting the pipeline)
def preprocess_training_data(X_train, preprocessor=None):
    """Fits and transforms the training data using the provided pipeline."""
    if preprocessor is None:
        # Identify feature types
        numerical_features, categorical_features = identify_feature_types(X_train)
        preprocessor = create_preprocessing_pipeline(numerical_features, categorical_features)

    # Fit and transform the training data
    X_train_processed = preprocessor.fit_transform(X_train)

    return X_train_processed, preprocessor

In [220]:
# Step 5: Function to preprocess new/unseen/test data in production mode (only transforming)
def preprocess_new_data(X_new, preprocessor):
    """Transforms new/unseen/test data using a pre-fitted pipeline."""
    if preprocessor is None:
        raise ValueError("The preprocessor must be fitted on training data first before transforming new data.")

    # Transform the new data (no fitting here)
    X_new_processed = preprocessor.transform(X_new)
    return X_new_processed

# **Training**

In [221]:
def save_model(model, model_type, model_dir='~/models/'):
    """Saves the trained model with a timestamp."""
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    model_filename = f'{model_type}_{timestamp}.pkl'

    # Ensure model directory exists
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Save the trained model
    model_path = os.path.join(model_dir, model_filename)
    with open(model_path, 'wb') as f_model:
        pickle.dump(model, f_model)

    print(f"Model saved to: {model_path}")
    return model_path

def evaluate_model(model, X_train, y_train, X_test, y_test, scoring_metrics=['accuracy', 'precision', 'recall', 'f1']):
    """Evaluates the model with cross-validation and test set metrics."""
    cv_metrics = {}
    for metric in scoring_metrics:
        with tqdm(total=5, desc=f"Cross-Validation ({metric})", bar_format='{l_bar}{bar} [elapsed: {elapsed} left: {remaining}]') as pbar:
            cv_metrics[metric] = cross_val_score(model, X_train, y_train, cv=5, scoring=metric)
            pbar.update(5)

    print(f"Cross-validated Metrics: {', '.join([f'{m}: {cv_metrics[m].mean():.4f}' for m in cv_metrics])}")

    # Test on the test set
    y_pred_test = model.predict(X_test)

    # Calculate test set metrics
    test_metrics = {
        'accuracy': accuracy_score(y_test, y_pred_test),
        'precision': precision_score(y_test, y_pred_test),
        'recall': recall_score(y_test, y_pred_test),
        'f1': f1_score(y_test, y_pred_test)
    }

    # Combine cross-validated and test metrics
    metrics = {**{f'cv_{m}': cv_metrics[m].mean() for m in cv_metrics}, **test_metrics}
    return metrics

def train_logistic_regression_and_save(X_train, y_train, X_test, y_test, model_dir='~/models/'):
    """Trains, evaluates a logistic regression model, saves the trained model, and returns evaluation metrics."""

    model_type = 'logistic_regression'
    model = LogisticRegression(C=0.001, max_iter=2000, solver='lbfgs')

    # Train model with a progress bar
    with tqdm(total=100, desc=f"Training {model_type}", bar_format='{l_bar}{bar} [elapsed: {elapsed} left: {remaining}]') as pbar:
        model.fit(X_train, y_train)
        pbar.update(100)

    # Check number of iterations
    print(f"Number of iterations: {model.n_iter_}")

    # Evaluate the model
    metrics = evaluate_model(model, X_train, y_train, X_test, y_test)

    # Save the model
    save_model(model, model_type, model_dir)

    return metrics, model

In [222]:
def save_model(model, model_type, model_dir='~/models/'):
    """Saves the trained model with a timestamp."""
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    model_filename = f'{model_type}_{timestamp}.pkl'

    # Ensure model directory exists
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Save the trained model
    model_path = os.path.join(model_dir, model_filename)
    with open(model_path, 'wb') as f_model:
        pickle.dump(model, f_model)

    print(f"Model saved to: {model_path}")
    return model_path

def evaluate_model(model, X_train, y_train, X_test, y_test, scoring_metrics=['accuracy', 'precision', 'recall', 'f1']):
    """Evaluates the model with cross-validation and test set metrics."""
    cv_metrics = {}
    for metric in scoring_metrics:
        with tqdm(total=5, desc=f"Cross-Validation ({metric})", bar_format='{l_bar}{bar} [elapsed: {elapsed} left: {remaining}]') as pbar:
            cv_metrics[metric] = cross_val_score(model, X_train, y_train, cv=5, scoring=metric)
            pbar.update(5)

    print(f"Cross-validated Metrics: {', '.join([f'{m}: {cv_metrics[m].mean():.4f}' for m in cv_metrics])}")

    # Test on the test set
    y_pred_test = model.predict(X_test)

    # Calculate test set metrics
    test_metrics = {
        'accuracy': accuracy_score(y_test, y_pred_test),
        'precision': precision_score(y_test, y_pred_test),
        'recall': recall_score(y_test, y_pred_test),
        'f1': f1_score(y_test, y_pred_test)
    }

    # Combine cross-validated and test metrics
    metrics = {**{f'cv_{m}': cv_metrics[m].mean() for m in cv_metrics}, **test_metrics}
    return metrics


def train_knn_and_save(X_train, y_train, X_test, y_test, model_dir='~/models/'):
    """Trains, evaluates a K-Nearest Neighbors model, saves the trained model, and returns evaluation metrics."""

    model_type = 'knn'
    knn = KNeighborsClassifier()

    # Train model with a progress bar
    with tqdm(total=100, desc=f"Training {model_type}", bar_format='{l_bar}{bar} [elapsed: {elapsed} left: {remaining}]') as pbar:
        knn.fit(X_train, y_train)
        pbar.update(100)

    # Evaluate the model
    metrics = evaluate_model(knn, X_train, y_train, X_test, y_test)

    # Save the model
    save_model(knn, model_type, model_dir)

    return metrics, knn

# Grid search

In [171]:
def run_grid_search(X_train, y_train):
    """Runs a grid search on logistic regression model to find the best hyperparameters."""

    # Define the parameter grid for Logistic Regression
    param_grid = {
        'solver': ['saga', 'lbfgs'],  # Different solvers
        'max_iter': [1500, 2000, 2500],  # Number of iterations
        'C': [0.005, 0.007, 0.01, 0.12, 0.15]  # Regularization strength
    }

    # Create a Logistic Regression model
    logistic_model = LogisticRegression()

    # Set up the GridSearchCV
    grid_search = GridSearchCV(
        estimator=logistic_model,
        param_grid=param_grid,
        scoring='precision',  # Choose appropriate scoring metric
        cv=5,  # Number of cross-validation folds
        n_jobs=-1,  # Use all available cores
        verbose=1  # Verbosity level
    )

    # Fit Grid Search
    grid_search.fit(X_train, y_train)

    # Best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best parameters: {best_params}")
    print(f"Best cross-validation score: {best_score:.4f}")

    # Get the best model
    best_model = grid_search.best_estimator_

    return best_model, best_params, best_score

# **Running the functions on the dataset**

In [223]:

# preprocessor import create_target_variable, group_train_test_split, identify_feature_types, create_preprocessing_pipeline,preprocess_training_data, preprocess_new_data,train_logistic_regression, filter_data


df = pd.read_csv('~/Small-Cap-Scout/raw_data/data_for_preprocessing.csv', index_col=[0])


In [224]:
df.head()

Unnamed: 0,cik,date,Assets,AssetsCurrent,Cash,AssetsNoncurrent,Liabilities,LiabilitiesCurrent,LiabilitiesNoncurrent,Equity,...,TICKER,market_cap,mc_qtr_growth,mc_qtr_growth_pct,mc_yr_growth,mc_yr_growth_pct,mc_2yr_growth,mc_2yr_growth_pct,small_cap,micro_cap
0,1750,2011-02-28,1655991000.0,927839000.0,54716000.0,409295000.0,851395000.0,419182000.0,432213000.0,804596000.0,...,AIR,1045.889727,46.783392,0.046825,77.281557,0.079786,,,1,0
1,1750,2011-05-31,1703727000.0,913985000.0,57433000.0,465365000.0,868438000.0,416010000.0,452428000.0,835289000.0,...,AIR,1024.472219,-21.417508,-0.020478,306.796787,0.427487,,,1,0
2,1750,2011-08-31,1752372000.0,944247000.0,35523000.0,472856000.0,903243000.0,350085000.0,553158000.0,849129000.0,...,AIR,882.619592,-141.852627,-0.138464,255.395538,0.407184,,,1,0
3,1750,2011-11-30,1821612000.0,955053000.0,27870000.0,521431000.0,958220000.0,374944000.0,583276000.0,863392000.0,...,AIR,727.886752,-154.73284,-0.175311,-271.219583,-0.271462,,,1,0
4,1750,2012-02-29,2220293000.0,1065389000.0,59294000.0,797765000.0,1328974000.0,560986000.0,767988000.0,891319000.0,...,AIR,899.522315,171.635564,0.2358,-146.367411,-0.139945,-69.085854,-0.071325,1,0


In [225]:
df.columns

Index(['cik', 'date', 'Assets', 'AssetsCurrent', 'Cash', 'AssetsNoncurrent',
       'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent', 'Equity',
       'HolderEquity', 'RetainedEarnings', 'AdditionalPaidInCapital',
       'TreasuryStockValue', 'TemporaryEquity', 'RedeemableEquity',
       'LiabilitiesAndEquity', 'Revenues', 'CostOfRevenue', 'GrossProfit',
       'OperatingExpenses', 'OperatingIncomeLoss',
       'IncomeLossFromContinuingOperationsBeforeIncomeTaxExpenseBenefit',
       'AllIncomeTaxExpenseBenefit', 'IncomeLossFromContinuingOperations',
       'IncomeLossFromDiscontinuedOperationsNetOfTax', 'ProfitLoss',
       'NetIncomeLossAttributableToNoncontrollingInterest', 'NetIncomeLoss',
       'NetCashProvidedByUsedInOperatingActivitiesContinuingOperations',
       'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations',
       'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
       'NetCashProvidedByUsedInOperatingActivities',
       'NetCa

In [226]:
# Step 2: Drop unwanted columns before target creation
df_cleaned = drop_columns(df, cols_to_drop=['cik', 'CIK', 'date', 'stprba', 'quarter', 'year'])

In [227]:
# Step 3: Create target variables and split the data
df_qtr = create_target_variable(df_cleaned, frequency=1, threshold=0.5)
df_yr = create_target_variable(df_cleaned, frequency=4, threshold=0.5)
df_2yr = create_target_variable(df_cleaned, frequency=8, threshold=0.5)

In [228]:
X_train_qtr, X_test_qtr, y_train_qtr, y_test_qtr = group_train_test_split(df_qtr)
X_train_yr, X_test_yr, y_train_yr, y_test_yr = group_train_test_split(df_yr)
X_train_2yr, X_test_2yr, y_train_2yr, y_test_2yr = group_train_test_split(df_2yr)

In [229]:
# Step 4:  Identify feature types after splitting
numerical_features_qtr, categorical_features_qtr = identify_feature_types(X_train_qtr)
numerical_features_yr, categorical_features_yr = identify_feature_types(X_train_yr)
numerical_features_2yr, categorical_features_2yr = identify_feature_types(X_train_2yr)

In [230]:
# Step 5: Create the preprocessing pipeline
preprocessor_qtr = create_preprocessing_pipeline(numerical_features_qtr, categorical_features_qtr)
preprocessor_yr = create_preprocessing_pipeline(numerical_features_yr, categorical_features_yr)
preprocessor_2yr = create_preprocessing_pipeline(numerical_features_2yr, categorical_features_2yr)


In [231]:
# Step 6: Preprocess the training data
X_train_qtr_processed, preprocessor_qtr = preprocess_training_data(X_train_qtr, preprocessor=preprocessor_qtr)
X_train_yr_processed, preprocessor_yr = preprocess_training_data(X_train_yr, preprocessor=preprocessor_yr)
X_train_2yr_processed, preprocessor_2yr = preprocess_training_data(X_train_2yr, preprocessor=preprocessor_2yr)


In [232]:
# Step 7 : then the test data
X_test_qtr_processed = preprocess_new_data(X_test_qtr, preprocessor_qtr)
X_test_yr_processed = preprocess_new_data(X_test_yr, preprocessor_yr)
X_test_2yr_processed = preprocess_new_data(X_test_2yr, preprocessor_2yr)

In [182]:
# Step 8 :Train for quarterly (frequency=1), yearly (frequency=4), and 2-year (frequency=8) predictions
y_pred_qtr, model_qtr = train_logistic_regression_and_save(X_train_qtr_processed, y_train_qtr, X_test_qtr_processed, y_test_qtr)
y_pred_yr, model_yr = train_logistic_regression_and_save(X_train_yr_processed, y_train_yr, X_test_yr_processed, y_test_yr)
y_pred_2yr, model_2yr = train_logistic_regression_and_save(X_train_2yr_processed, y_train_2yr, X_test_2yr_processed, y_test_2yr)


Training logistic_regression: 100%|██████████ [elapsed: 00:05 left: 00:00]


Number of iterations: [522]


Cross-Validation (accuracy): 100%|██████████ [elapsed: 00:19 left: 00:00]
Cross-Validation (precision): 100%|██████████ [elapsed: 00:20 left: 00:00]
Cross-Validation (recall): 100%|██████████ [elapsed: 00:20 left: 00:00]
Cross-Validation (f1): 100%|██████████ [elapsed: 00:19 left: 00:00]


Cross-validated Metrics: accuracy: 0.6693, precision: 0.3502, recall: 0.5252, f1: 0.4107
Model saved to: ~/models/logistic_regression_2024-09-09_23-25-20.pkl


Training logistic_regression: 100%|██████████ [elapsed: 00:05 left: 00:00]


Number of iterations: [449]


Cross-Validation (accuracy): 100%|██████████ [elapsed: 00:14 left: 00:00]
Cross-Validation (precision): 100%|██████████ [elapsed: 00:14 left: 00:00]
Cross-Validation (recall): 100%|██████████ [elapsed: 00:14 left: 00:00]
Cross-Validation (f1): 100%|██████████ [elapsed: 00:14 left: 00:00]


Cross-validated Metrics: accuracy: 0.6653, precision: 0.3492, recall: 0.5252, f1: 0.4107
Model saved to: ~/models/logistic_regression_2024-09-09_23-26-24.pkl


Training logistic_regression: 100%|██████████ [elapsed: 00:02 left: 00:00]


Number of iterations: [147]


Cross-Validation (accuracy): 100%|██████████ [elapsed: 00:08 left: 00:00]
Cross-Validation (precision): 100%|██████████ [elapsed: 00:07 left: 00:00]
Cross-Validation (recall): 100%|██████████ [elapsed: 00:08 left: 00:00]
Cross-Validation (f1): 100%|██████████ [elapsed: 00:07 left: 00:00]

Cross-validated Metrics: accuracy: 0.6605, precision: 0.3446, recall: 0.5406, f1: 0.4137
Model saved to: ~/models/logistic_regression_2024-09-09_23-26-57.pkl





In [233]:
# Step 8 :Train for quarterly (frequency=1), yearly (frequency=4), and 2-year (frequency=8) predictions
y_pred_qtr, model_qtr = train_knn_and_save(X_train_qtr_processed, y_train_qtr, X_test_qtr_processed, y_test_qtr)
y_pred_yr, model_yr = train_knn_and_save(X_train_yr_processed, y_train_yr, X_test_yr_processed, y_test_yr)
y_pred_2yr, model_2yr = train_knn_and_save(X_train_2yr_processed, y_train_2yr, X_test_2yr_processed, y_test_2yr)


Training knn: 100%|██████████ [elapsed: 00:00 left: 00:00]
Cross-Validation (accuracy): 100%|██████████ [elapsed: 00:11 left: 00:00]
Cross-Validation (precision): 100%|██████████ [elapsed: 00:11 left: 00:00]
Cross-Validation (recall): 100%|██████████ [elapsed: 00:11 left: 00:00]
Cross-Validation (f1): 100%|██████████ [elapsed: 00:11 left: 00:00]


Cross-validated Metrics: accuracy: 0.8196, precision: 0.6458, recall: 0.4181, f1: 0.5059
Model saved to: ~/models/knn_2024-09-10_17-15-28.pkl


Training knn: 100%|██████████ [elapsed: 00:00 left: 00:00]
Cross-Validation (accuracy): 100%|██████████ [elapsed: 00:12 left: 00:00]
Cross-Validation (precision): 100%|██████████ [elapsed: 00:11 left: 00:00]
Cross-Validation (recall): 100%|██████████ [elapsed: 00:11 left: 00:00]
Cross-Validation (f1): 100%|██████████ [elapsed: 00:11 left: 00:00]


Cross-validated Metrics: accuracy: 0.8248, precision: 0.6522, recall: 0.4303, f1: 0.5168
Model saved to: ~/models/knn_2024-09-10_17-16-19.pkl


Training knn: 100%|██████████ [elapsed: 00:00 left: 00:00]
Cross-Validation (accuracy): 100%|██████████ [elapsed: 00:12 left: 00:00]
Cross-Validation (precision): 100%|██████████ [elapsed: 00:13 left: 00:00]
Cross-Validation (recall): 100%|██████████ [elapsed: 00:16 left: 00:00]
Cross-Validation (f1): 100%|██████████ [elapsed: 00:20 left: 00:00]


Cross-validated Metrics: accuracy: 0.8223, precision: 0.6478, recall: 0.4195, f1: 0.5071
Model saved to: ~/models/knn_2024-09-10_17-17-29.pkl


In [None]:
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
def plot_metrics(history):
  metrics = ['loss', 'accuracy', 'precision', 'recall']
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2,2,n+1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      plt.ylim([0.8,1])
    else:
      plt.ylim([0,1])
    plt.legend()

In [30]:
'''# Step 9: Save models to the /Small-Cap-Scout/models folder
import os
import pickle

# Define the directory path in Google Drive
model_dir = '~/models/'

# Check if the directory exists, if not, create it
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save models
with open(os.path.join(model_dir, 'model_qtr.pkl'), 'wb') as f_qtr:
    pickle.dump(model_qtr, f_qtr)

with open(os.path.join(model_dir, 'model_yr.pkl'), 'wb') as f_yr:
    pickle.dump(model_yr, f_yr)

with open(os.path.join(model_dir, 'model_2yr.pkl'), 'wb') as f_2yr:
    pickle.dump(model_2yr, f_2yr)'''

In [27]:
# Step 10: Print metrics for each model
print("1 Quarter Ahead Metrics:", y_pred_qtr)
print("1 Year Ahead Metrics:", y_pred_yr)
print("2 Years Ahead Metrics:", y_pred_2yr)

1 Quarter Ahead Metrics: {'cv_accuracy': np.float64(0.5861177525890302), 'cv_precision': np.float64(0.28886531098782686), 'cv_recall': np.float64(0.3746608634582686), 'cv_f1': np.float64(0.32563369780978374), 'accuracy': 0.5739110464671209, 'precision': np.float64(0.28430531732418524), 'recall': np.float64(0.3731007315700619), 'f1': np.float64(0.32270625456315405)}
1 Year Ahead Metrics: {'cv_accuracy': np.float64(0.5847497987573081), 'cv_precision': np.float64(0.2876763455297774), 'cv_recall': np.float64(0.37019406810692057), 'cv_f1': np.float64(0.3235231859899204), 'accuracy': 0.6013502387617322, 'precision': np.float64(0.3058765674944677), 'recall': np.float64(0.3812442537542139), 'f1': np.float64(0.33942701227830835)}
2 Years Ahead Metrics: {'cv_accuracy': np.float64(0.5855911208007135), 'cv_precision': np.float64(0.2907180862789915), 'cv_recall': np.float64(0.378417465607723), 'cv_f1': np.float64(0.32827805522691694), 'accuracy': 0.6088211708099439, 'precision': np.float64(0.304605

In [67]:
# Let's try a grid search - QUARTER
best_model_qtr, best_params_qtr, best_score_qtr = run_grid_search(X_train_qtr_processed, y_train_qtr)

print(f"Best Model - QUARTER: {best_model_qtr.__class__.__name__}")
print(f"Best Parameters - QUARTER: {best_params_qtr}")
print(f"Best Score - QUARTER: {best_score_qtr:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits




In [136]:
# Let's try a grid search - YEAR
best_model_yr, best_params_yr, best_score_yr = run_grid_search(X_train_yr_processed, y_train_yr)

print(f"Best Model - YEAR: {best_model_yr.__class__.__name__}")
print(f"Best Parameters - YEAR: {best_params_yr}")
print(f"Best Score - YEAR: {best_score_yr:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters: {'C': 0.15, 'max_iter': 1500, 'solver': 'lbfgs'}
Best cross-validation score: 0.3543
Best Model - YEAR: LogisticRegression
Best Parameters - YEAR: {'C': 0.15, 'max_iter': 1500, 'solver': 'lbfgs'}
Best Score - YEAR: 0.3543


In [None]:
# Let's try a grid search - 2 YEAR
best_model_2yr, best_params_2yr, best_score_2yr = run_grid_search(X_train_2yr_processed, y_train_2yr)

print(f"Best Model - YEAR: {best_model_2yr.__class__.__name__}")
print(f"Best Parameters - YEAR: {best_params_2yr}")
print(f"Best Score - YEAR: {best_score_2yr:.4f}")