In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Step 1: Load y_trains, y_tests, X_train and X_test from CSV files
y_train_qtr = pd.read_csv('~/Small-Cap-Scout/raw_data/y_train_qtr.csv')
y_train_yr = pd.read_csv('~/Small-Cap-Scout/raw_data/y_train_yr.csv')
y_train_2yr = pd.read_csv('~/Small-Cap-Scout/raw_data/y_train_2yr.csv')

y_test_qtr = pd.read_csv('~/Small-Cap-Scout/raw_data/y_test_qtr.csv')
y_test_yr = pd.read_csv('~/Small-Cap-Scout/raw_data/y_test_yr.csv')
y_test_2yr = pd.read_csv('~/Small-Cap-Scout/raw_data/y_test_2yr.csv')

X_train_processed = pd.read_csv('~/Small-Cap-Scout/raw_data/X_train_processed.csv')
X_test_processed = pd.read_csv('~/Small-Cap-Scout/raw_data/X_test_processed.csv')

# Step 2: Convert growth to binary classification for training data
y_train_qtr_class = (y_train_qtr > 0.5).astype(int) # 1 if growth > 50%, else 0
y_train_yr_class = (y_train_yr > 0.5).astype(int)
y_train_2yr_class = (y_train_2yr > 0.5).astype(int)

# Convert growth to binary classification for test data
y_test_qtr_class = (y_test_qtr > 0.5).astype(int)
y_test_yr_class = (y_test_yr > 0.5).astype(int)
y_test_2yr_class = (y_test_2yr > 0.5).astype(int)

# Step 4: Train logistic regression for 1 quarter ahead (using mc_qtr_growth)
logistic_model_qtr = LogisticRegression(max_iter=1000)
logistic_model_qtr.fit(X_train_processed, y_train_qtr_class)

# Step 5: Train logistic regression for 1 year ahead (using mc_yr_growth)
logistic_model_yr = LogisticRegression(max_iter=1000)
logistic_model_yr.fit(X_train_processed, y_train_yr_class)

# Step 6: Train logistic regression for 2 years ahead (using mc_2yr_growth)
logistic_model_2yr = LogisticRegression(max_iter=1000)
logistic_model_2yr.fit(X_train_processed, y_train_2yr_class)

# Step 6: Evaluate using cross-validation
#1 quarter ahead
cv_scores_qtr = cross_val_score(logistic_model_qtr, X_train_processed, y_train_qtr_class, cv=5, scoring='accuracy')

#1 year ahead
cv_scores_yr = cross_val_score(logistic_model_yr, X_train_processed, y_train_yr_class, cv=5, scoring='accuracy')

#2 years ahead
cv_scores_2yr = cross_val_score(logistic_model_2yr, X_train_processed, y_train_2yr_class, cv=5, scoring='accuracy')

# Step 7: Test on the test sets
y_pred_test_qtr = logistic_model_qtr.predict(X_test_processed)

y_pred_test_yr = logistic_model_yr.predict(X_test_processed)

y_pred_test_2yr = logistic_model_2yr.predict(X_test_processed)

# Step 8: Print performance metrics
print(f"1 quarter ahead Cross-validated Accuracy: {cv_scores_qtr.mean():.4f}\n")

  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [None]:
print(f"1 year ahead Cross-validated Accuracy: {cv_scores_yr.mean():.4f}\n")

In [None]:
print(f"2 years ahead Cross-validated Accuracy: {cv_scores_2yr.mean():.4f}\n")

In [None]:
print("\nClassification Report on 1 quarter ahead Test Set:\n")

print(classification_report(y_test_qtr_class, y_pred_test_qtr))

In [None]:
print("\nClassification Report on 1 year ahead Test Set:\n")

print(classification_report(y_test_yr_class, y_pred_test_yr))

In [None]:
print("\nClassification Report on 2 years ahead Test Set:\n")

print(classification_report(y_test_2yr_class, y_pred_test_2yr))