### Import Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

### Load the Modeling Dataset

In [None]:
# Location of the modeling dataset. Set the MODELING_DATASET_PATH environment
# variable to run this notebook on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "MODELING_DATASET_PATH",
    r"C:\Users\estee\OneDrive\Desktop\Amdari\Project 1 - Stock_Mkt_Trend Analysis\Stock-analysis\data\modeling_dataset.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# (rows, columns) of the freshly loaded dataset
df.shape

In [None]:
# Column dtypes and non-null counts as read from the CSV (before any conversion)
df.info()

In [None]:
# Parse the raw `date` column into pandas datetimes so it can be compared and
# used for the chronological split further down.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Re-inspect the dtypes to confirm the effect of the datetime conversion on `date`
df.info()

### Define Features and Target

In [None]:
target_col = 'trend_label'

# Identifier/time columns and the target itself are not model inputs;
# every other column of df is treated as a candidate feature.
excluded_cols = {'ticker', 'date', target_col}
feature_cols = [name for name in df.columns if name not in excluded_cols]

print("Number of features:", len(feature_cols))
print("Sample features:", feature_cols[:10])


### Train-Test Split

In [None]:
# Temporal split: rows dated on or before the 0.7 quantile of `date` form the
# training set, strictly later rows form the test set (roughly 70% / 30%).
split_date = df['date'].quantile(0.7)

is_train_period = df['date'].le(split_date)
is_test_period = df['date'].gt(split_date)

# .copy() so later column assignments do not write back into df
train_df = df.loc[is_train_period].copy()
test_df = df.loc[is_test_period].copy()

print("Training shape:", train_df.shape)
print("Testing shape:", test_df.shape)
print("Split date threshold:", split_date)

### Observation of Findings

The final modeling dataset retains ticker and date columns to preserve traceability and temporal context during the train-test split.

The date column ensures that chronological order is maintained during the 70/30 temporal split, preventing data leakage.

The ticker column allows post-modeling analysis by company and supports potential sectoral comparisons.

Both columns will be excluded before model training but are retained in the preparation stage for reproducibility and transparency.

### Train-Test Split Statistics

In [None]:
# First and last date covered by each partition
train_start = train_df['date'].min()
train_end = train_df['date'].max()
test_start = test_df['date'].min()
test_end = test_df['date'].max()

print(f"Train period: {train_start.date()} - {train_end.date()}  ({len(train_df)} records)")
print(f"Test period:  {test_start.date()} - {test_end.date()}  ({len(test_df)} records)")

# Relative frequency of each trend class within each partition
for heading, split_frame in (
    ("\nTrain class distribution:", train_df),
    ("\nTest class distribution:", test_df),
):
    print(heading)
    print(split_frame['trend_label'].value_counts(normalize=True).round(3))


### Observation of Findings

The class proportions are consistent across training and testing sets.

There is no major imbalance, meaning all three market states (Uptrend, Sideways, and Downtrend) are well represented.

Minor differences are expected due to time variation in market behavior but remain within acceptable limits (<5%).

### Encode Categorical Data

In [None]:
# Encode `sector` with its own encoder so the sector <-> integer mapping stays
# available (e.g. sector_encoder.inverse_transform) after the target is encoded.
# Fit on the training rows only; the test rows reuse the learned mapping.
# Note: transform() raises ValueError if the test set contains a sector that
# never appears in the training set.
sector_encoder = LabelEncoder()
train_df['sector'] = sector_encoder.fit_transform(train_df['sector'])
test_df['sector']  = sector_encoder.transform(test_df['sector'])

# `encoder` is the name the target-encoding and evaluation cells work with;
# hand them a fresh, unfitted instance instead of the sector encoder.
encoder = LabelEncoder()

In [None]:
# Learn the class -> integer mapping from the training labels only, then apply
# that same mapping to both partitions. After this cell `encoder.classes_`
# holds the trend classes in encoded order.
encoder.fit(train_df['trend_label'])
train_df['trend_label'] = encoder.transform(train_df['trend_label'])
test_df['trend_label'] = encoder.transform(test_df['trend_label'])

### Observation of Findings

Both the target variable `trend_label` (Uptrend, Downtrend, Sideways) and the `sector` column are categorical.
A LabelEncoder was used to convert these categories into numerical class labels.

- The encoder was fitted on the training labels only

- The learned mapping was then applied to the test labels

This preserves temporal integrity and avoids leakage.

### Scale Numerical Features

In [None]:
# Define numerical features.
# Select only among `feature_cols` (everything except ticker, date and the
# target), so a numerically typed target or identifier can never end up being
# scaled and fed to the model. dtypes are read from the original `df`.
num_cols = df[feature_cols].select_dtypes(include=['int64', 'float64']).columns


### Observation of Findings

All numerical indicator columns (e.g., SMA, EMA, MACD, ATR, RSI, volatility, momentum) were automatically detected using select_dtypes. These features require scaling to ensure uniform magnitude across inputs.

In [None]:
# To check for skewness.
# Measured on the training partition only: this result drives the choice of
# scaler, so test-period rows are kept out of it (same rule as for fitting the
# encoder and scaler).
train_df[num_cols].skew().sort_values()


### Observation of Findings

A skewness check revealed that several features had moderate to strong positive skew (e.g., ATR, true range, SMA_200, EMA_12, close price). Financial indicators often display heavy-tailed distributions.

This informed the choice of scaler (RobustScaler).

In [None]:
# Display the numeric columns that will be scaled and used as model inputs
num_cols

In [None]:
scaler = RobustScaler()

# Always scale from the untouched values in `df`: train_df/test_df were
# filtered out of df and keep its row index, and df's numeric columns are never
# modified. This makes the cell safe to re-run (it never scales data that has
# already been scaled). Relies on df having a unique index, which is the case
# for the default index produced by read_csv.
train_raw = df.loc[train_df.index, num_cols]
test_raw = df.loc[test_df.index, num_cols]

# Fit on the training rows only; the test rows reuse the learned statistics.
train_df[num_cols] = scaler.fit_transform(train_raw)
test_df[num_cols]  = scaler.transform(test_raw)


### Observation of Findings

Because of the presence of skewed distributions and outliers in the data, a RobustScaler was selected.
- scaler.fit_transform() applied to training features only
- scaler.transform() applied to test features

This preserves consistency between datasets.

### Build Logistic Regression Model

In [None]:
# Model inputs are the scaled numeric columns; the target is the encoded label.
# NOTE(review): `feature_cols` and the encoded `sector` column are not used
# here. num_cols is built from df's int64/float64 columns, so a string-typed
# `sector` is not among the inputs. Confirm whether that is intended.
X_train = train_df[num_cols]
y_train = train_df[target_col]

X_test = test_df[num_cols]
y_test = test_df[target_col]

# max_iter raised above the library default to give the solver more iterations
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicted (encoded) class for every test row
preds = model.predict(X_test)

### Model Evaluation

In [None]:
# Encoded class ids are 0..n_classes-1, in the order of encoder.classes_.
# Passing them as `labels` keeps the report aligned with `target_names` even
# when a class is missing from both y_test and preds (otherwise sklearn raises
# because the number of classes found does not match target_names).
class_ids = list(range(len(encoder.classes_)))

print("Accuracy:", accuracy_score(y_test, preds))
print("\nClassification Report:\n", classification_report(y_test, preds, labels=class_ids, target_names=encoder.classes_))


### Predictable Trend Type

In [None]:
# Per-class precision/recall/F1 as a table (one row per class and summary line).
# `labels` pins the encoded class ids (0..n_classes-1) so the rows always line
# up with encoder.classes_, even if a class is absent from y_test and preds.
class_ids = list(range(len(encoder.classes_)))
report = classification_report(
    y_test,
    preds,
    labels=class_ids,
    target_names=encoder.classes_,
    output_dict=True,
)
pd.DataFrame(report).T


### Result Visualization

In [None]:
# Pin the label order so the matrix always has one row/column per encoded
# class (0..n_classes-1) and therefore matches the tick labels below, even if
# a class never occurs in y_test or preds.
class_ids = list(range(len(encoder.classes_)))
cm = confusion_matrix(y_test, preds, labels=class_ids)

# Rows are actual classes, columns are predicted classes.
fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=encoder.classes_, yticklabels=encoder.classes_,
            ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Logistic Regression")

plt.show()
