<a href="https://colab.research.google.com/github/datascientist-ld1981/Finance-StockPrediction-/blob/main/stocksipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**STOCK & FINANCES**

**Data Collection from Apple Vantage **

The program retrieves and merges historical stock price data with detailed financial metrics for a specified company. Using the yfinance library, it fetches stock data, including open, high, low, close prices, and trading volume, over a defined date range. Simultaneously, it utilizes the Alpha Vantage API to collect comprehensive financial details, such as Market Cap, PE Ratio, Revenue, Net Income, Debt to Equity, and Profit Margin. The datasets are then aligned and merged based on the company ticker, creating a unified dataset with both historical and financial insights. Finally, the combined data is saved as a CSV file, enabling streamlined analysis for financial research, backtesting, or machine learning applications.



In [None]:
pip install yfinance pandas requests




In [None]:
import yfinance as yf
import pandas as pd
import requests

# Alpha Vantage API key
ALPHA_VANTAGE_API_KEY = "FCXK9F3UX20DM0RU"

def fetch_stock_price_data(ticker, start_date, end_date):
    """
    Fetch historical stock price data using yfinance.
    """
    data = yf.download(ticker, start=start_date, end=end_date)
    data.reset_index(inplace=True)
    data = data.rename(columns={
        "Open": "Open Price",
        "High": "High Price",
        "Low": "Low Price",
        "Close": "Close Price",
        "Adj Close": "Adjusted Close Price",
        "Volume": "Trading Volume"
    })
    return data

def fetch_alpha_vantage_financials(ticker):
    """
    Fetch additional financial data using Alpha Vantage's OVERVIEW endpoint.
    """
    url = f"https://www.alphavantage.co/query"
    params = {
        "function": "OVERVIEW",
        "symbol": ticker,
        "apikey": ALPHA_VANTAGE_API_KEY,
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        financial_data = {
            "Ticker": ticker,
            "Market Cap": data.get("MarketCapitalization", ""),
            "PE Ratio": data.get("PERatio", ""),
            "Beta": data.get("Beta", ""),
            "EPS (Earnings Per Share)": data.get("EPS", ""),
            "Forward PE": data.get("ForwardPE", ""),
            "Revenue": data.get("RevenueTTM", ""),
            "Gross Profit": data.get("GrossProfitTTM", ""),
            "Operating Income": data.get("OperatingIncomeTTM", ""),
            "Net Income": data.get("NetIncomeTTM", ""),
            "Debt to Equity": data.get("DebtEquityRatio", ""),
            "Return on Equity (ROE)": data.get("ReturnOnEquityTTM", ""),
            "Current Ratio": data.get("CurrentRatio", ""),
            "Dividend Yield": data.get("DividendYield", ""),
            "Free Cash Flow": data.get("FreeCashFlowTTM", ""),
            "Profit Margin": data.get("ProfitMargin", ""),
            "Cash Ratio": data.get("CashRatio", ""),
            "Quick Ratio": data.get("QuickRatio", ""),
            "Price to Book Ratio": data.get("PriceToBookRatio", ""),
            "Enterprise Value": data.get("EnterpriseValue", ""),
            "Total Debt": data.get("TotalDebt", ""),
            "Total Assets": data.get("TotalAssets", ""),
            "Total Equity": data.get("TotalShareholderEquity", ""),
            "Trailing Twelve Months (TTM) Revenue": data.get("RevenueTTM", ""),
            "Trailing Twelve Months (TTM) EBITDA": data.get("EBITDA", ""),
            "Trailing Twelve Months (TTM) Earnings": data.get("NetIncomeTTM", ""),
        }
        return financial_data
    else:
        print(f"Failed to fetch Alpha Vantage data for {ticker}. Status Code: {response.status_code}")
        return {}

def merge_data(price_data, financial_data):
    """
    Merge stock price data with financial data.
    """
    if not financial_data:
        print("No financial data to merge.")
        return price_data

    # Ensure price_data has a flat index
    if isinstance(price_data.columns, pd.MultiIndex):
        price_data.columns = [' '.join(col).strip() for col in price_data.columns.values]

    # Convert financial data into a DataFrame
    financial_df = pd.DataFrame([financial_data])

    # Merge the datasets
    merged_data = price_data.merge(financial_df, how="left", left_on="Ticker", right_on="Ticker")
    return merged_data

def main():
    # Parameters
    ticker = "AAPL"  # Example stock ticker for Apple
    start_date = "2022-01-01"
    end_date = "2023-01-01"

    print("Fetching stock price data...")
    price_data = fetch_stock_price_data(ticker, start_date, end_date)
    price_data["Ticker"] = ticker  # Add Ticker for merging

    print("Fetching financial data...")
    financial_data = fetch_alpha_vantage_financials(ticker)

    print("Merging datasets...")
    final_data = merge_data(price_data, financial_data)

    # Save the final dataset to a CSV file
    final_data.to_csv(f"{ticker}_stock_financials.csv", index=False)
    print(f"Merged dataset saved to {ticker}_stock_financials.csv")

if __name__ == "__main__":
    main()


[*********************100%***********************]  1 of 1 completed

Fetching stock price data...
Fetching financial data...





Merging datasets...
Merged dataset saved to AAPL_stock_financials.csv


# **Data Cleaning**


In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the Dataset
try:
    data = pd.read_csv("AAPL_stock_financials.csv")
    if data.empty:
        print("The dataset is empty. Please provide a valid dataset with data entries.")
        exit()
except FileNotFoundError:
    print("The file 'AAPL_stock_financials.csv' was not found. Please check the file path and name.")
    exit()

# 1. Standardize Column Names
data.columns = data.columns.str.strip().str.lower().str.replace(" ", "_")
print("Cleaned Column Names:", data.columns.tolist())

# Mapping for required columns
column_mapping = {
    'open_price_aapl': 'open',
    'high_price_aapl': 'high',
    'low_price_aapl': 'low',
    'close_price_aapl': 'close',
    'trading_volume_aapl': 'volume'
}

# Rename columns based on the mapping
data.rename(columns=column_mapping, inplace=True)

# 2. Verify Required Columns
required_columns = ['open', 'high', 'low', 'close', 'volume']
missing_cols = [col for col in required_columns if col not in data.columns]
if missing_cols:
    print(f"Missing required columns: {missing_cols}")
    for col in missing_cols:
        data[col] = np.nan  # Create missing columns with NaN values.

# 3. Handle Missing Values
for col in required_columns:
    if col in data.columns:
        median_value = data[col].median()
        data[col] = data[col].fillna(median_value)

# 4. Standardize Data Formats
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    print("Date column converted to datetime format.")

data[required_columns] = data[required_columns].apply(pd.to_numeric, errors='coerce')

# 5. Remove Outliers with Reduced Strictness
for col in required_columns:
    if col in data.columns:
        # Relaxed IQR Multiplier
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 2.0 * IQR  # Increased multiplier to 2.0
        upper_bound = Q3 + 2.0 * IQR

        # Filter outliers based on relaxed IQR
        initial_count = len(data)
        data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
        final_count = len(data)
        print(f"Relaxed IQR: Removed {initial_count - final_count} outliers from column '{col}'.")

        # Relaxed Z-Score Threshold
        z_scores = stats.zscore(data[col], nan_policy='omit')
        data = data[np.abs(z_scores) < 3.5]  # Increased threshold to 3.5

# 6. Save the Cleaned Dataset
if not data.empty:
    data.to_csv("AAPL_stock_financials_cleaned.csv", index=False)
    print("Data cleaning completed successfully. Cleaned file saved as 'AAPL_stock_financials_cleaned.csv'.")
else:
    print("Data cleaning resulted in an empty dataset. Please review the data and processing steps.")


Cleaned Column Names: ['date', 'close_price_aapl', 'high_price_aapl', 'low_price_aapl', 'open_price_aapl', 'trading_volume_aapl', 'ticker', 'market_cap', 'pe_ratio', 'beta', 'eps_(earnings_per_share)', 'forward_pe', 'revenue', 'gross_profit', 'operating_income', 'net_income', 'debt_to_equity', 'return_on_equity_(roe)', 'current_ratio', 'dividend_yield', 'free_cash_flow', 'profit_margin', 'cash_ratio', 'quick_ratio', 'price_to_book_ratio', 'enterprise_value', 'total_debt', 'total_assets', 'total_equity', 'trailing_twelve_months_(ttm)_revenue', 'trailing_twelve_months_(ttm)_ebitda', 'trailing_twelve_months_(ttm)_earnings']
Date column converted to datetime format.
Relaxed IQR: Removed 0 outliers from column 'open'.
Relaxed IQR: Removed 0 outliers from column 'high'.
Relaxed IQR: Removed 0 outliers from column 'low'.
Relaxed IQR: Removed 0 outliers from column 'close'.
Relaxed IQR: Removed 7 outliers from column 'volume'.
Data cleaning completed successfully. Cleaned file saved as 'AAPL_s

# **Exploratory Data Analysis**



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv("AAPL_stock_financials_cleaned.csv")

# 1. Check Column Names
print("Column Names in the Dataset: ", data.columns)

# 2. Check if 'close_price_aapl' exists in the dataset
if 'close_price_aapl' not in data.columns:
    print("Error: 'close_price_aapl' column is missing.")
    print("Available columns are: ", data.columns)
    close_price_available = False
else:
    close_price_available = True

# 3. Descriptive Statistics
print("\nSummary Statistics:")
print(data.describe())

# Additional descriptive stats: Mode
print("\nMode of each column:")
print(data.mode().iloc[0])  # First row contains the mode for each column

# 4. Data Visualization (Only if 'close_price_aapl' exists)

if close_price_available:
    # Line Chart: Trend in stock prices over time
    if 'date' in data.columns:
        data['date'] = pd.to_datetime(data['date'])
        plt.figure(figsize=(10, 5))
        plt.plot(data['date'], data['close_price_aapl'], label='Close Price')
        plt.title("Trend in Close Prices Over Time")
        plt.xlabel("Date")
        plt.ylabel("Close Price (AAPL)")
        plt.legend()
        plt.grid()
        plt.show()

    # Correlation Heatmap (Only numerical columns)
    numeric_data = data.select_dtypes(include=[np.number])
    plt.figure(figsize=(12, 8))
    correlation_matrix = numeric_data.corr()  # Compute correlation only for numeric columns
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title("Correlation Heatmap")
    plt.show()

    # Box Plots: Detect outliers
    for column in ['open_price_aapl', 'high_price_aapl', 'low_price_aapl', 'close_price_aapl', 'trading_volume_aapl']:
        if column in data.columns:
            plt.figure(figsize=(8, 4))
            sns.boxplot(data[column])
            plt.title(f"Box Plot for {column}")
            plt.show()

    # Scatter Plot: Stock price vs. volume
    if 'close_price_aapl' in data.columns and 'trading_volume_aapl' in data.columns:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=data['trading_volume_aapl'], y=data['close_price_aapl'])
        plt.title("Close Price vs. Trading Volume")
        plt.xlabel("Trading Volume")
        plt.ylabel("Close Price")
        plt.show()

    # Histogram: Distribution of Close Price
    data['close_price_aapl'].plot(kind='hist', bins=30, title="Distribution of Close Price", figsize=(8, 4))
    plt.xlabel("Close Price")
    plt.show()

    # Correlation with Close Price
    correlation_with_close = correlation_matrix['close_price_aapl'].sort_values(ascending=False)
    print("\nFeatures most correlated with Close Price:\n", correlation_with_close)

# 5. Train-Test Split (Only if 'close_price_aapl' exists)
if close_price_available:
    # Define features (X) and target (y)
    target = 'close_price_aapl'
    features = data.drop(columns=[target, 'date'], errors='ignore')  # Drop target and date
    X = features
    y = data[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)  # No random splitting for time-series
    print("\nData split completed. Training samples:", len(X_train), "Testing samples:", len(X_test))

    # 6. Normalize Numerical Features
    scaler = MinMaxScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    print("\nScaled Feature Sample:\n", X_scaled.head())

print("\nEDA Completed Successfully.")


Column Names in the Dataset:  Index(['date', 'close', 'high', 'low', 'open', 'volume', 'ticker',
       'market_cap', 'pe_ratio', 'beta', 'eps_(earnings_per_share)',
       'forward_pe', 'revenue', 'gross_profit', 'operating_income',
       'net_income', 'debt_to_equity', 'return_on_equity_(roe)',
       'current_ratio', 'dividend_yield', 'free_cash_flow', 'profit_margin',
       'cash_ratio', 'quick_ratio', 'price_to_book_ratio', 'enterprise_value',
       'total_debt', 'total_assets', 'total_equity',
       'trailing_twelve_months_(ttm)_revenue',
       'trailing_twelve_months_(ttm)_ebitda',
       'trailing_twelve_months_(ttm)_earnings'],
      dtype='object')
Error: 'close_price_aapl' column is missing.
Available columns are:  Index(['date', 'close', 'high', 'low', 'open', 'volume', 'ticker',
       'market_cap', 'pe_ratio', 'beta', 'eps_(earnings_per_share)',
       'forward_pe', 'revenue', 'gross_profit', 'operating_income',
       'net_income', 'debt_to_equity', 'return_on_equit

# **Model Selection**

Linear Regression

XGBoost

Random Forest

Decision Trees

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv("AAPL_stock_financials_cleaned.csv")

# Check if 'close' column exists
if 'close' not in data.columns:
    print("Error: 'close' column is missing.")
    print("Available columns are: ", data.columns)
else:
    target = 'close'  # Use 'close' as the target variable

    # Feature selection: Drop 'close' and 'date', ensure numerical columns are selected
    features = data.drop(columns=[target, 'date'], errors='ignore')  # Drop 'close' and 'date'

    # Filter to keep only numeric columns
    features = features.select_dtypes(include=[np.number])

    # Handle missing values by imputing with the median
    imputer = SimpleImputer(strategy='median')
    features_imputed = imputer.fit_transform(features)

    # Define X and y
    X = features_imputed
    y = data[target].dropna()  # Ensure target doesn't have missing values

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)  # Time series split

    # Normalize features using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Model selection
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree Regressor": DecisionTreeRegressor(),
        "Random Forest Regressor": RandomForestRegressor(),
        "Gradient Boosting Regressor": GradientBoostingRegressor()
    }

    # Train and evaluate models using cross-validation
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        model.fit(X_train_scaled, y_train)
        score = model.score(X_test_scaled, y_test)
        print(f"{model_name} R^2 score on test set: {score:.4f}")

    # Hyperparameter tuning for Random Forest using GridSearchCV
    rf_model = RandomForestRegressor()
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    print(f"\nBest parameters for Random Forest: {grid_search.best_params_}")

    # Final evaluation with the best Random Forest model
    best_rf_model = grid_search.best_estimator_
    best_rf_score = best_rf_model.score(X_test_scaled, y_test)
    print(f"Best Random Forest Model R^2 score: {best_rf_score:.4f}")


 'free_cash_flow' 'cash_ratio' 'quick_ratio' 'enterprise_value'
 'total_debt' 'total_assets' 'total_equity'
 'trailing_twelve_months_(ttm)_earnings']. At least one non-missing value is needed for imputation with strategy='median'.



Training Linear Regression...
Linear Regression R^2 score on test set: 0.9761

Training Decision Tree Regressor...
Decision Tree Regressor R^2 score on test set: 0.8902

Training Random Forest Regressor...
Random Forest Regressor R^2 score on test set: 0.9288

Training Gradient Boosting Regressor...
Gradient Boosting Regressor R^2 score on test set: 0.9417

Best parameters for Random Forest: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Random Forest Model R^2 score: 0.9305


# **ML Flow Tracking**

In [None]:
pip install mlflow


Collecting mlflow
  Downloading mlflow-2.19.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.19.0 (from mlflow)
  Downloading mlflow_skinny-2.19.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.19.0->mlflow)
  Downloading databricks_sdk-0.40.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import mlflow
import mlflow.sklearn

# Start MLflow experiment
mlflow.start_run()

# Load the dataset
data = pd.read_csv("AAPL_stock_financials_cleaned.csv")

# Check if 'close' column exists
if 'close' not in data.columns:
    print("Error: 'close' column is missing.")
    print("Available columns are: ", data.columns)
else:
    target = 'close'  # Use 'close' as the target variable

    # Feature selection: Drop 'close' and 'date', ensure numerical columns are selected
    features = data.drop(columns=[target, 'date'], errors='ignore')  # Drop 'close' and 'date'

    # Filter to keep only numeric columns
    features = features.select_dtypes(include=[np.number])

    # Handle missing values by imputing with the median
    imputer = SimpleImputer(strategy='median')
    features_imputed = imputer.fit_transform(features)

    # Define X and y
    X = features_imputed
    y = data[target].dropna()  # Ensure target doesn't have missing values

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)  # Time series split

    # Normalize features using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Model selection
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree Regressor": DecisionTreeRegressor(),
        "Random Forest Regressor": RandomForestRegressor(),
        "Gradient Boosting Regressor": GradientBoostingRegressor()
    }

    # Train and evaluate models using cross-validation
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        model.fit(X_train_scaled, y_train)
        score = model.score(X_test_scaled, y_test)
        print(f"{model_name} R^2 score on test set: {score:.4f}")

        # Log metrics to MLflow
        mlflow.log_metric(f"{model_name}_R2", score)

        # Log model with input example for signature inference
        example_input = X_train_scaled[:1]  # First sample as an example input
        mlflow.sklearn.log_model(model, f"{model_name}_model", input_example=example_input)

    # Hyperparameter tuning for Random Forest using GridSearchCV
    rf_model = RandomForestRegressor()
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    print(f"\nBest parameters for Random Forest: {grid_search.best_params_}")

    # Final evaluation with the best Random Forest model
    best_rf_model = grid_search.best_estimator_
    best_rf_score = best_rf_model.score(X_test_scaled, y_test)
    print(f"Best Random Forest Model R^2 score: {best_rf_score:.4f}")

    # Log best Random Forest model
    example_input = X_train_scaled[:1]  # First sample as an example input
    mlflow.sklearn.log_model(best_rf_model, "best_rf_model", input_example=example_input)

# End MLflow experiment
mlflow.end_run()


 'free_cash_flow' 'cash_ratio' 'quick_ratio' 'enterprise_value'
 'total_debt' 'total_assets' 'total_equity'
 'trailing_twelve_months_(ttm)_earnings']. At least one non-missing value is needed for imputation with strategy='median'.



Training Linear Regression...
Linear Regression R^2 score on test set: 0.9761

Training Decision Tree Regressor...
Decision Tree Regressor R^2 score on test set: 0.8982

Training Random Forest Regressor...
Random Forest Regressor R^2 score on test set: 0.9306

Training Gradient Boosting Regressor...
Gradient Boosting Regressor R^2 score on test set: 0.9388

Best parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest Model R^2 score: 0.9330


In [None]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64

# **Streamlit Incoporating**
Executed in Visual code by svaing in .py file and running in virtual environment

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import mlflow
import mlflow.sklearn

# Streamlit UI
st.title("Stock Price Prediction System")
st.write("Predict closing stock prices using machine learning models.")
uploaded_file = st.file_uploader("Upload your dataset (CSV)", type=["csv"])

if uploaded_file is not None:
    # Load the dataset
    data = pd.read_csv(uploaded_file)
    st.write("Preview of the Dataset:", data.head())

    if 'close' not in data.columns:
        st.error("'close' column is missing. Please upload a dataset with the 'close' column.")
    else:
        # Start MLflow experiment
        mlflow.start_run()

        # Target variable
        target = 'close'

        # Feature selection
        features = data.drop(columns=[target, 'date'], errors='ignore')
        features = features.select_dtypes(include=[np.number])

        # Handle missing values
        imputer = SimpleImputer(strategy='median')
        features_imputed = imputer.fit_transform(features)

        # Define X and y
        X = features_imputed
        y = data[target].dropna()

        # Train-Test Split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

        # Normalize features
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Model selection
        models = {
            "Linear Regression": LinearRegression(),
            "Decision Tree Regressor": DecisionTreeRegressor(),
            "Random Forest Regressor": RandomForestRegressor(),
            "Gradient Boosting Regressor": GradientBoostingRegressor()
        }

        st.write("### Model Performance")
        results = []

        for model_name, model in models.items():
            model.fit(X_train_scaled, y_train)
            score = model.score(X_test_scaled, y_test)
            results.append({"Model": model_name, "R² Score": round(score, 4)})

            # Log metrics and models
            mlflow.log_metric(f"{model_name}_R2", score)
            example_input = X_train_scaled[:1]
            mlflow.sklearn.log_model(model, f"{model_name}_model", input_example=example_input)

        # Display results
        st.dataframe(results)

        # Hyperparameter tuning for Random Forest
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
        rf_model = RandomForestRegressor()
        grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train_scaled, y_train)

        best_rf_model = grid_search.best_estimator_
        best_rf_score = best_rf_model.score(X_test_scaled, y_test)
        st.write(f"Best Random Forest R² Score: {best_rf_score:.4f}")
        st.write("Best Parameters for Random Forest:", grid_search.best_params_)

        # Log the best model
        mlflow.sklearn.log_model(best_rf_model, "best_rf_model", input_example=X_train_scaled[:1])

        # End MLflow experiment
        mlflow.end_run()

        st.success("Experiment completed and logged to MLflow!")

else:
    st.info("Please upload a dataset to begin.")

# To run this script in Colab:
# 1. Install Streamlit and pyngrok (`!pip install streamlit pyngrok`).
# 2. Run Streamlit in the background (`!streamlit run your_script.py &`).
# 3. Use ngrok to tunnel your Streamlit app (`!ngrok http 8501`).
