<a href="https://colab.research.google.com/github/devarshee-13/ML-Tasks/blob/main/Copy_of_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit pandas scikit-learn matplotlib seaborn
!pip install streamlit pyngrok
!pip install streamlit pyngrok --quiet




In [None]:
import pandas as pd

# Hard-coded sample fundamentals: one list per CSV column, one position per ticker.
data = dict(
    Ticker=['AAPL', 'MSFT', 'GOOG', 'AMZN', 'META'],
    PE=[28, 32, 24, 61, 21],
    PB=[34, 14, 6, 9, 6],
    PEG=[1.9, 2.3, 1.6, 3.1, 1.2],
    EV_EBITDA=[20, 18, 15, 30, 12],
    ROE=[38, 40, 25, 12, 33],
    OperatingCashFlowMargin=[0.34, 0.45, 0.38, 0.29, 0.4],
    DebtToEquity=[1.5, 0.9, 0.4, 1.2, 0.6],
    Volatility=[0.25, 0.2, 0.3, 0.35, 0.28],
)

# Build the frame column-wise and persist it without the index column,
# so the file contains exactly the columns listed above.
sample_df = pd.DataFrame.from_dict(data)
sample_df.to_csv(path_or_buf="sample_stocks.csv", index=False)


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# --- Sidebar Controls ---
st.sidebar.title("Clustering Settings")
qrr_weight = st.sidebar.slider("QRR Importance", min_value=0.0, max_value=5.0, value=1.0, step=0.1)
k = st.sidebar.slider("Number of Clusters (K)", min_value=2, max_value=10, value=3)

# --- Load Data ---
st.title("📊 Stock Clustering with Dynamic QRR")
uploaded_file = st.file_uploader("Upload your stock data CSV", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    required_cols = ['Ticker', 'PE', 'PB', 'PEG', 'EV_EBITDA', 'ROE', 'OperatingCashFlowMargin', 'DebtToEquity', 'Volatility']
    if all(col in df.columns for col in required_cols):

        # --- Compute advanced QRR ---
        epsilon = 1e-6
        df['QRR'] = (df['ROE'] + df['OperatingCashFlowMargin']) / (df['DebtToEquity'] + df['Volatility'] + epsilon)
        df['QRR_weighted'] = df['QRR'] * qrr_weight

        # --- Clustering ---
        features = ['PE', 'PB', 'PEG', 'EV_EBITDA', 'QRR_weighted']
        X = df[features]
        X_scaled = StandardScaler().fit_transform(X)

        kmeans = KMeans(n_clusters=k, random_state=42)
        df['Cluster'] = kmeans.fit_predict(X_scaled)

        # --- PCA for 2D Visualization ---
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_scaled)
        df['PCA1'] = X_pca[:, 0]
        df['PCA2'] = X_pca[:, 1]

        # --- Plotting ---
        st.subheader("PCA Visualization of Clusters")
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='Cluster', palette='Set2', s=100, ax=ax)

        for i in range(len(df)):
            ax.text(df['PCA1'][i] + 0.03, df['PCA2'][i] + 0.03, df['Ticker'][i], fontsize=8)

        plt.title(f"K-Means Clustering with QRR Weight = {qrr_weight}")
        plt.xlabel("PCA Component 1")
        plt.ylabel("PCA Component 2")
        plt.grid(True)
        st.pyplot(fig)

        # --- Cluster Summary Table ---
        st.subheader("Cluster Summary")
        st.dataframe(df.groupby("Cluster")[features].mean().round(2))

        # --- Full Data ---
        st.subheader("Full Data with Clusters")
        st.dataframe(df[['Ticker', 'Cluster'] + features])

    else:
        st.error(f"CSV must contain the following columns:\n{', '.join(required_cols)}")
else:
    st.info("Please upload a CSV file to begin.")


Overwriting app.py


In [None]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is taken from the NGROK_AUTHTOKEN environment variable when set,
# otherwise it is requested through a hidden prompt.
# NOTE(security): a token that was ever committed in this cell is exposed and
# should be revoked in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
# Shut down any ngrok process left over from an earlier run before a new
# tunnel is opened further down in the notebook.
ngrok.kill()


In [None]:
# Start the Streamlit app as a background shell job; both stdout and stderr
# are redirected to /content/logs.txt.
!streamlit run app.py &>/content/logs.txt &

In [None]:
# Open an HTTP tunnel to the local port 8501; the target is passed as `addr`.
public_url = ngrok.connect(addr="8501", proto="http")
# `public_url` is the tunnel object; print only its public address so the
# message shows a plain URL instead of the object's repr.
print(f"🌍 Your Streamlit app is live at: {public_url.public_url}")


🌍 Your Streamlit app is live at: NgrokTunnel: "https://89bb-34-56-169-6.ngrok-free.app" -> "http://localhost:8501"


In [None]:

# 🚀 Stock Clustering with Portfolio Preferences (Streamlit + KMeans + PCA)
# 📌 Install dependencies
!pip install streamlit pyngrok scikit-learn pandas matplotlib seaborn --quiet

# 📁 Upload your CSV file
import io

import pandas as pd
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # A cancelled upload dialog yields an empty mapping; fail with a clear message.
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Read the first uploaded file from its in-memory bytes, so the result does not
# depend on the name under which the file was saved to disk.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

# ⚙️ Feature Engineering
from sklearn.preprocessing import StandardScaler
import numpy as np

# Small constant added to every denominator to avoid dividing by zero.
EPSILON = 1e-6


def _inverse(values):
    """Return 1 / (values + EPSILON), element-wise."""
    return 1 / (values + EPSILON)


# Scores where a LOWER raw ratio gives a HIGHER score use the inverse helper.
valuation_mean = df[['PE', 'PB', 'PEG', 'EV_EBITDA']].mean(axis=1)
df['ValuationScore'] = _inverse(valuation_mean)
df['ProfitabilityScore'] = df['ROE'] + df['OperatingCashFlowMargin']
df['LeverageScore'] = _inverse(df['DebtToEquity'])
df['VolatilityScore'] = _inverse(df['Volatility'])
df['GrowthScore'] = _inverse(df['PEG'])

# Normalize scores: standardize the five score columns together.
features = [
    'ValuationScore',
    'ProfitabilityScore',
    'LeverageScore',
    'VolatilityScore',
    'GrowthScore',
]
scaler = StandardScaler()
standardized = scaler.fit_transform(df[features])
df_scaled = pd.DataFrame(standardized, columns=features)

# Save processed file: standardized scores plus the ticker, without the index.
df_scaled['Ticker'] = df['Ticker']
df_scaled.to_csv("processed_stocks.csv", index=False)

# 💻 Create Streamlit App
# The app source is written to app.py so that `streamlit run app.py` can serve it.
with open("app.py", "w") as f:
    f.write('''
import streamlit as st
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv("processed_stocks.csv")

# Sidebar sliders
st.sidebar.header("Portfolio Preference Weights")
w_val = st.sidebar.slider("Valuation", 0.0, 1.0, 0.5)
w_profit = st.sidebar.slider("Profitability", 0.0, 1.0, 0.5)
w_lev = st.sidebar.slider("Leverage", 0.0, 1.0, 0.5)
w_vol = st.sidebar.slider("Volatility", 0.0, 1.0, 0.5)
w_growth = st.sidebar.slider("Growth Potential", 0.0, 1.0, 0.5)

# Weighted score. It is reported in the tables below; the clustering itself
# runs on the unweighted score columns.
df['CustomScore'] = (
    w_val * df['ValuationScore'] +
    w_profit * df['ProfitabilityScore'] +
    w_lev * df['LeverageScore'] +
    w_vol * df['VolatilityScore'] +
    w_growth * df['GrowthScore']
)

# Clustering
features = ['ValuationScore', 'ProfitabilityScore', 'LeverageScore', 'VolatilityScore', 'GrowthScore']
k = st.sidebar.slider("Number of Clusters", 2, 6, 3)

if k > len(df):
    # KMeans raises ValueError when n_clusters exceeds the number of samples,
    # so report the problem instead of letting the app crash.
    st.error(f"Cannot form {k} clusters from {len(df)} stocks; lower the number of clusters or provide more rows.")
else:
    # n_init is set explicitly so results do not depend on the library default.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    df['Cluster'] = model.fit_predict(df[features])

    # PCA: project the score columns onto two components for plotting.
    pca = PCA(n_components=2)
    df_pca = pca.fit_transform(df[features])
    df['PCA1'], df['PCA2'] = df_pca[:, 0], df_pca[:, 1]

    # Plot
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='Cluster', palette='tab10', ax=ax)
    # Label every point with its ticker.
    for row in df.itertuples(index=False):
        ax.text(row.PCA1, row.PCA2, row.Ticker, fontsize=8)
    ax.set_title(f"K-Means Clusters (K = {k})")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    st.pyplot(fig)

    # Show tables
    st.subheader("Cluster Summary")
    st.dataframe(df.groupby('Cluster')[features + ['CustomScore']].mean().round(2))

    st.subheader("Stock List with Cluster")
    st.dataframe(df[['Ticker', 'Cluster', 'CustomScore'] + features].sort_values('Cluster'))
''')

# 🌐 Ngrok setup to run Streamlit
from pyngrok import ngrok
!pkill streamlit
public_url = ngrok.connect(port="8501", proto="http")
!streamlit run app.py &> logs.txt &

public_url


Saving sample_stocks.csv to sample_stocks (1).csv




PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}
