# In [1]:  (notebook cell marker left over from the Jupyter export)
# app.py

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# --------------------------------
# 1. Load & prepare data + model
# --------------------------------
@st.cache_data
def load_data(path="social_media_usage.csv"):
    """Read the survey CSV and derive the cleaned modelling table.

    Args:
        path: Local filesystem location of the survey CSV. A relative path
            is looked up in the current working directory first and then
            in the directory containing this script, so loading does not
            depend on where the process was launched from.

    Returns:
        A tuple ``(ss, X, y)`` where ``ss`` is the cleaned DataFrame with
        rows containing missing values removed, ``X`` holds the six
        predictor columns and ``y`` is the ``sm_li`` target column.

    Raises:
        FileNotFoundError: If the CSV is not found in any location tried.
    """
    from pathlib import Path

    requested = Path(path)
    candidates = [requested]
    # ``__file__`` is absent in some interactive sessions, hence the lookup
    # through ``globals()`` instead of a direct reference.
    script_file = globals().get("__file__")
    if script_file and not requested.is_absolute():
        candidates.append(Path(script_file).resolve().parent / requested)

    csv_path = next((c for c in candidates if c.is_file()), None)
    if csv_path is None:
        tried = ", ".join(str(c.resolve()) for c in candidates)
        raise FileNotFoundError(
            f"Survey data file {str(path)!r} not found (looked in: {tried})."
        )

    s = pd.read_csv(csv_path)

    def clean_sm(x):
        # Binary indicator: 1 where the value equals 1, otherwise 0.
        # Anything that is not exactly 1 (other codes, NaN) becomes 0.
        return np.where(x == 1, 1, 0)

    def in_range_or_nan(col, low, high):
        # Keep values inside [low, high]; everything else is marked missing
        # so that dropna() below removes the row.
        return np.where((col >= low) & (col <= high), col, np.nan)

    ss = pd.DataFrame({
        "sm_li": clean_sm(s["web1h"]),   # LinkedIn user (1=yes, 0=no)
        "income": in_range_or_nan(s["income"], 1, 9),
        "education": in_range_or_nan(s["educ2"], 1, 8),
        "parent": clean_sm(s["par"]),
        "married": clean_sm(s["marital"]),
        # NOTE(review): this sets female=1 when gender == 1. Confirm against
        # the survey codebook that code 1 really denotes female respondents.
        "female": clean_sm(s["gender"]),
        "age": in_range_or_nan(s["age"], 1, 97),
    })

    # Rows with an out-of-range income, education or age are discarded.
    ss = ss.dropna()

    X = ss[["income", "education", "parent", "married", "female", "age"]]
    y = ss["sm_li"]

    return ss, X, y


@st.cache_resource
def train_model(X, y):
    """Fit a class-balanced logistic regression and score it on a hold-out set.

    Args:
        X: Predictor table.
        y: Target labels aligned with ``X``; also used to stratify the split.

    Returns:
        A tuple ``(model, accuracy)``: the fitted ``LogisticRegression`` and
        its accuracy on the 20% hold-out portion.
    """
    # Fixed seed plus stratification keeps the split reproducible and the
    # class proportions the same in both parts.
    split = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    features_fit, features_holdout, target_fit, target_holdout = split

    classifier = LogisticRegression(class_weight="balanced", max_iter=1000)
    classifier.fit(features_fit, target_fit)

    holdout_accuracy = classifier.score(features_holdout, target_holdout)
    return classifier, holdout_accuracy


# Build the cleaned survey table, then fit the classifier on it
ss, X, y = load_data()
model, test_accuracy = train_model(X, y)

# --------------------------------
# 2. Streamlit UI
# --------------------------------
st.title("LinkedIn Usage Prediction App")
st.write(
    "This app predicts whether a person is likely to use LinkedIn, "
    "based on basic demographic and household characteristics."
)

st.markdown(f"**Model test accuracy:** {test_accuracy:.3f}")

st.header("Input Profile")

# Ordinal inputs: the slider bounds mirror the ranges named in the labels.
income = st.slider(
    "Household income category (1 = lowest, 9 = highest)",
    min_value=1,
    max_value=9,
    value=8,
)
education = st.slider(
    "Education level (1 = less than HS, 8 = postgraduate)",
    min_value=1,
    max_value=8,
    value=7,
)

# Binary inputs: each radio answer is turned into a 0/1 flag.
parent_str = st.radio("Parent of child under 18 at home?", ["No", "Yes"])
parent = int(parent_str == "Yes")

married_str = st.radio("Marital status", ["Not married / other", "Married"])
married = int(married_str == "Married")

female_str = st.radio("Gender", ["Male", "Female"])
female = int(female_str == "Female")

age = st.slider("Age", min_value=18, max_value=90, value=42)

if st.button("Predict LinkedIn Usage"):
    # Single-row frame whose columns follow the training feature order.
    person = pd.DataFrame([{
        "income": income,
        "education": education,
        "parent": parent,
        "married": married,
        "female": female,
        "age": age,
    }])

    # Column 1 of predict_proba is the probability of class 1.
    prob_li = model.predict_proba(person)[0, 1]
    pred_class = model.predict(person)[0]
    answer = "Yes" if pred_class == 1 else "No"

    st.subheader("Prediction Result")
    st.write(f"**Predicted LinkedIn user?** {answer}")
    st.write(f"**Estimated probability of LinkedIn use:** {prob_li:.3f}")

    st.caption(
        "Note: This prediction is based on survey data and is intended for exploratory, "
        "marketing-oriented analysis rather than individual-level decision-making."
    )


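# --- Captured notebook output (not part of app.py) ---
# 2025-12-07 18:02:00.192
#   Streamlit warning (text truncated in the export): the script was executed
#   from a Jupyter kernel rather than through Streamlit; the suggested launch
#   command:
#
#     streamlit run C:\Users\conal\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
#
# FileNotFoundError: [Errno 2] No such file or directory: 'social_media_usage.csv'
#   (at the time of this run the relative CSV path was not found in the
#   kernel's working directory)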