In [1]:
# Run this cell once to install all necessary libraries for this notebook
!pip install pandas numpy scikit-learn joblib matplotlib seaborn streamlit jupyterlab


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\91797\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [None]:
# --- Definitive Setup Cell ---
# Installs the notebook's dependencies into the exact Python environment the
# running kernel uses (sys.executable). Packages are installed one at a time so
# that a single failing package does not prevent the others from installing.
import sys
import subprocess

# List of packages to install
packages = [
    "pandas",
    "numpy",
    "scikit-learn",
    "joblib",
    "matplotlib",
    "seaborn",
    "streamlit",
    "jupyterlab"
]

# Get the exact path to the python interpreter the notebook is using
python_executable = sys.executable
print(f"Installing packages into: {python_executable}")

# Install each package, remembering the ones that fail so the closing message
# reflects what actually happened instead of always claiming success.
failed_packages = []
for package in packages:
    try:
        subprocess.check_call([python_executable, "-m", "pip", "install", package])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package}: {e}")
        failed_packages.append(package)

if failed_packages:
    print(f"\nInstallation finished with errors. Could not install: {', '.join(failed_packages)}")
else:
    print("\nInstallation complete. PLEASE RESTART THE KERNEL NOW.")

Installing packages into: c:\Users\91797\AppData\Local\Programs\Python\Python312\python.exe


In [None]:
# ==============================================================================
# ShopperSight - Data Analysis and Modeling Notebook
# ==============================================================================

# --- Imports ---
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---
# Location of the raw events CSV. The literal below is the original
# machine-specific default; set the SHOPPERSIGHT_EVENTS_CSV environment variable
# to point the notebook at the file on another machine without editing the code.
file_path = os.environ.get(
    'SHOPPERSIGHT_EVENTS_CSV',
    r'C:\Users\91797\Downloads\ShopperSight\data\events.csv',
)

# Define where to save the models (relative to the current working directory)
models_dir = 'saved_models'
# Create the directory if it doesn't exist
os.makedirs(models_dir, exist_ok=True)


# ==============================================================================
# Step 1 & 2: Load, Clean, and Prepare Data
# ==============================================================================
# Reads the events CSV at `file_path` into `df`, then derives two columns:
#   - 'datetime':  the 'timestamp' column parsed as epoch milliseconds
#   - 'purchased': 1 where 'event' is 'transaction', otherwise 0
print("--- [Step 1 & 2] Loading and Cleaning Data ---")
try:
    df = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'. Please check the path and folder structure.")
    # Re-raise to abort the cell. exit() is deliberately not used: inside a
    # notebook kernel it requests a kernel shutdown instead of cleanly stopping
    # the cell, which discards all notebook state.
    raise

# Fail early with a clear message if the columns this step relies on are absent.
missing_columns = {'timestamp', 'event'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Events file is missing required column(s): {sorted(missing_columns)}")

# 'timestamp' is interpreted as milliseconds since the Unix epoch.
df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
# Binary label: 1 for 'transaction' events, 0 for every other event type.
df['purchased'] = np.where(df['event'] == 'transaction', 1, 0)
print("Data cleaning and feature engineering complete.\n")


# ==============================================================================
# Step 3: Purchase Prediction Model (Random Forest Classifier)
# ==============================================================================
# Builds a class-balanced dataset by downsampling, trains a random forest on an
# 80/20 stratified split, and prints accuracy plus a classification report.
print("--- [Step 3] Building Purchase Prediction Model ---")
# NOTE(review): both features are raw identifier columns; confirm that treating
# IDs as numeric predictors is intended.
features = ['visitorid', 'itemid']
target = 'purchased'

purchase_events = df[df['purchased'] == 1]
non_purchase_events = df[df['purchased'] == 0]

# Balance the classes at the size of the smaller one. When purchases are the
# minority (the usual case) every purchase row is kept as-is and only the
# non-purchase rows are sampled; the cap prevents sample() from raising when
# non-purchase rows happen to be the smaller group.
class_size = min(len(purchase_events), len(non_purchase_events))
if len(purchase_events) > class_size:
    purchase_sample = purchase_events.sample(n=class_size, random_state=42)
else:
    purchase_sample = purchase_events
non_purchase_sample = non_purchase_events.sample(n=class_size, random_state=42)
balanced_df = pd.concat([purchase_sample, non_purchase_sample])

X_balanced = balanced_df[features]
y_balanced = balanced_df[target]

# Stratify so both splits keep the 50/50 class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

print("Training the Random Forest model... (This may take a few minutes)")
# n_jobs=-1 trains the trees on all available CPU cores.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete!\n")

print("--- Evaluating model performance ---")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%\n")
print("--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['Not Purchased (0)', 'Purchased (1)']))


# ==============================================================================
# Step 4: Customer Segmentation (K-Means Clustering)
# ==============================================================================
# Aggregates per-visitor event counts, keeps visitors with at least one
# transaction, standardizes the three counts and assigns each visitor a cluster.
print("\n--- [Step 4] Segmenting Customers with K-Means ---")
# Per-visitor event counts. Integer indicator columns are summed with the
# built-in groupby sum rather than one Python lambda per visitor group; the
# resulting counts are the same but no Python-level call is made per group.
customer_df = (
    df[['visitorid']]
    .assign(
        total_views=(df['event'] == 'view').astype('int64'),
        total_addtocart=(df['event'] == 'addtocart').astype('int64'),
        total_transactions=(df['event'] == 'transaction').astype('int64'),
    )
    .groupby('visitorid')
    .sum()
    .reset_index()
)

# Keep only visitors who purchased at least once; .copy() gives an independent
# frame so adding the 'cluster' column below does not trigger SettingWithCopyWarning.
customer_df_purchasers = customer_df[customer_df['total_transactions'] > 0].copy()

features_to_scale = ['total_views', 'total_addtocart', 'total_transactions']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_df_purchasers[features_to_scale])

optimal_k = 4
# Set n_init=10 for compatibility with older scikit-learn versions
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
# Labels come back in row order, so they line up with customer_df_purchasers.
customer_df_purchasers['cluster'] = kmeans.fit_predict(scaled_features)
print(f"Customers successfully segmented into {optimal_k} clusters.\n")


# ==============================================================================
# Step 5: Save All Final Assets
# ==============================================================================
# Writes the three fitted estimators into models_dir and the purchaser segment
# table to a CSV file.
print("--- [Step 5] Saving final models and data assets ---")

# Output file name -> fitted object to serialize with joblib.
artifacts = {
    'purchase_model.pkl': model,
    'kmeans_model.pkl': kmeans,
    'scaler.pkl': scaler,
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(models_dir, artifact_name))

# The segment table goes to the current working directory, not models_dir.
customer_df_purchasers.to_csv('customer_segments.csv', index=False)

print("All assets have been saved successfully!")
print("--- Project Notebook Execution Complete ---")

SyntaxError: incomplete input (3662303167.py, line 89)