# Segment 2 Lab 2

## A real world case study

We will look at prices of actual products scraped from Amazon.

We have details of the products, along with key features.

We'll first examine the data, then we'll run regression models to predict the price.

In [None]:
# imports

import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
from loaders import ItemLoader
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
from tqdm import tqdm
import pickle
import json

In [None]:
# Read the pickled training and test splits from the parent directory.
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.

with open('../training_data.pkl', 'rb') as train_file:
    train = pickle.load(train_file)

with open('../test_data.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Pool both splits so the exploratory charts below cover every loaded item
items = train + test

In [None]:
# Tally items per category in a single pass instead of re-scanning the whole
# list once per category. Counter keeps first-seen order, so the category order
# is the same on every run (list(set(...)) order can change between runs).
category_counts = Counter(item.category for item in items)
categories = list(category_counts)
counts = list(category_counts.values())

In [None]:

# Category bar chart: one bar per category, annotated with its item count
plt.figure(figsize=(15, 6))
plt.bar(categories, counts, color="goldenrod")

# Rotate the category tick labels by 30 degrees, right-aligned
plt.xticks(rotation=30, ha='right')

plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('How many in each category')

# Write the thousands-separated count just above each bar
for position, count in enumerate(counts):
    plt.text(position, count, f"{count:,}", ha='center', va='bottom')

# Render the figure
plt.show()

In [None]:
# Histogram of item prices in bins of width 10 (bin edges run from 0 to 990);
# the title reports the mean and the maximum price across all items

prices = [item.price for item in items]
plt.figure(figsize=(15, 6))
plt.hist(prices, rwidth=0.7, color="purple", bins=range(0, 1000, 10))
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.title(f"Prices: Avg {sum(prices)/len(prices):,.1f} and highest {max(prices):,}\n")
plt.show()

In [None]:
# Histogram of item weights in bins of width 20 (bin edges run from 0 to 980)

weights = [item.weight for item in items]
plt.figure(figsize=(15, 6))
plt.hist(weights, rwidth=0.7, color="skyblue", bins=range(0, 1000, 20))
plt.xlabel('Weight (ounces)')
plt.ylabel('Count')
plt.title(f"Weight (ounces)")
plt.show()

In [None]:
# Scatter of price against weight, to look for a relationship between the two

weights = [item.weight for item in items]
prices = [item.price for item in items]

# Draw one tiny red dot per item
plt.figure(figsize=(15, 8))
plt.scatter(weights, prices, s=0.2, color="red")

# Restrict the view to weights of 0-2000 and prices of 0-1000
plt.ylim(0, 1000)
plt.xlim(0, 2000)

# Title and axis labels
plt.title('Investigate correlations')
plt.xlabel('Weight')
plt.ylabel('Price')

# Render the figure
plt.show()

In [None]:
# How does the price vary with the rank

ranks = [item.rank for item in items]
prices = [item.price for item in items]

# Draw one tiny green dot per item
plt.figure(figsize=(15, 8))
plt.scatter(ranks, prices, s=0.2, color="green")

# Restrict the view to ranks of 0-10000 and prices of 0-1000
plt.ylim(0, 1000)
plt.xlim(0, 10000)

# Title and axis labels
plt.title('Investigate correlations')
plt.xlabel('Rank')
plt.ylabel('Price')

# Render the figure
plt.show()

In [None]:
# How does the price vary with the timestamp

when = [item.timestamp for item in items]
prices = [item.price for item in items]

# Draw one tiny orange dot per item
plt.figure(figsize=(15, 8))
plt.scatter(when, prices, s=0.2, color="orange")

# Restrict the view to timestamps of 0-2e9 and prices of 0-1000
plt.xlim(0, 2e9)
plt.ylim(0, 1000)

# Title and axis labels
plt.title('Investigate correlations')
plt.xlabel('When')
plt.ylabel('Price')

# Render the figure
plt.show()

In [None]:
# Imports for machine learning

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from testing import Tester

In [None]:
def guess(item):
    """Baseline pricer: ignore the item and return a random integer from 1 to 999."""
    # randint is inclusive at both ends, so (1, 999) covers the same values as randrange(1, 1000)
    return random.randint(1, 999)

In [None]:
# Seed Python's random module so the random-guess baseline gives repeatable results
random.seed(42)

In [None]:
# Evaluate the random-guess baseline on the test items (Tester comes from the local testing module; what it reports is not visible here)
Tester.test(guess, test)

In [None]:
# Baseline: always predict the mean price seen in the training data
train_prices = [train_item.price for train_item in train]
train_average = sum(train_prices) / len(train_prices)

def guess2(item):
    """Return the training-set mean price, ignoring the item entirely."""
    return train_average

In [None]:
# Evaluate the constant training-average baseline on the test items
Tester.test(guess2, test)

In [None]:
# Now let's do linear regression with our features

def get_features(item):
    """Collect the model inputs (and the target) for one item into a flat dict.

    Keys, in order: "weight", "rank", "timestamp" (copied from the item as-is),
    "is_top_tech" and "is_top_toys" (truthiness converted to 1 or 0), and
    "price". Note that "price" is the prediction target, so callers must drop
    it before passing the dict to a model.
    """
    features = {field: getattr(item, field) for field in ("weight", "rank", "timestamp")}
    for flag in ("is_top_tech", "is_top_toys"):
        features[flag] = int(bool(getattr(item, flag)))
    features["price"] = item.price
    return features

def list_to_dataframe(items):
    """Build a DataFrame with one row per item from get_features().

    The 'price' column is then (re)assigned directly from each item's price.
    `items` is iterated twice, so it must be a re-iterable sequence.
    """
    rows = [get_features(item) for item in items]
    frame = pd.DataFrame(rows)
    frame['price'] = [item.price for item in items]
    return frame

# Feature frames for the regression: all training items, but only the first 250 test items
# NOTE(review): the 250 cut-off is not explained here - presumably it matches how many items Tester evaluates; confirm
train_df = list_to_dataframe(train)
test_df = list_to_dataframe(test[:250])

In [None]:
# Traditional Linear Regression!

np.random.seed(42)

# Columns used as model inputs; 'price' is the target
feature_columns = ['weight', 'rank', 'timestamp', 'is_top_tech', 'is_top_toys']

X_train, y_train = train_df[feature_columns], train_df['price']
X_test, y_test = test_df[feature_columns], test_df['price']

# Fit the linear model on the training frame (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Report the learned coefficient for each input column, then the intercept
for column, coefficient in zip(feature_columns, model.coef_):
    print(f"{column}: {coefficient}")
print(f"Intercept: {model.intercept_}")

# Score the fitted model on the test frame
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

In [None]:
# Function to predict price for a new item

def linear_regression_pricer(item):
    """Predict the price of one item with the fitted linear regression `model`.

    Builds a one-row DataFrame from get_features(item) and selects exactly the
    `feature_columns` the model was trained on. This drops the target ("price")
    and any other extra keys, and guarantees the column order matches training
    rather than relying on dict insertion order and a manual `del`.

    Returns the first (only) element of model.predict's output.
    Raises KeyError if get_features() does not provide one of `feature_columns`.
    """
    features_df = pd.DataFrame([get_features(item)])[feature_columns]
    return model.predict(features_df)[0]

In [None]:
# Evaluate the feature-based linear regression pricer on the test items

Tester.test(linear_regression_pricer, test)

In [None]:
# For the next few models, we prepare our documents and prices
# Note that we use the test prompt for the documents, otherwise we'll reveal the answer!!

def description(item):
    prompt = item.test_prompt()
    return prompt.replace('How much does this cost to the nearest dollar?\n\n', '').replace('\n\nPrice is $', '')

# Training texts and targets for the text-based models.
# NOTE(review): `prices` held a plain list for all items in earlier cells; from here on it is a NumPy array of training prices only.
documents = list(map(description, train))
prices = np.array([float(train_item.price) for train_item in train])

In [None]:
# Show the first training document as a quick check of what description() produces
documents[0]

In [None]:
# Use the CountVectorizer for a Bag of Words model

np.random.seed(42)
# Cap the vocabulary at 1,000 terms and drop scikit-learn's built-in English stop words
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
# Learn the vocabulary from the training documents and encode them as a matrix of word counts
X = vectorizer.fit_transform(documents)

In [None]:
# Fit a linear model from the bag-of-words counts (X) to the training prices
regressor = LinearRegression()
regressor.fit(X, prices)

In [None]:
def bag_of_words(item):
    """Predict an item's price from its description with the bag-of-words linear model.

    The description is encoded with the already-fitted `vectorizer`; predictions
    below zero are returned as 0.
    """
    doc_vector = vectorizer.transform([description(item)])
    predicted = regressor.predict(doc_vector)[0]
    return max(predicted, 0)

In [None]:
# Evaluate the bag-of-words linear regression pricer on the test items
Tester.test(bag_of_words, test)

In [None]:
# And the powerful Random Forest regression

# Train on only the first 20,000 rows of the bag-of-words matrix (presumably to keep training time manageable - confirm)
subset=20_000
# 100 trees, fixed random_state for repeatable results, n_jobs=8 requests 8 parallel workers
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=8)
rf_model.fit(X[:subset], prices[:subset])

In [None]:
def random_forest(item):
    """Predict an item's price from its description with the random forest model.

    The description is encoded with the already-fitted `vectorizer`; predictions
    below zero are returned as 0.
    """
    doc_vector = vectorizer.transform([description(item)])
    predicted = rf_model.predict(doc_vector)[0]
    return max(0, predicted)

In [None]:
# Evaluate the random forest pricer on the test items
Tester.test(random_forest, test)