In [None]:
import pandas as pd

# Load each raw table from the CSV whose file name matches the variable name.
# NOTE(review): the merge below is an inner join on 'order_id', so which of
# orders / order_items is the left side only matters for column order and for
# _x/_y suffixes if the two files share other column names -- confirm.
DATA_DIR = '/content'  # single place to change if the data files move

customers = pd.read_csv(f'{DATA_DIR}/customers.csv')
orders = pd.read_csv(f'{DATA_DIR}/orders.csv')
order_items = pd.read_csv(f'{DATA_DIR}/order_items.csv')
products = pd.read_csv(f'{DATA_DIR}/products.csv')

In [None]:
# Normalise the customer headers first (lowercase, spaces -> underscores) so
# that the join key below is spelled 'customer_id'.
customers.columns = customers.columns.str.lower().str.replace(' ', '_')

# Build the flat purchase table with three inner joins:
#   orders + order_items  (on order_id)
#   + product names       (on product_id)
#   + customer names      (on customer_id)
# then add a combined "first last" display name.
df = (
    orders
    .merge(order_items, how='inner', on='order_id')
    .merge(products[['product_id', 'product_name']], how='inner', on='product_id')
    .merge(customers[['customer_id', 'first_name', 'last_name']], how='inner', on='customer_id')
    .assign(customer_name=lambda frame: frame['first_name'] + ' ' + frame['last_name'])
)


In [None]:
# Customer x product table of summed quantities; customer/product pairs with
# no purchase rows are filled with 0.
user_product_matrix = pd.pivot_table(
    df,
    values='quantity',
    index='customer_id',
    columns='product_name',
    aggfunc='sum',
    fill_value=0,
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Item-item cosine similarity: transposing puts one product per row, so each
# product is compared through its vector of per-customer quantities.
product_similarity = cosine_similarity(user_product_matrix.transpose())

# Label both axes of the square similarity array with the product names.
product_labels = user_product_matrix.columns
similarity_df = pd.DataFrame(
    data=product_similarity,
    index=product_labels,
    columns=product_labels,
)


In [None]:
def recommend_products(product_name, n=5):
    """Return the ``n`` products most similar to ``product_name``.

    Reads the ``product_name`` column of the module-level ``similarity_df``
    and ranks every other product by its score, highest first.

    Parameters
    ----------
    product_name : label
        A column label of ``similarity_df``.
    n : int, default 5
        Maximum number of products to return; values below 0 are treated as 0.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by product, sorted in descending order and
        never containing ``product_name`` itself.

    Raises
    ------
    KeyError
        If ``product_name`` is not a column of ``similarity_df``.
    """
    scores = similarity_df[product_name]
    # Remove the queried product by label rather than by position: it is not
    # guaranteed to sort first (another product can tie with it, and its own
    # score can be 0), so slicing off row 0 could drop a real neighbour and
    # return the product itself.
    others = scores.drop(labels=product_name, errors="ignore")
    # A stable sort keeps tied products in their original order between runs.
    ranked = others.sort_values(ascending=False, kind="mergesort")
    return ranked.iloc[:max(n, 0)]

In [None]:
# Example query: the products most similar to one catalogue item, using the
# function's default of n=5 results.
recommend_products("Apple AirPods 3rd Gen")

Unnamed: 0_level_0,Apple AirPods 3rd Gen
product_name,Unnamed: 1_level_1
Apple iMac 27-Inch Retina,0.36851
Apple iPad Air (5th Gen),0.353426
Stuffed Animal Set,0.352159
Dog Bed with Canopy,0.350716
Children’s Puzzle,0.349693


In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Basket matrix: one row per order, one column per product, True where the
# order contains at least one unit of the product.
# The vectorised `.gt(0)` replaces an element-wise `applymap` lambda, which is
# deprecated in recent pandas, and yields the boolean frame that apriori
# accepts directly.
basket = (
    df.groupby(['order_id', 'product_name'])['quantity']
    .sum()
    .unstack(fill_value=0)
    .gt(0)
)

# Itemsets that appear in at least 0.2% of orders.
frequent_itemsets = apriori(basket, min_support=0.002, use_colnames=True)

# Derive rules from the frequent itemsets, keeping those with lift >= 0.5.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=0.5)

  basket = basket.applymap(lambda x: 1 if x > 0 else 0)


In [None]:
# Preview the first 10 frequent itemsets together with their support.
print(frequent_itemsets.head(10))

    support                            itemsets
0  0.002034               (ASUS TUF Gaming A15)
1  0.003514                (Action Figures Set)
2  0.003098                   (Activity Center)
3  0.004993             (Apple AirPods 3rd Gen)
4  0.005826                 (Apple AirPods Max)
5  0.005594                 (Apple AirPods Pro)
6  0.006010         (Apple AirPods Pro 2nd Gen)
7  0.005687  (Apple AirPods Pro 2nd Generation)
8  0.002635            (Apple MacBook Air 2020)
9  0.002820              (Apple MacBook Air M1)


In [None]:
# Preview the first 10 association rules.
# NOTE(review): the recorded output of this cell is an empty DataFrame, i.e. no
# rule was generated -- presumably no multi-item itemset reaches min_support;
# confirm before relying on `rules`.
print(rules.head(10))

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

# Hold-out evaluation of a truncated-SVD reconstruction of user_product_matrix
# (customers as rows, products as columns).

# Float copy of the table so that individual cells can be blanked with NaN.
matrix = user_product_matrix.to_numpy().astype(float)

# Flag roughly 20% of the cells as the test set; the seed fixes the draw.
np.random.seed(42)
test_mask = np.random.rand(*matrix.shape) < 0.2

# Complementary views: train keeps the unflagged cells, test keeps the flagged
# ones; everything else is NaN.
train_matrix = np.where(test_mask, np.nan, matrix)
test_matrix = np.where(test_mask, matrix, np.nan)

# Replace the hidden (NaN) cells with 0 before fitting the SVD.
train_filled = np.nan_to_num(train_matrix, nan=0.0)

# Rank-20 factorisation of the training view.
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(train_filled)
item_factors = svd.components_.T

# Reconstruct a score for every customer/product cell.
predicted_ratings = user_factors @ item_factors.T
pred_df = pd.DataFrame(
    predicted_ratings,
    index=user_product_matrix.index,
    columns=user_product_matrix.columns,
)

# Score only the held-out cells, i.e. where the test view is not NaN.
actual_values = test_matrix.ravel()
predicted_values = pred_df.to_numpy().ravel()
mask = np.logical_not(np.isnan(actual_values))

held_out_actual = actual_values[mask]
held_out_predicted = predicted_values[mask]

rmse = np.sqrt(mean_squared_error(held_out_actual, held_out_predicted))
mae = mean_absolute_error(held_out_actual, held_out_predicted)

print(f"RMSE on Test Set: {rmse:.4f}")
print(f"MAE on Test Set: {mae:.4f}")


RMSE on Test Set: 0.5147
MAE on Test Set: 0.1403


In [None]:
import pandas as pd

# Aggregate the purchase history to one row per (customer, product) pair.
#   rating         - summed quantity; used as the "rating" for the recommender
#   price_per_unit - sum of the price_per_unit column over the pair's rows.
#                    Kept under its original name for compatibility; it is NOT
#                    the amount spent, because quantity is ignored.
#   total_spent    - sum of quantity * price_per_unit over the pair's rows
#                    (assumes price_per_unit is the price of a single unit and
#                    no discounts apply -- TODO confirm against the data)
ratings_df = (
    df.assign(total_spent=df['quantity'] * df['price_per_unit'])
    .groupby(['customer_id', 'product_id'], as_index=False)
    .agg(
        rating=('quantity', 'sum'),
        price_per_unit=('price_per_unit', 'sum'),
        total_spent=('total_spent', 'sum'),
    )
)

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Train a Spark ALS recommender on the (customer, product, rating) triples and
# report RMSE and MAE on a held-out 20% split.
spark = SparkSession.builder.getOrCreate()

# Convert pandas to Spark DataFrame (only the three columns ALS needs)
spark_df = spark.createDataFrame(ratings_df[['customer_id', 'product_id', 'rating']])

# Train/Test split; the seed makes the split repeatable
(train, test) = spark_df.randomSplit([0.8, 0.2], seed=42)

# ALS model
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="customer_id",
    itemCol="product_id",
    ratingCol="rating",
    # Drop test rows that get no prediction (customer or product unseen in
    # training) so they do not turn the metrics into NaN.
    coldStartStrategy="drop"
)

model = als.fit(train)

# Score the held-out rows
predictions = model.transform(test)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)

# Compute the MAE from the ALS predictions as well. The value is stored under
# its own name so that the `mae` produced by the SVD evaluation is neither
# printed here by mistake nor overwritten.
mae_evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="rating",
    predictionCol="prediction"
)
als_mae = mae_evaluator.evaluate(predictions)

print(f"Root-mean-square error = {rmse}")
print(f"MAE on Test Set: {als_mae}")

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


Root-mean-square error = 1.4390401290538202
MAE on Test Set: 0.14028990124802465


# New Section