In [2]:
#load and preprocess the data
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [3]:
# Replace file paths with the paths to the downloaded files
orders_df = pd.read_csv("C:/Users/lydi_/OneDrive/Documents/DTU master , lectures and exercises/Computational Tools for Data Science/olist_orders_dataset.csv")
order_items_df = pd.read_csv("C:/Users/lydi_/OneDrive/Documents/DTU master , lectures and exercises/Computational Tools for Data Science/olist_order_items_dataset.csv")
products_df = pd.read_csv("C:/Users/lydi_/OneDrive/Documents/DTU master , lectures and exercises/Computational Tools for Data Science/olist_products_dataset.csv")
reviews_df = pd.read_csv("C:/Users/lydi_/OneDrive/Documents/DTU master , lectures and exercises/Computational Tools for Data Science/olist_order_reviews_dataset.csv")


In [4]:
# Checking for missing values in each DataFrame
print(orders_df.isnull().sum())
print(order_items_df.isnull().sum())
print(products_df.isnull().sum())
print(reviews_df.isnull().sum())


order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64
product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64
review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_crea

In [5]:
# Imputing missing values in 'product_category_name' with the most frequent category (mode)
products_df['product_category_name'].fillna(products_df['product_category_name'].mode()[0], inplace=True)

# Imputing missing numerical data in product details with their respective mean values
for col in ['product_name_lenght', 'product_description_lenght', 'product_photos_qty']:
    products_df[col].fillna(products_df[col].mean(), inplace=True)

# Imputing missing values in product dimensions and weight with mean

for col in ['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']:
    products_df[col].fillna(products_df[col].mean(), inplace=True)

# Replacing missing values in review comments with a placeholder 'No Comment'
reviews_df['review_comment_title'].fillna('No Comment', inplace=True)
reviews_df['review_comment_message'].fillna('No Comment', inplace=True)


In [6]:
# Rechecking for missing values in Product and Review DataSets after imputation
print("Missing values in Product Dataset:")
print(products_df.isnull().sum())

print("\nMissing values in Review Dataset:")
print(reviews_df.isnull().sum())



Missing values in Product Dataset:
product_id                    0
product_category_name         0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
product_weight_g              0
product_length_cm             0
product_height_cm             0
product_width_cm              0
dtype: int64

Missing values in Review Dataset:
review_id                  0
order_id                   0
review_score               0
review_comment_title       0
review_comment_message     0
review_creation_date       0
review_answer_timestamp    0
dtype: int64


In [7]:
# Merging the orders, items, and products datasets into a single DataFrame

full_order_df = pd.merge(orders_df, order_items_df, on='order_id', how='left')
full_order_product_df = pd.merge(full_order_df, products_df, on='product_id', how='left')

In [8]:
# Creating new feature 'total_value' by adding price and freight value
#based on the merged data, we want to create new features that can help in our analysis or model building
full_order_product_df['total_value'] = full_order_product_df['price'] + full_order_product_df['freight_value']

# Converting 'order_purchase_timestamp' from string to datetime and extracting weekday and hour
full_order_product_df['order_purchase_timestamp'] = pd.to_datetime(full_order_product_df['order_purchase_timestamp'])
full_order_product_df['purchase_weekday'] = full_order_product_df['order_purchase_timestamp'].dt.day_name()
full_order_product_df['purchase_hour'] = full_order_product_df['order_purchase_timestamp'].dt.hour


In [9]:
#after we create the new features we do the final check for missing values and data types
# Checking for missing values and verifying data types in the merged DataFrame
print(full_order_product_df.isnull().sum())

# Check data types
print(full_order_product_df.dtypes)

# Saving the processed DataFrame to a CSV file for future usee
full_order_product_df.to_csv('processed_data.csv', index=False)
    

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 161
order_delivered_carrier_date     1968
order_delivered_customer_date    3229
order_estimated_delivery_date       0
order_item_id                     775
product_id                        775
seller_id                         775
shipping_limit_date               775
price                             775
freight_value                     775
product_category_name             775
product_name_lenght               775
product_description_lenght        775
product_photos_qty                775
product_weight_g                  775
product_length_cm                 775
product_height_cm                 775
product_width_cm                  775
total_value                       775
purchase_weekday                    0
purchase_hour                       0
dtype: int64
order_id                             

In [10]:
# Handling missing values
# Dropping rows where order items are missing
full_order_product_df.dropna(subset=['order_item_id'], inplace=True)

# Filling missing dates with placeholder or imputation
full_order_product_df['order_approved_at'].fillna(method='ffill', inplace=True)  # Example: forward fill

# Converting date columns to datetime
full_order_product_df['order_approved_at'] = pd.to_datetime(full_order_product_df['order_approved_at'])

# Save the processed dataframe
full_order_product_df.to_csv('processed_data.csv', index=False)


#### A simple recommender system based on previously ordered product categories

This code defines a simple recommender system that suggests new products to a customer based on the categories of products they have previously ordered. It uses the merged dataset to find the categories of products a specific customer has ordered and then recommends different products from these same categories. The function recommend_products can be used to get a list of recommendations for any customer in the dataset.

In [11]:
# Merging datasets to get the user, product, and category information together
merged_df = pd.merge(orders_df, order_items_df, on='order_id')
merged_df = pd.merge(merged_df, products_df, on='product_id')

In [12]:
def recommend_products(customer_id, num_recommendations=5):       
     # This function recommends products to a customer based on their previous orders.

     # Find products previously ordered by the customer by filtering the merged DataFrame.
     # It selects the 'product_category_name' for orders where 'customer_id' matches the given customer_id.
    
     # Find products previously ordered by the customer
    ordered_products = merged_df[merged_df['customer_id'] == customer_id]['product_category_name']
    # Recommend other products in the same categories.
    # This is done by selecting entries from the merged DataFrame where 'product_category_name' is one of the categories in 'ordered_products'.
    # It then selects the 'product_id' of these products.
    
    
    # Returning the unique product IDs of the recommendations, limited to the number specified by 'num_recommendations'.
    recommendations = merged_df[merged_df['product_category_name'].isin(ordered_products)]
    return recommendations['product_id'].unique()[:num_recommendations]

This output represents the recommendation system's suggestions for a customer based on their previous purchase history. Specifically, it recommends products that are in the same categories as those previously ordered by the customer.

So, when we call recommend_products(merged_df['customer_id'][0]), it's returning the top 5 recommended product IDs for the first customer in your merged dataset, based on the categories of products they have bought before. We can do it for other customers also , not only for the first one. 

In [23]:
# Example usage of the function: recommend products to the first customer in the merged dataset.
recommend_products(merged_df['customer_id'][0])

array(['595fac2a385ac33a80bd5114aec74eb8',
       '72a97c271b2e429974398f46b93ae530',
       '009c09f439988bc06a93d6b8186dce73',
       '00baba5b58e274d0332a0c8a0a66f877',
       'c6c1f263e076bd9c1f1640250a5d0c29'], dtype=object)

#### Recommendations based on Cosine Similarity  

This code creates a recommendation system based on item similarity using cosine similarity. It first processes the data to create a user-item rating matrix, computes the item similarity matrix, and then defines a function to recommend similar products for a given product ID. The function selects the top N similar products based on their cosine similarity scores.

In [24]:
# Merge Review Scores with the full order product dataframe
merged_df = pd.merge(full_order_product_df, reviews_df[['order_id', 'review_score']], on='order_id', how='left')

# Create a 'rating' column in merged_df. Missing review scores are filled with the average score
# Handling missing values - fill with average score or a predefined score
default_rating = merged_df['review_score'].mean()
merged_df['rating'] = merged_df['review_score'].fillna(default_rating)


# This line seems redundant as the merge operation and rating creation are repeated from the previous steps
merged_df = pd.merge(full_order_product_df, reviews_df[['order_id', 'review_score']], on='order_id', how='left')

# Repeating the creation of the 'rating' column with missing values filled with the average review score
default_rating = merged_df['review_score'].mean()
merged_df['rating'] = merged_df['review_score'].fillna(default_rating)

# Selecting the top 2000 products based on the number of orders
top_products = merged_df['product_id'].value_counts().head(2000).index
filtered_df = merged_df[merged_df['product_id'].isin(top_products)]




In [15]:
# Creating a pivot table with 'customer_id' as rows, 'product_id' as columns, and 'rating' as values, filling missing values with 0
pivot_table = filtered_df.pivot_table(index='customer_id', columns='product_id', values='rating').fillna(0)
pivot_table_sparse = csr_matrix(pivot_table)

In [16]:
# Computing cosine similarity between items using the sparse matrix of the pivot table
item_similarity = cosine_similarity(pivot_table_sparse.T, dense_output=False)

# Creating a mapping from product IDs to internal indices used in the similarity matrix
unique_product_ids = filtered_df['product_id'].unique()
product_id_mapping = pd.Series(index=unique_product_ids, data=range(len(unique_product_ids)))

In [17]:
def recommend_products_cosine(product_id, top_n=5):
    # Function to recommend products based on cosine similarity

    # Check if the product ID exists in the mapping, raise an error if not found
    if product_id not in product_id_mapping.index:
        raise ValueError(f"Product ID '{product_id}' not found in the mapping.")
    
    # Find the internal index of the product in the similarity matrix
    product_idx = product_id_mapping[product_id_mapping.index == product_id].iloc[0]
    
    print("Product ID:", product_id)
    print("Internal Index:", product_idx)
    
    # Get similarity values for the product
    similarity_values = item_similarity[product_idx].toarray().flatten()
    
    # Get indices of top similar products, excluding the product itself
    similar_product_indices = similarity_values.argsort()[::-1][1:top_n+1]  # Exclude the product itself
    
    # Convert these indices back to product IDs
    similar_products = product_id_mapping.iloc[similar_product_indices].index
    
    return similar_products

We call the function with the randomly selected product ID. This function is designed to find products similar to given product_ID.

In the outpu the product_ID selected randomly for which recommendations are being generated. It's a unique identifier for a product in our dataset. The function then uses this internal indec to find the cosine similarity scores of this product with all other products. It sorts these scores to find the top 'N' similar products, which are then returned as recommendations.

The recommendations variable will hold the product ID of the top N similar products. These are the products most similar to the randomly chocen one, as determined by the cosine similarity in their features (in this case, based on customer ratings). 


This output is part of the testing process to ensure that your recommendation system is functioning as expected. By using a random product Id , we can simulate how can the system would operate in a real-world scenario, recommending products similar to any given product. 

In [25]:
# Testing the function with a random product ID from the filtered dataset
random_product_id = filtered_df['product_id'].sample().iloc[0]
recommendations = recommend_products_cosine(random_product_id)

Product ID: 810e2944bca9850b934e1570ba372e7d
Internal Index: 1318


#### Recommendations based on Collaborative Filtering

This code implements a collaborative filtering recommendation system. It first creates a sparse matrix to represent customer-product interactions and computes item similarity based on these interactions. The recommend_products_collaborative function then recommends products for a given customer ID based on these interactions and similarity scores, excluding products that the customer has already purchased. The test case at the end demonstrates how the function can be used to generate recommendations for a random customer from your dataset.


In [19]:
# Create a sparse matrix for customer-product interactions
customer_ids = pd.factorize(filtered_df['customer_id'])[0]
product_ids = pd.factorize(filtered_df['product_id'])[0]
ratings = filtered_df['rating'].values


# Creating a sparse matrix with customers as rows, products as columns, and ratings as values
pivot_table_sparse = csr_matrix((ratings, (customer_ids, product_ids)), shape=(len(np.unique(customer_ids)), len(np.unique(product_ids))))

# Create a mapping from customer and product IDs to internal indices
customer_id_mapping = pd.Series(index=filtered_df['customer_id'].unique(), data=np.unique(customer_ids))
product_id_mapping = pd.Series(index=filtered_df['product_id'].unique(), data=np.unique(product_ids))

# Compute cosine similarity between items based on the sparse matrix
item_similarity = cosine_similarity(pivot_table_sparse.T, dense_output=False)

In [20]:
def recommend_products_collaborative(customer_id, top_n=5):
    # Function to recommend products to a customer based on collaborative filtering

    # Check if the customer ID exists in the mapping, raise an error if not found
    if customer_id not in customer_id_mapping.index:
    if customer_id not in customer_id_mapping.index:
        raise ValueError(f"Customer ID '{customer_id}' not found in the mapping.")

    # Find the internal index of the customer in the similarity matrix
    customer_idx = customer_id_mapping[customer_id]

    # Get ratings (interactions) for the customer
    customer_ratings = pivot_table_sparse[customer_idx].reshape(1, -1)

    print("Dimensions of item_similarity:", item_similarity.shape)
    print("Dimensions of customer_ratings:", customer_ratings.shape)

    # Compute the weighted sum of similarity values for all items
    weighted_sum = item_similarity.dot(customer_ratings.T)

    # Get indices of products already purchased by the customer
    purchased_product_indices = product_ids[customer_ratings.nonzero()[1]]

    # Exclude already purchased products from the recommendations by setting their scores to 0
    weighted_sum[purchased_product_indices] = 0
    print(weighted_sum)
    print('--------')
    print(top_n)

    
    # Determine the top N products based on the similarity scores
    top_n = min(top_n, weighted_sum.shape[0])
    kth = min(top_n, weighted_sum.shape[0]) - 1
    kth = max(0, kth)  # Ensure kth is not negative
    
    # Select the top N products, handle cases where kth might be out of bounds
    if kth < len(weighted_sum.data):
        top_product_indices = np.argpartition(weighted_sum.data, kth)[:kth + 1]
        top_products = product_id_mapping.iloc[weighted_sum.indices[top_product_indices]].index
    else:
        # Handle the case when kth is out of bounds
        top_products = product_id_mapping.index
    
    return top_products[:top_n]

In the test of our collaborative filtering recommendation system, a random customer ID was selected from the dataset to simulate a real-world scenario. The recommend_products_collaborative function generated personalized product recommendations based on this customer's previous interactions and the preferences of similar customers. The output included the dimensions of the similarity matrices and a list of five recommended product IDs. This brief test demonstrated the system's ability to leverage user-item interactions to provide tailored recommendations, validating its functionality and effectiveness.

In [22]:
# Testing the function with a random customer ID from the filtered dataset

random_customer_id = filtered_df['customer_id'].sample().iloc[0]
recommendations_collaborative = recommend_products_collaborative(random_customer_id)
print("Collaborative Filtering Recommendations:", recommendations_collaborative)

Dimensions of item_similarity: (2000, 2000)
Dimensions of customer_ratings: (1, 2000)
  (741, 0)	0.0
  (927, 0)	1.0324840195920109
  (1287, 0)	30.000000000000007
--------
5
Collaborative Filtering Recommendations: Index(['595fac2a385ac33a80bd5114aec74eb8', '65266b2da20d04dbe00c5c2d3bb7859e',
       '060cb19345d90064d1015407193c233d', '4520766ec412348b8d4caa5e8a18c464',
       '08574b074924071f4e201e151b152b4e'],
      dtype='object')
