## Notebook of Testing three different models to see which worked best with my product recommender machine.

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Decision Tree

In [2]:
productsreviews_cleaned_filled = pd.read_csv('productsreviews_cleaned_filled.csv')

# Display the first few rows of the DataFrame to ensure it's loaded correctly
print(productsreviews_cleaned_filled.head())

  product_id               product_name brand_name  loves_count  rating  \
0    P473671    Fragrance Discovery Set      19-69         6320  3.6364   
1    P473668    La Habana Eau de Parfum      19-69         3827  4.1538   
2    P473662  Rainbow Bar Eau de Parfum      19-69         3253  4.2500   
3    P473660       Kasbah Eau de Parfum      19-69         3018  4.4762   
4    P473658  Purple Haze Eau de Parfum      19-69         2691  3.2308   

   reviews  price_usd primary_category secondary_category  tertiary_category  
0     11.0       35.0        Fragrance  Value & Gift Sets  Perfume Gift Sets  
1     13.0      195.0        Fragrance              Women            Perfume  
2     16.0      195.0        Fragrance              Women            Perfume  
3     21.0      195.0        Fragrance              Women            Perfume  
4     13.0      195.0        Fragrance              Women            Perfume  


In [3]:
# Feature selection
selected_features = ['price_usd', 'rating', 'reviews']

# Define X (features) and y (target)
X = productsreviews_cleaned_filled[selected_features]
y = productsreviews_cleaned_filled['loves_count']


In [4]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Train the Decision Tree Regressor
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)


In [6]:
# Predict on test set
y_pred = tree_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Decision Tree Regression - Mean Squared Error: {mse}")
print(f"Decision Tree Regression - R-squared: {r2}")

Decision Tree Regression - Mean Squared Error: 5450393036.033091
Decision Tree Regression - R-squared: -0.2467196368664606


# Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Evaluate the model
y_pred_rf = rf_regressor.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Regression - Mean Squared Error: {mse_rf}")
print(f"Random Forest Regression - R-squared: {r2_rf}")

# Define recommendation function
def recommend_products_with_rf(category, num_recommendations=5):
    filtered_products = productsreviews_cleaned_filled[productsreviews_cleaned_filled['tertiary_category'].str.contains(category, case=False, na=False)]
    
    if filtered_products.empty:
        print(f"No products found for category '{category}'.")
        return pd.DataFrame()
    
    X_filtered = filtered_products[selected_features]
    filtered_products['predicted_loves_count'] = rf_regressor.predict(X_filtered)
    
    top_products = filtered_products.sort_values(by='predicted_loves_count', ascending=False).head(num_recommendations)
    
    return top_products[['product_name', 'brand_name', 'price_usd', 'rating']]

# User interaction
def get_user_input_with_rf():
    while True:
        category = input("Enter the specific product type you're interested in (e.g., lipstick, serum): ")
        num_recommendations = int(input("How many recommendations would you like? "))
        
        recommendations = recommend_products_with_rf(category, num_recommendations)
        if not recommendations.empty:
            print(f"\nTop {num_recommendations} products in the specific category '{category}':")
            print(recommendations[['product_name', 'brand_name', 'price_usd', 'rating']])
        
        more_recs = input("Would you like more recommendations? (yes/no): ").strip().lower()
        if more_recs == 'yes':
            same_or_new = input("Would you like recommendations for the same category or a new one? (same/new): ").strip().lower()
            if same_or_new == 'same':
                continue
            elif same_or_new == 'new':
                continue
            else:
                print("Invalid input. Exiting.")
                break
        elif more_recs == 'no':
            print("Thank you for using the recommender system!")
            break
        else:
            print("Invalid input. Exiting.")
            break

# Main process
get_user_input_with_rf()


Random Forest Regression - Mean Squared Error: 2036515611.2064462
Random Forest Regression - R-squared: 0.5341686394924485


Enter the specific product type you're interested in (e.g., lipstick, serum):  lipstick
How many recommendations would you like?  2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_products['predicted_loves_count'] = rf_regressor.predict(X_filtered)



Top 2 products in the specific category 'lipstick':
                         product_name               brand_name  price_usd  \
6434  Cream Lip Stain Liquid Lipstick       SEPHORA COLLECTION       15.0   
250                   Liquid Lipstick  Anastasia Beverly Hills       20.0   

      rating  
6434  4.3201  
250   3.8268  


Would you like more recommendations? (yes/no):  no


Thank you for using the recommender system!


# Pasting Method

In [9]:
# Feature selection
selected_features = ['price_usd', 'rating', 'reviews']

# Define X (features) and y (target)
X = productsreviews_cleaned_filled[selected_features]
y = productsreviews_cleaned_filled['loves_count']


In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np

# Number of subsets and models
n_subsets = 10

# Create subsets of the data
subset_size = len(X) // n_subsets
subsets_X = []
subsets_y = []

for i in range(n_subsets):
    X_subset = X.iloc[i * subset_size:(i + 1) * subset_size]
    y_subset = y.iloc[i * subset_size:(i + 1) * subset_size]
    subsets_X.append(X_subset)
    subsets_y.append(y_subset)
    
# Train a Decision Tree model for each subset
trees = []
for X_subset, y_subset in zip(subsets_X, subsets_y):
    tree = DecisionTreeRegressor(random_state=42)
    tree.fit(X_subset, y_subset)
    trees.append(tree)


In [11]:
# Function to get average predictions from multiple models
def average_predictions(trees, X):
    predictions = np.array([tree.predict(X) for tree in trees])
    avg_predictions = np.mean(predictions, axis=0)
    return avg_predictions

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get average predictions on the test set
y_pred_pasting = average_predictions(trees, X_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mse_pasting = mean_squared_error(y_test, y_pred_pasting)
r2_pasting = r2_score(y_test, y_pred_pasting)
print(f"Pasting - Mean Squared Error: {mse_pasting}")
print(f"Pasting - R-squared: {r2_pasting}")

Pasting - Mean Squared Error: 1826009526.5866172
Pasting - R-squared: 0.5823196751407739


## Scores and Decision to use Pasting Method

Decision Tree Regression - Mean Squared Error: 5450393036.033091
Decision Tree Regression - R-squared: -0.2467196368664606

Random Forest Regression - Mean Squared Error: 2036515611.2064462
Random Forest Regression - R-squared: 0.5341686394924485

Pasting - Mean Squared Error: 1826009526.5866172
Pasting - R-squared: 0.5823196751407739

________________________________________

- The Decision Tree Regression model is not suitable for your recommender system due to its high error and poor explanatory power.
- The Random Forest model is recommended because it performs well and is a good candidate for your product recommender system. It provides a balance between accuracy and interpretability.
- The Pasting method is highly recommended as it outperforms both the Decision Tree and Random Forest in terms of both accuracy and explanatory power. It is the best choice for your product recommender system. The R-squared value of about 0.58 is the highest, suggesting that this model explains 58% of the variance in the target variable. This makes it the best model for predicting the loves_count based on your dataset.