In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the data
df = pd.read_csv('data test.csv', sep=',', encoding='iso-8859-1')

# Preprocess Data
df = df.dropna(subset=['Description'])
df['Description'] = df['Description'].astype(str)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Filter data for the latest month (test) and the three months before that (train)
latest_date = df['InvoiceDate'].max()
latest_month = latest_date.month
latest_year = latest_date.year

# Filter for testing data (latest month)
test_df = df[(df['InvoiceDate'].dt.year == latest_year) & (df['InvoiceDate'].dt.month == latest_month)]

# Filter for training data (three months before the latest month)
train_df = df[(df['InvoiceDate'] < pd.Timestamp(latest_year, latest_month, 1)) & 
              (df['InvoiceDate'] >= pd.Timestamp(latest_year, latest_month-3, 1))]

# Create a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_df['Description'])


In [2]:
# Compute Cosine Similarity
cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)

# Create a DataFrame to map product descriptions to indices in the training data
train_indices = pd.Series(train_df.index, index=train_df['Description']).to_dict()

In [5]:
# Create a DataFrame for storing input and recommendations
recommendation_results = pd.DataFrame(columns=['input', 'rec1', 'rec2', 'rec3'])


In [27]:
def recommend_products(input_product, top_n=3):
    if input_product not in train_indices:
        return [None, None, None]
    
    idx = train_indices[input_product]
    sim_scores = list(enumerate(cosine_sim_train[idx]))
    
    # Remove the input product from the list of scores
    sim_scores = [score for score in sim_scores if score[0] != idx]
    
    # Sort and select top N recommendations
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]
    
    product_indices = [i[0] for i in sim_scores]
    recommended_products = train_df.iloc[product_indices]
    
    return recommended_products['Description'].tolist()

# Create a list to store results
recommendation_results = []

# Generate recommendations for each product in the training set
for input_product in train_df['Description'].unique():
    recommendations = recommend_products(input_product)
    recommendation_results.append({
        'input': input_product,
        'rec1': recommendations[0] if len(recommendations) > 0 else None,
        'rec2': recommendations[1] if len(recommendations) > 1 else None,
        'rec3': recommendations[2] if len(recommendations) > 2 else None
    })

# Convert the list to a DataFrame
recommendation_results_df = pd.DataFrame(recommendation_results)


In [28]:
recommendation_results_df

Unnamed: 0,input,rec1,rec2,rec3
0,PINK POLKADOT BOWL,PINK POLKADOT BOWL,PINK POLKADOT BOWL,PINK POLKADOT BOWL
1,PARTY CONES CANDY ASSORTED,PARTY CONES CANDY ASSORTED,PARTY CONES CANDY ASSORTED,PARTY CONES CANDY ASSORTED
2,REGENCY CAKESTAND 3 TIER,REGENCY CAKESTAND 3 TIER,REGENCY CAKESTAND 3 TIER,REGENCY CAKESTAND 3 TIER
3,FAIRY CAKE DESIGN UMBRELLA,FAIRY CAKE DESIGN UMBRELLA,FAIRY CAKE DESIGN UMBRELLA,FAIRY CAKE DESIGN UMBRELLA
4,SET OF 3 REGENCY CAKE TINS,SET OF 3 REGENCY CAKE TINS,SET OF 3 REGENCY CAKE TINS,SET OF 3 REGENCY CAKE TINS
...,...,...,...,...
3353,AMETHYST HOOP EARRING FLORAL LEAF,CRYSTAL HOOP EARRING FLORAL LEAF,CRYSTAL HOOP EARRING FLORAL LEAF,CRYSTAL HOOP EARRING FLORAL LEAF
3354,SET 10 CARDS SWIRLY XMAS TREE 17104,SET 10 XMAS CARDS & BADGES 17070,SET 10 XMAS CARDS & BADGES 17070,SET 10 XMAS CARDS & BADGES 17070
3355,wet?,wet rusty,wet rusty,wet pallet
3356,lost??,???lost,lost in space,PINK POLKADOT BOWL


In [29]:
recommendation_results_df.iloc[50:100]

Unnamed: 0,input,rec1,rec2,rec3
50,SWEETHEART CERAMIC TRINKET BOX,SWEETHEART CERAMIC TRINKET BOX,SWEETHEART CERAMIC TRINKET BOX,SWEETHEART CERAMIC TRINKET BOX
51,STRAWBERRY CERAMIC TRINKET BOX,STRAWBERRY CERAMIC TRINKET BOX,STRAWBERRY CERAMIC TRINKET BOX,STRAWBERRY CERAMIC TRINKET BOX
52,SET/5 RED RETROSPOT LID GLASS BOWLS,SET/5 RED RETROSPOT LID GLASS BOWLS,SET/5 RED RETROSPOT LID GLASS BOWLS,SET/5 RED RETROSPOT LID GLASS BOWLS
53,PACK OF 20 NAPKINS PANTRY DESIGN,PACK OF 20 NAPKINS PANTRY DESIGN,PACK OF 20 NAPKINS PANTRY DESIGN,PACK OF 20 NAPKINS PANTRY DESIGN
54,PLASTERS IN TIN SKULLS,PLASTERS IN TIN SKULLS,PLASTERS IN TIN SKULLS,PLASTERS IN TIN SKULLS
55,GRAND CHOCOLATECANDLE,GRAND CHOCOLATECANDLE,GRAND CHOCOLATECANDLE,GRAND CHOCOLATECANDLE
56,CREAM SLICE FLANNEL PINK SPOT,CREAM SLICE FLANNEL PINK SPOT,CREAM SLICE FLANNEL PINK SPOT,CREAM SLICE FLANNEL PINK SPOT
57,DRAWER KNOB CRACKLE GLAZE IVORY,DRAWER KNOB CRACKLE GLAZE IVORY,DRAWER KNOB CRACKLE GLAZE IVORY,DRAWER KNOB CRACKLE GLAZE IVORY
58,DRAWER KNOB CERAMIC BLACK,DRAWER KNOB CERAMIC BLACK,DRAWER KNOB CERAMIC BLACK,DRAWER KNOB CERAMIC BLACK
59,DRAWER KNOB CERAMIC IVORY,DRAWER KNOB CERAMIC IVORY,DRAWER KNOB CERAMIC IVORY,DRAWER KNOB CERAMIC IVORY
