In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
     

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_here in os.walk('/kaggle/input'):
    paths_here = [os.path.join(folder, entry) for entry in files_here]
    for path in paths_here:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-transactions-dataset/Products.csv
/kaggle/input/ecommerce-transactions-dataset/Customers.csv
/kaggle/input/ecommerce-transactions-dataset/Transactions.csv


In [2]:
# Load the three raw tables (customers, products, transactions) in one pass.
Customers_data, Products_data, Transactions_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ecommerce-transactions-dataset/Customers.csv",
        "/kaggle/input/ecommerce-transactions-dataset/Products.csv",
        "/kaggle/input/ecommerce-transactions-dataset/Transactions.csv",
    )
)

In [3]:
# Enrich every transaction row with its product attributes, then with the
# attributes of the purchasing customer. Left joins keep all transactions
# even when a ProductID / CustomerID has no match in the lookup table.
Transactions_products = Transactions_data.merge(Products_data, how='left', on='ProductID')
Merged_data = Transactions_products.merge(Customers_data, how='left', on='CustomerID')


In [4]:
# Per-customer features derived from the merged transaction table.
rows_by_customer = Merged_data.groupby('CustomerID')

# Sum of TotalValue over all of a customer's rows.
Customers_spending = rows_by_customer['TotalValue'].sum().rename('TotalSpent').reset_index()

# Mean of Quantity over all of a customer's rows.
Customers_quantity = rows_by_customer['Quantity'].mean().rename('AvgQuantity').reset_index()

# Number of rows per (customer, category) pair ...
category_counts = (
    Merged_data
    .groupby(['CustomerID', 'Category'])
    .size()
    .reset_index(name='Counts')
)
# ... then keep, for each customer, the row with the highest count.
# idxmax returns the first maximal row, so ties go to the category that
# appears first in the (sorted) grouped output.
top_category_rows = category_counts.groupby('CustomerID')['Counts'].idxmax()
Customers_favorite_category = category_counts.loc[top_category_rows, ['CustomerID', 'Category']]


In [5]:
# One-hot encode the customer's Region; drop_first=True omits the first
# level so the remaining indicator columns are not perfectly collinear.
# NOTE(review): no later cell visible here reads Customers_value — confirm
# whether the region indicators were meant to join the customer profile.
Customers_value = pd.get_dummies(
    data=Customers_data,
    columns=['Region'],
    drop_first=True,
)

In [6]:
# Assemble one profile row per customer: spending, average quantity and
# favourite category. The default inner join keeps only customers present
# in all three feature tables.
Customers_profile = (
    Customers_spending
    .merge(Customers_quantity, on='CustomerID')
    .merge(Customers_favorite_category, on='CustomerID')
)

In [7]:
# Standardise the two numeric features (zero mean, unit variance) so that
# neither dominates the similarity computation purely because of its scale.
profile_features = Customers_profile[['TotalSpent', 'AvgQuantity']]
scaler = StandardScaler()
Customers_profile_scaled = scaler.fit_transform(profile_features)

In [8]:
# Pairwise cosine similarity between the scaled profile rows: entry [i, j]
# compares the customers at positions i and j of Customers_profile.
Similarity_matrix = cosine_similarity(X=Customers_profile_scaled)

In [9]:
# Build, for every customer, the list of its `top_n` most similar customers
# as (CustomerID, similarity score) tuples, ordered from most to least similar.
top_n = 3
Lookalike_map = {}

# Plain list of IDs in profile row order; row i of Similarity_matrix belongs
# to Customer_ids[i]. A list lookup avoids building a row Series per neighbour.
Customer_ids = Customers_profile['CustomerID'].tolist()

for idx, Customer_id in enumerate(Customer_ids):
    # Exclude the customer's own entry by position. Slicing off the first
    # sorted element is not safe: the sort is stable, so an earlier customer
    # whose cosine score ties with (or, through floating-point error,
    # marginally exceeds) the self-similarity would sort ahead of it, leaving
    # the customer listed as their own lookalike.
    Similarity_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(Similarity_matrix[idx])
        if other_idx != idx
    ]

    Similarity_scores = sorted(Similarity_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Convert to a builtin float before rounding so the value is serialised
    # as e.g. 0.9876 rather than as a NumPy scalar repr (np.float64(0.9876)).
    Similar_customers = [
        (Customer_ids[other_idx], round(float(score), 4))
        for other_idx, score in Similarity_scores
    ]
    Lookalike_map[Customer_id] = Similar_customers

In [10]:
# Collect the lookalike lists of the first 20 customers (in profile order)
# into [CustomerID, lookalikes] records.
Lookalike_results = [
    [profile_customer, Lookalike_map[profile_customer]]
    for profile_customer in Customers_profile['CustomerID'][:20]
]

# One row per customer; the Lookalikes column holds the list of
# (CustomerID, score) tuples and is written to the CSV as its string form.
Lookalike_df = pd.DataFrame(data=Lookalike_results, columns=['CustomerID', 'Lookalikes'])
Lookalike_df.to_csv('Debjani_Ghosh_Lookalike.csv', index=False)