# Loading Dependencies and dataset

In [1]:
import pandas as pd

df = pd.read_csv('int_online_tx.csv')
print(df.head())

   InvoiceNo StockCode                      Description  Quantity  \
0     536370     22728        ALARM CLOCK BAKELIKE PINK        24   
1     536370     22727        ALARM CLOCK BAKELIKE RED         24   
2     536370     22726       ALARM CLOCK BAKELIKE GREEN        12   
3     536370     21724  PANDA AND BUNNIES STICKER SHEET        12   
4     536370     21883                 STARS GIFT TAPE         24   

    InvoiceDate  UnitPrice  CustomerID Country  
0  12/1/10 8:45       3.75     12583.0  France  
1  12/1/10 8:45       3.75     12583.0  France  
2  12/1/10 8:45       3.75     12583.0  France  
3  12/1/10 8:45       0.85     12583.0  France  
4  12/1/10 8:45       0.65     12583.0  France  


# Data Processing

### Eliminating data with item returns (negative quantity)

In [2]:
df = df.loc[df['Quantity'] > 0]

### Identify null components

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35116 entries, 0 to 35115
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    35116 non-null  int64  
 1   StockCode    35116 non-null  object 
 2   Description  35116 non-null  object 
 3   Quantity     35116 non-null  int64  
 4   InvoiceDate  35116 non-null  object 
 5   UnitPrice    35116 non-null  float64
 6   CustomerID   33698 non-null  float64
 7   Country      35116 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 2.4+ MB


In [4]:
# identify missing values
df.isnull().sum()

InvoiceNo         0
StockCode         0
Description       0
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID     1418
Country           0
dtype: int64

### Handling Nan CustomerID

In [5]:
# drop missing values
df['CustomerID'].isna().sum()
df = df.dropna(subset=['CustomerID'])

### Creating the customer-item matrix

In [6]:
customer_item_matrix = df.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum'
)
customer_item_matrix.loc[12481:].head()

StockCode,10002,10120,10125,10133,10135,11001,15034,15036,15039,15044A,...,90192,90201A,90201B,90201C,90201D,90202D,90204,C2,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12481.0,,,,,,,,36.0,,,...,,,,,,,,,,32.0
12483.0,,,,,,,,,,,...,,,,,,,,,,16.0
12484.0,,,,,,16.0,,,,,...,,,,,,,,,,21.0
12488.0,,,,,10.0,,,,,,...,,,,,,,,,,3.0
12489.0,,,,,,,,,,,...,,,,,,,,,,2.0


In [7]:
print(customer_item_matrix.shape)
customer_item_matrix = customer_item_matrix.applymap(lambda x: 1 if x > 0 else 0)

(414, 2574)


  customer_item_matrix = customer_item_matrix.applymap(lambda x: 1 if x > 0 else 0)


# Collaborative Filtering

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

## User based collaborative filtering

In [9]:
user_user_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix))
user_user_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,404,405,406,407,408,409,410,411,412,413
0,1.0,0.063022,0.04613,0.047795,0.038484,0.0,0.025876,0.136641,0.094742,0.060262,...,0.013167,0.0,0.167284,0.102876,0.0,0.154371,0.0,0.053635,0.035979,0.0
1,0.063022,1.0,0.024953,0.051709,0.027756,0.0,0.027995,0.118262,0.146427,0.0,...,0.02849,0.0,0.0,0.182125,0.0,0.037113,0.0,0.058026,0.0,0.0
2,0.04613,0.024953,1.0,0.056773,0.137137,0.0,0.030737,0.032461,0.144692,0.153389,...,0.046921,0.044237,0.056773,0.111091,0.0,0.081497,0.0,0.063709,0.064106,0.20937
3,0.047795,0.051709,0.056773,1.0,0.031575,0.0,0.0,0.0,0.033315,0.02119,...,0.06482,0.0,0.0,0.092082,0.0,0.06333,0.0,0.033005,0.088561,0.0
4,0.038484,0.027756,0.137137,0.031575,1.0,0.0,0.102568,0.036108,0.089414,0.068248,...,0.069589,0.0,0.052626,0.086499,0.0,0.124646,0.0,0.088582,0.095077,0.0


In [10]:
#Renaming index and column names

user_user_sim_matrix.columns = customer_item_matrix.index

user_user_sim_matrix['CustomerID'] = customer_item_matrix.index
user_user_sim_matrix = user_user_sim_matrix.set_index('CustomerID')
user_user_sim_matrix.head()


CustomerID,12347.0,12348.0,12349.0,12350.0,12352.0,12353.0,12354.0,12355.0,12356.0,12357.0,...,16320.0,16321.0,17097.0,17404.0,17443.0,17444.0,17508.0,17828.0,17829.0,17844.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347.0,1.0,0.063022,0.04613,0.047795,0.038484,0.0,0.025876,0.136641,0.094742,0.060262,...,0.013167,0.0,0.167284,0.102876,0.0,0.154371,0.0,0.053635,0.035979,0.0
12348.0,0.063022,1.0,0.024953,0.051709,0.027756,0.0,0.027995,0.118262,0.146427,0.0,...,0.02849,0.0,0.0,0.182125,0.0,0.037113,0.0,0.058026,0.0,0.0
12349.0,0.04613,0.024953,1.0,0.056773,0.137137,0.0,0.030737,0.032461,0.144692,0.153389,...,0.046921,0.044237,0.056773,0.111091,0.0,0.081497,0.0,0.063709,0.064106,0.20937
12350.0,0.047795,0.051709,0.056773,1.0,0.031575,0.0,0.0,0.0,0.033315,0.02119,...,0.06482,0.0,0.0,0.092082,0.0,0.06333,0.0,0.033005,0.088561,0.0
12352.0,0.038484,0.027756,0.137137,0.031575,1.0,0.0,0.102568,0.036108,0.089414,0.068248,...,0.069589,0.0,0.052626,0.086499,0.0,0.124646,0.0,0.088582,0.095077,0.0


In [11]:
user_user_sim_matrix.loc[12350.0].sort_values(ascending=False).head(10)

CustomerID
12350.0    1.000000
12414.0    0.181902
12652.0    0.175035
12603.0    0.171499
12814.0    0.171499
12791.0    0.171499
12475.0    0.161690
12735.0    0.157378
12449.0    0.156290
12684.0    0.155268
Name: 12350.0, dtype: float64

### Making Recommendations

In [12]:
user_user_sim_matrix.loc[12350.0].sort_values(ascending=False)
items_bought_by_A = customer_item_matrix.loc[12350.0][customer_item_matrix.loc[12350.0]>0]
print("Items Bought by A: ")
print(items_bought_by_A)

Items Bought by A: 
StockCode
20615     1
20652     1
21171     1
21832     1
21864     1
21866     1
21908     1
21915     1
22348     1
22412     1
22551     1
22557     1
22620     1
79066K    1
79191C    1
84086C    1
POST      1
Name: 12350.0, dtype: int64


In [15]:
items_bought_by_B = customer_item_matrix.loc[17935.0][customer_item_matrix.loc[17935.0]>0]
print("Items bought by B:")
print(items_bought_by_B)

print()

items_to_recommend_to_B = set(items_bought_by_A.index) - set(items_bought_by_B.index)
print("Items to Recommend to B ")
print(items_to_recommend_to_B)
df.loc[df['StockCode'].isin(items_to_recommend_to_B),['StockCode', 'Description']].drop_duplicates().set_index('StockCode')

KeyError: 17935

## Item-based collaborative filtering

In [17]:
item_item_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix.T))
item_item_sim_matrix.columns = customer_item_matrix.T.index

item_item_sim_matrix['StockCode'] = customer_item_matrix.T.index
item_item_sim_matrix = item_item_sim_matrix.set_index('StockCode')

In [18]:
print(item_item_sim_matrix)

StockCode     10002     10120     10125    10133    10135     11001    15034  \
StockCode                                                                      
10002      1.000000  0.000000  0.111111  0.00000  0.00000  0.000000  0.00000   
10120      0.000000  1.000000  0.000000  0.00000  0.00000  0.000000  0.00000   
10125      0.111111  0.000000  1.000000  0.00000  0.00000  0.000000  0.00000   
10133      0.000000  0.000000  0.000000  1.00000  0.25000  0.000000  0.00000   
10135      0.000000  0.000000  0.000000  0.25000  1.00000  0.188982  0.00000   
...             ...       ...       ...      ...      ...       ...      ...   
90202D     0.000000  0.000000  0.333333  0.00000  0.00000  0.000000  0.00000   
90204      0.000000  0.000000  0.333333  0.00000  0.00000  0.000000  0.00000   
C2         0.000000  0.000000  0.000000  0.00000  0.00000  0.169031  0.00000   
M          0.000000  0.242536  0.080845  0.00000  0.00000  0.000000  0.00000   
POST       0.152693  0.000000  0.171780 

### Making Recommendations

In [20]:
top_10_similar_items = list(item_item_sim_matrix.loc[23166.0].sort_values(ascending=False).iloc[:10].index)

print(top_10_similar_items)
print()
print(df.loc[
    df['StockCode'].isin(top_10_similar_items),
    ['StockCode', 'Description']
].drop_duplicates().set_index('StockCode').loc[top_10_similar_items])

KeyError: 23166.0