#### Network Analysis on "online retail.csv".

In [1]:
import pandas as pd
import networkx as nx
from itertools import combinations

###read the csv
x = pd.read_csv('Online Retail.csv')

### collapse to a series of lists
data = x.groupby('InvoiceNo').apply(lambda x: list(x['StockCode']))
### we can ignore 1 length baskets - no connections
data = data[data.apply(lambda x:len(x) > 1)]

#holding dict
mydict = {}

for i in range(len(data)):
    #2 combinations of baskets
    for j,k in combinations(data[i],2):
        #setdefauly, and append a 1 for each time it is found
        mydict.setdefault((j,k),[]).append(1)

#make a graph
G = nx.Graph()
#add the edges, if weight is greater than 20 - ie 20 transactions
_ = [G.add_edge(i[0], i[1], weight = len(j)) for i,j in mydict.items() if len(j) > 20]


In [2]:
x.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 8:26,3.39,17850.0,United Kingdom


In [3]:
nx.draw(G, with_labels=True)

#### What are the ten most important items by pagerank?


In [4]:
dict_G = nx.pagerank(G)

# Organizing by pagerank value and then sort by descending
sorted_G = sorted(dict_G.items(), key=lambda kv: kv[1], reverse=True)

for i in range(10):
    print(i, x['Description'][x['StockCode']==sorted_G[i][0]].iloc[0])

0 DOTCOM POSTAGE
1 WHITE HANGING HEART T-LIGHT HOLDER
2 JUMBO BAG RED RETROSPOT
3 REGENCY CAKESTAND 3 TIER
4 SMALL POPCORN HOLDER
5 JAM MAKING SET PRINTED
6 PACK OF 72 RETROSPOT CAKE CASES
7 GREEN REGENCY TEACUP AND SAUCER
8 SUKI  SHOULDER BAG
9 NATURAL SLATE HEART CHALKBOARD 


#### Find the network's connectedness and how many subclusters there are (if any).

In [5]:
len(list(nx.connected_component_subgraphs(G)))

12

#### Find cliques of items where the size is at least 5

In [6]:
z = nx.find_cliques(G)

z_list = []
while len(z_list) < 5:
    z_i = next(z)
    if len(z_i) >= 5:
        z_list.append(z_i)

z_list

[['22093', '85123A', '21034', '22720', '23203', '84596B', '22197'],
 ['22093',
  '85123A',
  '21034',
  '22720',
  '22094',
  '22091',
  '22197',
  '22098',
  '84596B'],
 ['22093', '85123A', '21034', '22720', '22094', '22091', '22197', '22099'],
 ['22093', '85123A', '21034', '22720', '22094', '22091', '22097', '22098'],
 ['22093', '85123A', '22096', '22098', '22094', '22197']]

In [7]:
for j in range(len(z_list)):
    for i in range(len(z_list[j])):
        print(f'Cluster {j}', f'Item {i}', x['Description'][x['StockCode']==z_list[j][i]].iloc[0])
    print('')

Cluster 0 Item 0 MOTORING TISSUE BOX
Cluster 0 Item 1 WHITE HANGING HEART T-LIGHT HOLDER
Cluster 0 Item 2 REX CASH+CARRY JUMBO SHOPPER
Cluster 0 Item 3 SET OF 3 CAKE TINS PANTRY DESIGN 
Cluster 0 Item 4 mailout
Cluster 0 Item 5 SMALL DOLLY MIX DESIGN ORANGE BOWL
Cluster 0 Item 6 SMALL POPCORN HOLDER

Cluster 1 Item 0 MOTORING TISSUE BOX
Cluster 1 Item 1 WHITE HANGING HEART T-LIGHT HOLDER
Cluster 1 Item 2 REX CASH+CARRY JUMBO SHOPPER
Cluster 1 Item 3 SET OF 3 CAKE TINS PANTRY DESIGN 
Cluster 1 Item 4 RED RETROSPOT TISSUE BOX
Cluster 1 Item 5 EMPIRE TISSUE BOX
Cluster 1 Item 6 SMALL POPCORN HOLDER
Cluster 1 Item 7 BOUDOIR SQUARE TISSUE BOX
Cluster 1 Item 8 SMALL DOLLY MIX DESIGN ORANGE BOWL

Cluster 2 Item 0 MOTORING TISSUE BOX
Cluster 2 Item 1 WHITE HANGING HEART T-LIGHT HOLDER
Cluster 2 Item 2 REX CASH+CARRY JUMBO SHOPPER
Cluster 2 Item 3 SET OF 3 CAKE TINS PANTRY DESIGN 
Cluster 2 Item 4 RED RETROSPOT TISSUE BOX
Cluster 2 Item 5 EMPIRE TISSUE BOX
Cluster 2 Item 6 SMALL POPCORN HOLDER


#### Create a recommender function which used edge weights to suggest a random item, given an input item and network.

In [67]:
def recommender(item, network):
    neighbor_list = list(G.neighbors(item))
    
    # Identify item from stock id
    item_desc = x['Description'][x['StockCode']==item].iloc[0]
    
    edge_data = []
    
    # Finding edge weights for all the neighbors around the item
    for neighbor in neighbor_list:
        edge_data.append([int(*list(G.get_edge_data(item, neighbor).values())), neighbor])
    
    chosen_item = sorted(edge_data, reverse=True)[0][1]
    
    # Identify chosen item from stock id
    chosen_item_desc = x['Description'][x['StockCode']==chosen_item].iloc[0]
    
    print(f'Based on the input item, {item_desc}, ' + f'the suggested item is: {chosen_item_desc}')


In [71]:
recommender('22197', G)

Based on the input item, SMALL POPCORN HOLDER, the suggested item is: PACK OF 72 RETROSPOT CAKE CASES
