In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import kneighbors_graph
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from networkx.algorithms.community import (
    girvan_newman
)
from networkx import edge_betweenness_centrality as betweenness
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Load the tab-separated software review dataset into a DataFrame.
sf_df = pd.read_csv('../data/software/Digital_Software_v1_00.tsv', sep='\t')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
sf_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,17747349,R2EI7QLPK4LF7U,B00U7LCE6A,106182406,CCleaner Free [Download],Digital_Software,4,0,0,N,Y,Four Stars,So far so good,2015-08-31
1,US,10956619,R1W5OMFK1Q3I3O,B00HRJMOM4,162269768,ResumeMaker Professional Deluxe 18,Digital_Software,3,0,0,N,Y,Three Stars,Needs a little more work.....,2015-08-31
2,US,13132245,RPZWSYWRP92GI,B00P31G9PQ,831433899,Amazon Drive Desktop [PC],Digital_Software,1,1,2,N,Y,One Star,Please cancel.,2015-08-31
3,US,35717248,R2WQWM04XHD9US,B00FGDEPDY,991059534,Norton Internet Security 1 User 3 Licenses,Digital_Software,5,0,0,N,Y,Works as Expected!,Works as Expected!,2015-08-31
4,US,17710652,R1WSPK2RA2PDEF,B00FZ0FK0U,574904556,SecureAnywhere Intermet Security Complete 5 De...,Digital_Software,4,1,2,N,Y,Great antivirus. Worthless customer support,I've had Webroot for a few years. It expired a...,2015-08-31


In [4]:
# Count the distinct values in each column (e.g. number of unique products and customers).
sf_df.nunique()

marketplace               1
customer_id           94099
review_id            101836
product_id             2995
product_parent         2981
product_title          2639
product_category          1
star_rating               5
helpful_votes           283
total_votes             290
vine                      1
verified_purchase         2
review_headline       69680
review_body           98420
review_date            2198
dtype: int64

In [5]:
# Create a dictionary from the full review dataset (sf_df)
# key -> product_id
# value -> all review bodies written for that product, joined with single spaces
# Finally extract the list of unique products and the matching concatenated reviews.
#
# Missing review bodies (NaN) are skipped: storing them raw would leave a float in the
# dictionary (which breaks both string concatenation and the text vectorizer), and
# str()-coercing them would inject the literal token "nan" into the review text.
# Reviews are collected in lists and joined once to avoid repeated string concatenation.
reviews_by_product = {}
for product_id, review_body in zip(sf_df['product_id'], sf_df['review_body']):
    # setdefault guarantees every product gets an entry, even if all its reviews are missing
    product_reviews = reviews_by_product.setdefault(product_id, [])
    if pd.notna(review_body):
        product_reviews.append(str(review_body))

# Dict insertion order is preserved, so products keep their first-appearance order
# and products_list[i] always corresponds to reviews_list[i].
product_reviews_dict = {
    product_id: " ".join(product_reviews)
    for product_id, product_reviews in reviews_by_product.items()
}
products_list = list(product_reviews_dict.keys())
reviews_list = list(product_reviews_dict.values())

In [6]:
# Vectorize the concatenated review text with TfidfVectorizer (English stop words removed)
# to create a TF-IDF weighted document-term matrix (products x words in reviews)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(reviews_list)

In [24]:
# Show the shape and sparse representation of the product-by-term TF-IDF matrix.
X.shape, X

((2995, 44728),
 <2995x44728 sparse matrix of type '<class 'numpy.float64'>'
 	with 827003 stored elements in Compressed Sparse Row format>)

In [7]:
# Reduce the TF-IDF matrix to 2200 components with truncated SVD.
# random_state is pinned because the solver is randomized; without it the reduced
# vectors (and the neighbour graph, communities and recommendations built from them)
# change from one run to the next.
svd = TruncatedSVD(n_components=2200, n_iter=5, random_state=42)
X_red = svd.fit_transform(X)

In [8]:
# Create a sparse adjacency matrix for the graph using kneighbors_graph with the specified
# number of neighbors (10 per product); mode='distance' stores the distance to each
# neighbor as the entry value instead of a 0/1 connectivity flag.
X_graph_adj = kneighbors_graph(X_red, 10, mode='distance', n_jobs=-1)

In [9]:
# Create a networkX graph from the adjacency matrix
main_graph = nx.convert_matrix.from_scipy_sparse_array(X_graph_adj)

# Create a label mapping from the integer node ids created in the graph to product ids.
# NOTE(review): nodes are paired with products_list purely by position, so this assumes
# main_graph.nodes() iterates in matrix row order -- confirm against the networkx docs.
label_mapping = dict(zip(main_graph.nodes(), products_list))

# Relabel the indices in the graph with product ids
main_graph = nx.relabel_nodes(main_graph, label_mapping)
print(main_graph)

Graph with 2995 nodes and 25894 edges


In [10]:
# nx.draw(main_graph, with_labels=True)

In [11]:
# Compute PageRank scores for the nodes of a single community within a graph
def getPageRankOfCommunity(G, community_nodes):
    """Run PageRank on the subgraph of ``G`` induced by ``community_nodes``.

    Args:
        G: the full networkx graph.
        community_nodes: iterable of nodes that make up the community.

    Returns:
        The result of ``nx.pagerank`` on the induced subgraph, using a damping
        factor of 0.85 and the ``'weight'`` edge attribute.
    """
    # NOTE(review): the graph in this notebook is built with kneighbors_graph(mode='distance'),
    # so 'weight' holds distances; PageRank presumably treats larger weights as stronger
    # links -- confirm that weighting by distance is intended.
    return nx.pagerank(G.subgraph(community_nodes), alpha=0.85, weight='weight')

In [12]:
# function to return the recommendations for a product using the communities specified
# product_comm_dict and community_pagerank_dict can be results of any community detection algorithm (K-clique or girvan-newman)
def getProductRecommendations(product, product_comm_dict, community_pagerank_dict, top_n=5):
    """Recommend the best-scoring products from the same community as ``product``.

    Every other member of the product's community is scored as
    ``pagerank * cosine_similarity(candidate_vector, product_vector)`` and the
    highest-scoring candidates are returned. Vectors are read from the notebook
    globals ``X_red`` (one row per product) and ``products_list`` (row order).

    Args:
        product: id of the product to recommend for.
        product_comm_dict: mapping product id -> community id.
        community_pagerank_dict: mapping community id -> list of
            ``(product_id, pagerank)`` tuples.
        top_n: maximum number of recommendations to return (default 5).

    Returns:
        A list of at most ``top_n`` product ids, best score first; an empty
        list if ``product`` is not assigned to any community.

    Raises:
        ValueError: if ``product`` or a community member is not in ``products_list``.
    """
    if product not in product_comm_dict:
        return []

    # Build the product -> row lookup once instead of copying and scanning
    # products_list twice for every candidate. setdefault keeps the first
    # occurrence, matching what list.index would return.
    row_of = {}
    for row, product_id in enumerate(products_list):
        row_of.setdefault(product_id, row)

    def row_for(product_id):
        # Keep raising ValueError (as list.index did) for unknown products.
        try:
            return row_of[product_id]
        except KeyError:
            raise ValueError(f"{product_id!r} is not in products_list") from None

    # The query product's vector is the same for every candidate, so compute it once.
    target_vec = X_red[row_for(product)].reshape(1, -1)

    scored = []
    for candidate, pagerank in community_pagerank_dict[product_comm_dict[product]]:
        candidate_vec = X_red[row_for(candidate)].reshape(1, -1)
        # cosine_similarity returns a 1x1 matrix; take the scalar so scores sort as plain numbers.
        similarity = cosine_similarity(candidate_vec, target_vec)[0, 0]
        scored.append((candidate, pagerank * similarity))

    # Stable sort: candidates with equal scores keep their incoming order.
    scored.sort(key=lambda kv: kv[1], reverse=True)

    # Drop the query product itself and keep only the best top_n candidates.
    return [candidate for candidate, _ in scored if candidate != product][:top_n]

In [13]:
# Hold out 2% of the reviews as a test set. random_state is pinned so the same
# rows are held out on every run and the generated recommendations are reproducible.
train_df, test_df = train_test_split(sf_df, test_size=0.02, random_state=42)

In [14]:
# Preview a few of the held-out reviews.
test_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
6660,US,43872710,RPRMICTTMKCUJ,B00LU2XR8E,550306665,Kaspersky Internet Security Multi-Device 2015 ...,Digital_Software,5,0,1,N,Y,Five Stars,ok,2015-06-09
52719,US,49162818,RABURIKZYOFW9,B00H9A60O4,608720080,Avast Free Antivirus 2015 [Download],Digital_Software,5,0,0,N,N,Works well,Avast works well. It is updated daily. It ha...,2014-06-21
78565,US,19732263,R1QS9D9OEXZTH0,B00B1TGUMG,284323980,Microsoft Office Home and Student 2013 (1PC/1U...,Digital_Software,1,7,8,N,N,"Hard on the eyes, Not user friendly!!",DO NOT PURCHASE THIS VERSION OF MICROSOFT!!<br...,2013-09-01
29915,US,12164590,R2SDLCXWDO19M7,B003ZK51W4,470091820,Quicken Deluxe 2011 - [Old Version],Digital_Software,5,0,0,N,Y,Five Stars,Stuck on Quicken,2015-02-03
69883,US,34168310,R14D1PLCVAKK5S,B00B1TGUMG,284323980,Microsoft Office Home and Student 2013 (1PC/1U...,Digital_Software,5,0,1,N,Y,Works Great,We have been using this for about 3 months now...,2014-01-21


In [15]:
def getRecommendedProductNames(recommendations, original_df):
    """Pair each recommended product id with its product title.

    Args:
        recommendations: iterable of product ids.
        original_df: DataFrame with ``product_id`` and ``product_title`` columns.

    Returns:
        A list of ``(product_title, product_id)`` tuples in the same order as
        ``recommendations``. The title comes from the first row of
        ``original_df`` whose ``product_id`` matches.

    Raises:
        IndexError: if a product id has no matching row in ``original_df``.
    """
    def first_title(wanted_id):
        # Titles of every row for this product; the first one is used.
        titles = original_df.loc[original_df['product_id'] == wanted_id, 'product_title']
        return titles.iloc[0]

    return [(first_title(wanted_id), wanted_id) for wanted_id in recommendations]

In [16]:
# define a function that picks the most valuable edge, taking the edge weights into account
# the girvan newman algorithm then removes the edge returned by this function in each iteration
def most_central_edge(G):
    """Return the edge of ``G`` with the highest weighted edge betweenness centrality."""
    edge_scores = betweenness(G, weight='weight')
    # On ties the first edge in iteration order wins.
    best_edge, _ = max(edge_scores.items(), key=lambda item: item[1])
    return best_edge
# Set up girvan_newman on the product graph with most_central_edge as the edge selector;
# the result is consumed later with next().
gn_comm = girvan_newman(main_graph, most_valuable_edge=most_central_edge)

In [17]:
# Take the next community split produced by the girvan newman algorithm,
# storing each community as a sorted list of its nodes.
# Note: next() advances gn_comm, so re-running this cell yields a later split.
first_iteration_comm = tuple(map(sorted, next(gn_comm)))

# Index the communities as a dictionary with
# key -> communityId
# value -> list of nodes in the community
gn_dict = {
    community_id: members
    for community_id, members in enumerate(first_iteration_comm)
}

In [20]:
# Invert gn_dict into a dictionary with
# key -> product
# value -> Id of the community the product belongs to
gn_product_comm_dict = {
    member: community_id
    for community_id, members in gn_dict.items()
    for member in members
}

# Run pagerank inside each community and build a dictionary where
# key -> communityId
# value -> list of tuples (product, pagerank) sorted in descending order of pagerank
gn_community_pagerank_dict = {
    community_id: sorted(
        getPageRankOfCommunity(main_graph, members).items(),
        key=lambda kv: kv[1],
        reverse=True,
    )
    for community_id, members in gn_dict.items()
}
# print(gn_community_pagerank_dict)

In [21]:
# Write one line of recommendations per held-out review.
# test_df can contain the same product many times, so the recommendations for each
# product are computed once and reused.
recommendation_cache = {}
# Explicit UTF-8 so product titles with non-ASCII characters are written the same way on every platform.
with open('../data/software/recommendations_gn.txt', 'w', encoding='utf-8') as f:
    for product_id, product_title in zip(test_df['product_id'], test_df['product_title']):
        if product_id not in recommendation_cache:
            recommendations = getProductRecommendations(product_id, gn_product_comm_dict, gn_community_pagerank_dict)
            recommendation_cache[product_id] = getRecommendedProductNames(recommendations, sf_df)
        rec_products = recommendation_cache[product_id]
        f.write(f"Recommended items for Product: {product_title} - {product_id} are {rec_products} \n")