In [5]:
from enum import Enum
from typing import Dict, List
import sys
from neo4j import Transaction

from util.fixed_heapq import FixedHeap
from util.sparse_vector import cosine_similarity
from util.graphdb_base import GraphDBBase

In [1]:
from heapq import heappush, heappushpop, heappop


class FixedHeap(object):
    def __init__(self, capacity: int):
        self.heap = []
        self.capacity = capacity
        self.counter = 0

    def push(self, score, item):
        self.counter += 1
        # store items as max heap, removing the largest as capacity get reached
        if len(self.heap) < self.capacity:
            heappush(self.heap, (-score, -self.counter, item))
        else:
            heappushpop(self.heap, (-score, -self.counter, item))

    def items(self):
        return list(reversed([heappop(self.heap)[2] for i in range(len(self.heap))]))


In [2]:
from neo4j import GraphDatabase
import configparser
import os
import sys
import getopt

help_message = '-u <neo4j username> -p <password> -s <source directory> -b <bolt uri>'

neo4j_user = 'neo4j'
neo4j_password = 'password'
source_dataset_path = ''
uri = 'bolt://localhost:7687'


class GraphDBBase():
    def __init__(self, command=None, argv=None, extended_options='', extended_long_options=[]):
        self.uri = None
        self.neo4j_user = None
        self.neo4j_password = None
        self.source_dataset_path = None
        self.opts = {}
        self.args = []

        if argv:
            self.__get_main_parameters__(command=command, argv=argv, extended_options=extended_options,
                                         extended_long_options=extended_long_options)

        config = configparser.ConfigParser()
        config_file = os.path.join(os.path.dirname(__file__), '..', 'config.ini')
        config.read(config_file)
        neo4j_params = config['neo4j']

        uri = self.uri or os.getenv('NEO4J_URI') or neo4j_params.get('uri', 'bolt://localhost:7687')
        user = self.neo4j_user or os.getenv('NEO4J_USER') or neo4j_params.get('user', 'neo4j')
        password = self.neo4j_password or os.getenv('NEO4J_PASSWORD') or neo4j_params.get('password', 'password')
        ignored_params = {'uri', 'user', 'password'}
        param_converters = {'encrypted': lambda x: int(x)}

        def maybe_convert(key: str, value: str):
            if key in param_converters:
                return param_converters[key](value)
            return value

        other_params = dict([(key, maybe_convert(key, value)) for key, value in neo4j_params.items()
                             if key not in ignored_params])
        # print(other_params)

        self._driver = GraphDatabase.driver(uri, auth=(user, password), **other_params)
        self._session = None

    def get_opts(self):
        return self.opts

    def get_option(self, options: list, default = None):
        for opt, arg in self.opts:
            if opt in options:
                return arg

        return default

    def close(self):
        self._driver.close()

    def get_session(self):
        return self._driver.session()

    def execute_without_exception(self, query: str):
        try:
            self.get_session().run(query)
        except Exception as e:
            pass

    def executeNoException(self, session, query: str):
        try:
            session.run(query)
        except Exception as e:
            pass

    def __get_main_parameters__(self, command, argv, extended_options='', extended_long_options=[]):
        try:
            self.opts, self.args = getopt.getopt(argv, 'hu:p:s:b:' + extended_options,
                                       ['help', 'neo4j-user=', 'neo4j-password=', 'source-path=',
                                        'bolt='] + extended_long_options)
        except getopt.GetoptError as e:
            print(e)
            print(command, help_message)
            sys.exit(2)
        for opt, arg in self.opts:
            if opt == '-h':
                print(command, help_message)
                sys.exit()
            elif opt in ("-u", "--neo4j-user"):
                self.neo4j_user = arg
            elif opt in ("-p", "--neo4j-password"):
                self.neo4j_password = arg
            elif opt in ("-s", "--source-path"):
                self.source_dataset_path = arg
            elif opt in ("-b", "--bolt"):
                self.uri = arg


In [3]:
import math

def convert_sparse_vector(numbers):
    vector_dict = {}
    for k, c in enumerate(numbers):
        if c:
            vector_dict[k] = c
    return vector_dict

def cosine_similarity(vectA, vectB):
    a = dot(vectA, vectB);
    b = norm(vectA) * norm(vectB);
    if b > 0:
        return a / b;
    else:
        return 0


def dot(vect_a, vect_b):
    if vect_a is None \
            or vect_b is None\
            or len(vect_a) == 0 \
            or len(vect_b) == 0:
        return 0

    dot_value = 0.0
    x_index = 0
    y_index = 0

    while True:
        if vect_a[x_index] == vect_b[y_index]:
            dot_value += 1
            x_index += 1
            y_index += 1
        elif vect_a[x_index] > vect_b[y_index]:
            y_index += 1
        else:
            x_index += 1

        if x_index == len(vect_a) or y_index == len(vect_b):
            break

    return dot_value


def norm(vect):
    return math.sqrt(dot(vect, vect))


if __name__ == '__main__':
    print(convert_sparse_vector([1,0,0,1,0,0])) #{0: 1, 3: 1}
    print(convert_sparse_vector([1, 1, 0, 0, 0, 0])) #{0: 1, 1: 1}
    print(convert_sparse_vector([1, 1, 0, 0, 0, 1])) #{0: 1, 1: 1, 5: 1}

{0: 1, 3: 1}
{0: 1, 1: 1}
{0: 1, 1: 1, 5: 1}


In [6]:
class BaseRecommender(GraphDBBase):
    label = None
    property = None
    sparse_vector_query = None
    score_query = None

    def __init__(self, argv):
        super().__init__(command=__file__, argv=argv)

    def compute_and_store_KNN(self, size: int) -> None:
        print("fetching vectors")
        vectors = self.get_vectors()
        print(f"computing KNN for {len(vectors)} vectors")
        for i, (key, vector) in enumerate(vectors.items()):
            # index only vectors
            vector = sorted(vector.keys())
            knn = FixedHeap(size)
            for (other_key, other_vector) in vectors.items():
                if key != other_key:
                    # index only vectors
                    other_vector = sorted(other_vector.keys())
                    score = cosine_similarity(vector, other_vector)
                    if score > 0:
                        knn.push(score, {"secondNode": other_key, "similarity": score})
            self.store_KNN(key, knn.items())
            if (i % 1000 == 0) and i > 0:
                print(f"{i} vectors processed...")
        print("KNN computation done")

    def get_vectors(self) -> Dict:
        with self._driver.session() as session:
            tx = session.begin_transaction()
            ids = self.get_elements(tx)
            vectors = {id_: self.get_sparse_vector(tx, id_) for id_ in ids}
        return vectors

    def get_elements(self, tx) -> List[str]:
        query = f"MATCH (u:{self.label}) RETURN u.{self.property} as id"
        result = tx.run(query).value()
        return result

    def get_sparse_vector(self, tx: Transaction, current_id: str) -> Dict[int, float]:
        params = {"id": current_id}
        result = tx.run(self.sparse_vector_query, params)
        return dict(result.values())

    def store_KNN(self, key: str, sims: List[Dict]) -> None:
        deleteQuery = f"""
            MATCH (n:{self.label})-[s:SIMILARITY]->()
            WHERE n.{self.property} = $id
            DELETE s"""

        query = f"""
            MATCH (n:{self.label}) 
            WHERE n.{self.property} = $id 
            UNWIND $sims as sim
            MATCH (o:{self.label}) 
            WHERE o.{self.property} = sim.secondNode 
            CREATE (n)-[s:SIMILARITY {{ value: toFloat(sim.similarity) }}]->(o)"""

        with self._driver.session() as session:
            tx = session.begin_transaction()
            params = {
                "id": key,
                "sims": sims}
            tx.run(deleteQuery, params)
            tx.run(query, params)
            tx.commit()

    def get_recommendations(self, user_id: str, size: int) -> List[int]:
        not_seen_yet_items = self.get_not_seen_yet_items(user_id)
        recommendations = FixedHeap(size)
        for item in not_seen_yet_items:
            score = self.get_score(user_id, item)
            recommendations.push(score, item)
        return recommendations.items()

    def get_not_seen_yet_items(self, user_id: str) -> List[int]:
        query = """
                MATCH (user:User {userId:$userId})
                WITH user
                MATCH (item:Item)
                WHERE NOT EXISTS((user)-[:PURCHASES]->(item))
                return item.itemId
        """
        with self._driver.session() as session:
            tx = session.begin_transaction()
            params = {"userId": user_id}
            result = tx.run(query, params).value()
        return result

    def get_score(self, user_id: str, item_id: str) -> float:
        with self._driver.session() as session:
            tx = session.begin_transaction()
            params = {"userId": user_id, "itemId": item_id}
            result = tx.run(self.score_query, params)
            result = result.value() + [0.0]
        return result[0]


class UserRecommender(BaseRecommender):
    label = "User"
    property = "userId"
    sparse_vector_query = """
        MATCH (u:User {userId: $id})-[:PURCHASES]->(i:Item)
        return id(i) as index, 1.0 as value
        order by index
    """
    score_query = """
        MATCH (user:User)-[:SIMILARITY]->(otherUser:User)
        WHERE user.userId = $userId
        WITH otherUser, count(otherUser) as size
        MATCH (otherUser)-[r:PURCHASES]->(target:Target)
        WHERE target.itemId = $itemId
        return (+1.0/size)*count(r) as score
    """

    def __init__(self, argv):
        super().__init__(argv=argv)


class ItemRecommender(BaseRecommender):
    label = "User"
    property = "userId"
    sparse_vector_query = """
        MATCH (u:User )-[:PURCHASES]->(i:Item {itemId: $id})
        return id(u) as index, 1.0 as value
        order by index
    """
    score_query = """
        MATCH (user:User)-[:PURCHASES]->(item:Item)-[r:SIMILARITY]->(target:Item)
        WHERE user.userId = $userId AND target.itemId = $itemId
        return sum(r.value) as score
    """

    def __init__(self, argv):
        super().__init__(argv=argv)


class Recommender(GraphDBBase):
    class KNNType(Enum):
        USER = 1
        ITEM = 2

    def __init__(self, argv):
        super().__init__(command=__file__, argv=argv)
        self.strategies: Dict[Recommender.KNNType, BaseRecommender] = {
            Recommender.KNNType.USER: UserRecommender(argv),
            Recommender.KNNType.ITEM: ItemRecommender(argv)
        }

    def compute_and_store_KNN(self, type_: KNNType) -> None:
        strategy = self.strategies[type_]
        strategy.compute_and_store_KNN(20)

    def clean_KNN(self):
        print("cleaning previously computed KNNs")
        delete_query = "MATCH p=()-[r:SIMILARITY]->() DELETE r"
        with self._driver.session() as session:
            tx = session.begin_transaction()
            tx.run(delete_query)
            tx.commit()

    def get_recommendations(self, user_id: str, size: int, type_: KNNType):
        strategy = self.strategies[type_]
        return strategy.get_recommendations(user_id, size)



In [8]:

def main():
    # TODO: pass the user ID in the command-line
    recommender = Recommender(sys.argv[1:])
    recommender.clean_KNN()
    recommender.compute_and_store_KNN(recommender.KNNType.USER)
    user_id = "121688"
    print(f"User-based recommendations for user {user_id}")
    recommendations = recommender.get_recommendations(user_id, 10, recommender.KNNType.USER)
    print(recommendations)
    recommender.clean_KNN()
    recommender.compute_and_store_KNN(recommender.KNNType.ITEM)
    user_id = "121688"
    print(f"Item-based recommendations for user {user_id}")
    recommendations = recommender.get_recommendations(user_id, 10, recommender.KNNType.ITEM)
    print(recommendations)

main()

NameError: name '__file__' is not defined

In [None]:
import csv
import time
import os
import sys


from util.graphdb_base import GraphDBBase
from util.string_util import strip


class RetailRocketImporter(GraphDBBase):

    def __init__(self, argv):
        super().__init__(command=__file__, argv=argv)

    def import_user_item(self, file):
        with open(file, 'r+') as in_file:
            reader = csv.reader(in_file, delimiter=',')
            next(reader, None)
            with self._driver.session() as session:
                self.execute_without_exception("CREATE CONSTRAINT ON (u:User) ASSERT u.userId IS UNIQUE")
                self.execute_without_exception("CREATE CONSTRAINT ON (u:Item) ASSERT u.itemId IS UNIQUE")

                tx = session.begin_transaction()
                i = 0
                j = 0
                query = """
                    MERGE (item:Item {itemId: $itemId})
                    MERGE (user:User {userId: $userId})
                    MERGE (user)-[:PURCHASES { timestamp: $timestamp}]->(item)
                """
                for row in reader:
                    try:
                        if row:
                            timestamp = strip(row[0])
                            user_id = strip(row[1])
                            event_type = strip(row[2])
                            item_id = strip(row[3])

                            if event_type == "transaction":
                                tx.run(query, {"itemId": item_id, "userId": user_id, "timestamp": timestamp})
                                i += 1
                                j += 1
                                if i == 1000:
                                    tx.commit()
                                    print(j, "lines processed")
                                    i = 0
                                    tx = session.begin_transaction()
                    except Exception as e:
                        print(e, row, reader.line_num)
                tx.commit()
                print(j, "lines processed")


if __name__ == '__main__':
    start = time.time()
    importing = RetailRocketImporter(argv=sys.argv[1:])
    base_path = importing.source_dataset_path
    if not base_path:
        base_path = "" #"../../../dataset/retailrocket/"
    file_path = os.path.join(base_path, "events.csv")
    importing.import_user_item(file=file_path)
    end = time.time() - start
    print("Time to complete:", end)


In [9]:
import pandas as pd

In [15]:
os.getcwd()

'C:\\Users\\bgran\\OneDrive\\Pulpit\\Thirdroom\\gpml\\ch05\\recommendation\\collaborative_filtering'

In [17]:
df = pd.read_csv('bookmarks.csv')

In [32]:
df

Unnamed: 0,umeta_id,user_id,meta_key,meta_value
0,770,1,bookmarked_projects,a:14:{i:0;i:218;i:1;i:224;i:2;i:494;i:3;i:259;...
1,771,34,bookmarked_projects,a:49:{i:0;i:224;i:1;i:218;i:2;i:216;i:4;i:259;...
2,795,32,bookmarked_projects,a:186:{i:0;i:241;i:2;i:609;i:4;i:668;i:5;i:772...
3,868,35,bookmarked_projects,a:1:{i:0;i:784;}
4,878,46,bookmarked_projects,a:32:{i:0;i:1150;i:1;i:1204;i:2;i:623;i:3;i:28...
...,...,...,...,...
480,26033,1188,bookmarked_projects,a:1:{i:0;i:87473;}
481,26034,1302,bookmarked_projects,a:1:{i:0;i:87501;}
482,26043,1172,bookmarked_projects,a:1:{i:0;i:87414;}
483,26070,1196,bookmarked_projects,a:0:{}


In [33]:
df.meta_value[483]

'a:0:{}'

In [30]:
import re

In [36]:
if re.findall(r'(?<=i:)\d+', df.meta_value[0]):
    print("a")

a


In [49]:
df_res = pd.DataFrame(columns = ['user_id', 'bookmark'])
for index, row in df.iterrows():
    if re.findall(r'(?<=i:)\d+', row['meta_value']):
        df_res = pd.concat([df_res, pd.DataFrame({
            'user_id': row['user_id'], 
            'bookmark': re.findall(r'(?<=i:)\d+', row['meta_value'])
        })])

In [56]:
df_res[df_res.user_id == 824]

Unnamed: 0,user_id,bookmark
1,824,71655
3,824,72451
5,824,67257
7,824,71333
9,824,71979
11,824,68078
13,824,66721
15,824,69536
17,824,67044
19,824,66727


In [53]:
df_res = df_res.iloc[1::2, :]

In [54]:
df_res.to_csv('bookmarks_clean.csv')