In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import networkx
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Project root is taken to be the parent of the current working directory;
# all input files are looked up under its "data" sub-folder.
PROJECT_DIR = Path.cwd().parent
DATA_DIR = PROJECT_DIR.joinpath("data")

In [17]:
class ProcessData():
    """Stateless helpers for loading interaction data and comparing texts.

    The class keeps no instance attributes; the methods are only grouped
    here so they can be reached through a single `processor` object.
    """

    def __init__(self):
        pass

    def read_file(self, file: str, file_type: str):
        """Read a tabular file into a pandas DataFrame.

        Args:
            file: Location of the file; handed unchanged to pandas.
            file_type: Either "excel" or "csv".

        Returns:
            The DataFrame returned by `pd.read_excel` or `pd.read_csv`.

        Raises:
            ValueError: If `file_type` is neither "excel" nor "csv".
        """
        if file_type == "excel":
            return pd.read_excel(file)
        if file_type == "csv":
            return pd.read_csv(file)
        # Previously an unknown type fell through to `return df` and failed
        # with an UnboundLocalError; fail with an explicit message instead.
        raise ValueError(
            "Unsupported file_type {!r}; expected 'excel' or 'csv'.".format(file_type)
        )

    def get_user_and_items(self, dataframe, user_col: str, item_col: str):
        """Collect, for every user, the list of items found in `dataframe`.

        Args:
            dataframe: DataFrame holding user-item interaction rows.
            user_col: Name of the column identifying the user.
            item_col: Name of the column identifying the item.

        Returns:
            A dict mapping each distinct `user_col` value to the list of
            `item_col` values of that user's rows (row order preserved,
            duplicates kept).
        """
        # Use the `dataframe` argument; the earlier version read the
        # notebook-global `df` and silently ignored what the caller passed.
        grouped = dataframe.loc[:, [user_col, item_col]].groupby(by=user_col)
        return {name: group.loc[:, item_col].to_list() for name, group in grouped}

    def transform_text_data(self, corpus: list):
        """Fit a count + tf-idf pipeline on `corpus` and return its vectors.

        Args:
            corpus: List of text documents (one string per document).

        Returns:
            The result of `Pipeline.fit_transform(corpus)`; one row per
            document.
        """
        # NOTE(review): strip_accents="unicode" presumably folds letters such
        # as ş/ğ/ç/ö/ü to ASCII before tokenizing, which would make the
        # Turkish letters in token_pattern redundant — confirm.
        pipe = Pipeline([
                    ("count", CountVectorizer(
                                ngram_range=(1,1), 
                                token_pattern=r"[a-zA-Z0-9ıIiİğĞçÇöÖüÜşŞ]+", 
                                strip_accents="unicode",
                                lowercase=True)),
                    ("tfidf", TfidfTransformer())
                ])
        
        return pipe.fit_transform(corpus)

    def compute_similarity(self, first: list, second: list):
        """Return the cosine similarity between two 1-D numeric vectors.

        Args:
            first: Sequence or array of numbers.
            second: Sequence or array of numbers, same length as `first`.

        Returns:
            dot(first, second) / (||first|| * ||second||). When either
            vector has zero norm the similarity is undefined; 0.0 is
            returned instead of NaN.
        """
        denominator = np.linalg.norm(first) * np.linalg.norm(second)
        if denominator == 0:
            # Avoid the 0/0 division (NaN plus a RuntimeWarning).
            return 0.0
        return np.dot(first, second) / denominator

processor = ProcessData()

In [5]:
# Load the interaction dataset (an Excel workbook stored under DATA_DIR).
file_name = "generic_rec_dataset_Ekim_New_data.xlsx"
file_path = DATA_DIR.joinpath(file_name)
df = processor.read_file(file=file_path, file_type="excel")


In [97]:
pd.set_option("display.max_colwidth", 500)
# Scratch (disabled): rank users by the number of ITEM rows they have.
# df.loc[:, ["USER", "ITEM"]].groupby(by="USER").count().sort_values(by="ITEM", ascending=False).reset_index()

# Distinct user and item identifiers present in the loaded table.
user_ids = df["USER"].unique()
item_ids = df["ITEM"].unique()
summary_template = "Number of Users: {users}\nNumber of items: {items}"
print(summary_template.format(users=len(user_ids), items=len(item_ids)))

Number of Users: 97
Number of items: 465


In [7]:
# Map every USER value to the list of ITEM values on that user's rows.
user_and_items = processor.get_user_and_items(
    dataframe=df,
    user_col="USER",
    item_col="ITEM",
)

In [21]:
# Small hand-written sample of Turkish titles used to try out the text
# vectorizer. Several entries mix dotted/dotless I and unusual casing
# ('vİdeo', 'KiŞisel', 'IĞDIR') — presumably to exercise lowercasing and
# accent handling; confirm intent.
corpus = [
    'İş Modelinizi Yeniden Düşünün',
    'Dijital Dünyada Müşteri Merkezinde Kalmak',
    'Dijital Dönüşümün Sektörler Üzerinde Etkileri - Video',
    'Dijital Dönüşümün Kurumlara Etkisi - Okuma Materyali - vİdeo',
    'KVKK Uyum Programı İçin Hazır Mısınız? için',
    'Detaylarıyla KiŞisel Verilerin Korunması Kanunu',
    'İK Departmanları İçin Kişisel Verilerin Korunması Kanunu',
    'İSG 360° Eğitim Programı - 6331 Sayılı İş Sağlığı ve Güvenliği Kanunu IĞDIR',
]
# Scratch (disabled): inspect the raw CountVectorizer vocabulary for the
# sample above, using the same settings as ProcessData.transform_text_data.
# vec = CountVectorizer(ngram_range=(1,1), token_pattern=r"[a-zA-Z0-9ıIiİğĞçÇöÖüÜşŞ]+", strip_accents="unicode", lowercase=True)
# x = vec.fit_transform(corpus)
# vec.get_feature_names()
# vec.vocabulary_

In [23]:
# Vectorize the sample corpus, then compare documents 2 and 3 (the two
# 'Dijital Dönüşümün ...' titles).
y = processor.transform_text_data(corpus)
dense_vectors = y.toarray()  # densify once instead of once per operand
similarity_score = processor.compute_similarity(dense_vectors[2], dense_vectors[3])
similarity_score

0.35668304960107816

In [102]:
from itertools import permutations, combinations

# Exploratory check: build every unordered pair of items for ONE user
# (the first entry of user_and_items) and look at a sample pair.
item_pairs = []
user_pairs = []  # declared for later use; not filled in this cell
for items in user_and_items.values():
    # combinations() yields each unordered pair once; extend() consumes the
    # iterator directly, no intermediate list needed.
    item_pairs.extend(combinations(items, 2))
    break  # only the first user is inspected here

# Print one sample pair. The former loop also evaluated a bare name `first`
# that is defined nowhere in the notebook, so the cell raised NameError on a
# fresh kernel; that stray expression is removed.
if item_pairs:
    print(item_pairs[0])
len(item_pairs)

(3779, 3780)


903

In [160]:
# Toy edge list as (u, v, attributes) triples. It deliberately contains
# repeated and reversed pairs, e.g. (1, 2) / (2, 1).
a = [
    (1, 2, {"weight": 0.14}),
    (2, 1, {"weight": 0.14}),
    (1, 2, {"weight": 0.14}),
    (1, 3, {"weight": 0.44}),
    (3, 1, {"weight": 0.44}),
    (2, 3, {"weight": 0.90}),
    (1, 4, {"weight": 0.90}),
    (2, 4, {"weight": 0.20}),
]

G = networkx.Graph()
G.add_edges_from(a)
# G.edges.data('weight')
# NOTE(review): no `weight` argument is passed, so this presumably minimizes
# hop count rather than total edge weight — confirm that is intended.
networkx.shortest_path(G, source=1, target=4)

[1, 4]

In [151]:
# Display the edges currently stored in G.
G.edges

EdgeView([(1, 2), (1, 3), (1, 4), (2, 3), (2, 4)])