In [88]:
import scipy.sparse as sps
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from Evaluation.Evaluator import EvaluatorHoldout
import scipy.sparse as sps
from collections import Counter
from tqdm import tqdm
# imports for .env usage
import os
from dotenv import load_dotenv
load_dotenv()
from Data_Handler.DataReader import DataReader

In [89]:
def dataframe_to_csr(dataframe,row_name,col_name,cell_name):
        """This method converts a dataframe object into a csr

        Args:
            dataframe (dataframe)
            row_name (str): For example, "UserID"
            col_name (str): For example, "ItemID"
            cell_name (str): For example, "Data"
        Returns:
            csr
        """
        rows = dataframe[row_name].unique()
        columns = dataframe[col_name].unique()

        shape = (len(rows), len(columns))

        # Create indices for users and items
        row_cat = CategoricalDtype(categories=sorted(rows), ordered=True)
        col_cat = CategoricalDtype(categories=sorted(columns), ordered=True)
        row_index = dataframe[row_name].astype(row_cat).cat.codes
        col_index = dataframe[col_name].astype(col_cat).cat.codes

        # Conversion via COO matrix
        coo = sps.coo_matrix(
            (dataframe[cell_name], (row_index.values, col_index.values)), shape=shape)
        csr = coo.tocsr()
        return csr

In [90]:
interactions_and_impressions = pd.read_csv(filepath_or_buffer=os.getenv('INTERACTIONS_AND_IMPRESSIONS_PATH'),
                                            sep=',',
                                            names=[
    'UserID', 'ItemID', 'Impressions', 'Data'],
    header=0,
    dtype={'UserID': np.int32, 'ItemID': np.int32, 'Impressions': np.object0, 'Data': np.int32})
interactions = interactions_and_impressions.drop(['Impressions'], axis=1)
items = interactions['ItemID'].unique()
data = interactions['Data'].unique()
users = interactions['UserID'].unique()
print(items.shape)
print(data.shape)
print(users.shape)
interactions = interactions.replace({'Data': {0: 1}})
interactions = interactions.drop_duplicates(keep='first')
urm = dataframe_to_csr(interactions,'UserID','ItemID','Data')
interactions

(24507,)
(2,)
(41629,)


Unnamed: 0,UserID,ItemID,Data
0,0,11,1
1,0,21,1
13,0,22,1
28,0,24,1
29,0,44,1
...,...,...,...
5826501,41628,20448,1
5826502,41628,20896,1
5826503,41628,21506,1
5826504,41628,22882,1


In [91]:
print(interactions.ItemID.unique().size)
print(interactions.UserID.unique().size)

24507
41629


In [92]:
URM_train, URM_validation = split_train_in_two_percentage_global_sample(urm, train_percentage = 0.5)



In [93]:
print(URM_train.get_shape)

<bound method spmatrix.get_shape of <41629x24507 sparse matrix of type '<class 'numpy.float64'>'
	with 777320 stored elements in Compressed Sparse Row format>>


In [94]:
data_icm_type = pd.read_csv(filepath_or_buffer=os.getenv('DATA_ICM_TYPE_PATH'),
                            sep=',',
                            names=[
    'item_id', 'feature_id', 'data'],
    header=0,
    dtype={'item_id': np.int32, 'feature_id': np.int32, 'data': np.int32})

features = data_icm_type['feature_id'].unique()
items = data_icm_type['item_id'].unique()
shape = (len(items), len(features))


# Create indices for users and items
features_cat = CategoricalDtype(categories=sorted(features), ordered=True)
item_cat = CategoricalDtype(categories=sorted(items), ordered=True)
features_index = data_icm_type["feature_id"].astype(features_cat).cat.codes
item_index = data_icm_type["item_id"].astype(item_cat).cat.codes
coo = sps.coo_matrix(
    (data_icm_type["data"], (item_index.values, features_index.values)), shape=shape)
ICM = coo.tocsr()


In [95]:
print(ICM)

  (0, 0)	1
  (1, 2)	1
  (2, 3)	1
  (3, 0)	1
  (4, 2)	1
  (5, 1)	1
  (6, 0)	1
  (7, 2)	1
  (8, 0)	1
  (9, 0)	1
  (10, 0)	1
  (11, 0)	1
  (12, 2)	1
  (13, 0)	1
  (14, 0)	1
  (15, 2)	1
  (16, 0)	1
  (17, 0)	1
  (18, 3)	1
  (19, 3)	1
  (20, 1)	1
  (21, 2)	1
  (22, 0)	1
  (23, 0)	1
  (24, 3)	1
  :	:
  (23066, 3)	1
  (23067, 2)	1
  (23068, 0)	1
  (23069, 2)	1
  (23070, 2)	1
  (23071, 0)	1
  (23072, 0)	1
  (23073, 0)	1
  (23074, 0)	1
  (23075, 3)	1
  (23076, 0)	1
  (23077, 0)	1
  (23078, 3)	1
  (23079, 0)	1
  (23080, 2)	1
  (23081, 0)	1
  (23082, 2)	1
  (23083, 3)	1
  (23084, 0)	1
  (23085, 3)	1
  (23086, 0)	1
  (23087, 1)	1
  (23088, 0)	1
  (23089, 0)	1
  (23090, 3)	1


In [96]:
diff = np.setdiff1d(interactions['ItemID'].unique(), data_icm_type['item_id'].unique())
print(diff)

[    9    14    23 ... 24486 24492 24499]


In [97]:
print(diff.size)

4877


In [98]:
diff_1 = np.setdiff1d(data_icm_type['item_id'].unique(), interactions['ItemID'].unique())
print(diff_1)

[24507 24508 24509 ... 27965 27966 27967]


In [99]:
print(diff_1.size)

3461


In [100]:
from Data_Handler.DataReader import DataReader
dataReader = DataReader()

In [101]:
URM_train, URM_validation = split_train_in_two_percentage_global_sample(dataReader.load_augmented_binary_urm(), train_percentage = 0.9)



In [102]:
urm=dataReader.csr_to_dataframe(URM_train)
f=dataReader.load_icm_df()
swap_list = ["feature_id", "item_id", "data"]
f = f.reindex(columns=swap_list)
f = f.rename({'feature_id': 'UserID', 'item_id': 'ItemID', 'data': 'Data'}, axis=1)

urm['Data'] = 0.825 * urm['Data']
# f times (1-aplha)
f['Data'] = 0.175 * f['Data']
# Change UserIDs of f matrix in order to make recommender work
f['UserID'] = 41634 + f['UserID']

powerful_urm = pd.concat([urm, f], ignore_index=True).sort_values(['UserID', 'ItemID'])
powerful_urm = dataReader.dataframe_to_csr(powerful_urm,'UserID', 'ItemID','Data')
print(powerful_urm)

  (0, 11)	0.825
  (0, 21)	0.825
  (0, 22)	0.825
  (0, 24)	0.825
  (0, 54)	0.825
  (0, 124)	0.825
  (0, 239)	0.825
  (0, 575)	0.825
  (0, 751)	0.825
  (0, 987)	0.825
  (0, 1326)	0.825
  (0, 1715)	0.825
  (0, 2218)	0.825
  (0, 2256)	0.825
  (0, 2257)	0.825
  (0, 2292)	0.825
  (0, 2730)	0.825
  (0, 4007)	0.825
  (0, 4047)	0.825
  (0, 4383)	0.825
  (0, 5336)	0.825
  (0, 5735)	0.825
  (0, 5752)	0.825
  (0, 6351)	0.825
  (0, 7301)	0.825
  :	:
  (41632, 27866)	0.175
  (41632, 27870)	0.175
  (41632, 27873)	0.175
  (41632, 27877)	0.175
  (41632, 27880)	0.175
  (41632, 27883)	0.175
  (41632, 27889)	0.175
  (41632, 27896)	0.175
  (41632, 27900)	0.175
  (41632, 27907)	0.175
  (41632, 27911)	0.175
  (41632, 27917)	0.175
  (41632, 27922)	0.175
  (41632, 27942)	0.175
  (41632, 27943)	0.175
  (41632, 27952)	0.175
  (41632, 27955)	0.175
  (41632, 27960)	0.175
  (41632, 27962)	0.175
  (41632, 27967)	0.175
  (41633, 97)	0.175
  (41633, 14789)	0.175
  (41633, 16522)	0.175
  (41633, 20523)	0.175
  (41633, 

In [103]:
pad_items_ids = np.setdiff1d(dataReader.load_augmented_binary_urm_df()['ItemID'].unique(), dataReader.load_icm_df()['item_id'].unique())
print(pad_items_ids.size)
feature_ids = dataReader.load_icm_df()['feature_id'].unique()
print(feature_ids)

col = []
for feature_id in feature_ids:
    for item_id in range(len(pad_items_ids)):
        col.append(feature_id)
col = np.array(col,dtype=np.int32)

row=[]
for item_id in pad_items_ids:
    for feature_id in range(len(feature_ids)):
        row.append(item_id)
row=np.array(row,dtype=np.int32)

'''data = np.zeros((row.size),dtype=np.int32)
print(data.size)

print(np.unique(row))
print(np.unique(col))

padICM = sps.csr_matrix((data,(row,col)))
print(padICM)

print("icm",dataReader.load_icm().get_shape())
print(dataReader.load_icm().dtype.name)

print("pad",padICM.get_shape())
print(padICM.dtype.name)
paddedICM = sps.vstack([dataReader.load_icm(), padICM])
#result=sps.vstack(URM_train, paddedICM.T)'''

4877
[1 3 4 2 7]


'data = np.zeros((row.size),dtype=np.int32)\nprint(data.size)\n\nprint(np.unique(row))\nprint(np.unique(col))\n\npadICM = sps.csr_matrix((data,(row,col)))\nprint(padICM)\n\nprint("icm",dataReader.load_icm().get_shape())\nprint(dataReader.load_icm().dtype.name)\n\nprint("pad",padICM.get_shape())\nprint(padICM.dtype.name)\npaddedICM = sps.vstack([dataReader.load_icm(), padICM])\n#result=sps.vstack(URM_train, paddedICM.T)'

In [104]:
dataReader = DataReader()


In [128]:
urm=dataReader.csr_to_dataframe(URM_train)
icm=dataReader.load_icm_df()
DiffURM_ICM = np.setdiff1d(urm['ItemID'].unique(), icm['item_id'].unique())
DiffICM_URM = np.setdiff1d( icm['item_id'].unique(), urm['ItemID'].unique())
print(DiffURM_ICM.size)

4877


In [133]:
for id in DiffURM_ICM:
    icm.loc[len(icm.index)] = [id, 1, 0]

In [134]:
a = icm['item_id'].unique()
print(a.size)

27968


In [135]:
sorted_icm = icm.sort_values('item_id').reset_index(drop= True)

In [136]:
sorted_icm.to_csv('/Users/francescomattioli/Desktop/newicm.csv')

In [137]:
for id in DiffICM_URM:
    urm.loc[len(urm.index)] = [1, id, 0]

In [138]:
b = urm['ItemID'].unique()
print(b.size)

27968


In [139]:
sorted_urm = urm.sort_values('UserID').reset_index(drop= True)

In [140]:
sorted_icm.to_csv('/Users/francescomattioli/Desktop/newurm.csv')

In [148]:
URM = dataReader.dataframe_to_csr(sorted_urm, 'UserID', 'ItemID', 'Data')
ICM = dataReader.dataframe_to_csr(sorted_icm, 'item_id', 'feature_id', 'data')

In [150]:
#print(URM)
print(ICM)

  (0, 0)	1
  (1, 2)	1
  (2, 3)	1
  (3, 0)	1
  (4, 2)	1
  (5, 1)	1
  (6, 0)	1
  (7, 2)	1
  (8, 0)	1
  (9, 0)	0
  (10, 0)	1
  (11, 0)	1
  (12, 0)	1
  (13, 2)	1
  (14, 0)	0
  (15, 0)	1
  (16, 0)	1
  (17, 2)	1
  (18, 0)	1
  (19, 0)	1
  (20, 3)	1
  (21, 3)	1
  (22, 1)	1
  (23, 0)	0
  (24, 0)	0
  :	:
  (27943, 3)	1
  (27944, 2)	1
  (27945, 0)	1
  (27946, 2)	1
  (27947, 2)	1
  (27948, 0)	1
  (27949, 0)	1
  (27950, 0)	1
  (27951, 0)	1
  (27952, 3)	1
  (27953, 0)	1
  (27954, 0)	1
  (27955, 3)	1
  (27956, 0)	1
  (27957, 2)	1
  (27958, 0)	1
  (27959, 2)	1
  (27960, 3)	1
  (27961, 0)	1
  (27962, 3)	1
  (27963, 0)	1
  (27964, 1)	1
  (27965, 0)	1
  (27966, 0)	1
  (27967, 3)	1
