# In [1]:
import pandas as pd
import numpy as np
import os
from Data_Handler.DataReader import DataReader
from collections import Counter

# In [2]:
# Build "Dataset/binary_impressions_ICM.csv": for every ItemID in the
# interactions file, write one "ItemID,FeatureID,1" row per distinct item ID
# found in the Impressions strings recorded for that ItemID (excluding the
# item itself).

OUTPUT_PATH = "Dataset/binary_impressions_ICM.csv"


def parse_impressions(joined):
    """Parse a comma-separated string of item IDs into a list of ints.

    Args:
        joined (str): item IDs separated by commas, e.g. "12,7,12,".
    Returns:
        list[int]: the IDs in their original order. Empty tokens (such as the
        one produced by a trailing comma) are skipped.
    Raises:
        ValueError: if a non-empty token is not an integer literal.
    """
    return [int(token) for token in joined.split(",") if token.strip()]


def distinct_co_impressions(item, joined):
    """Return the distinct item IDs listed in ``joined``, excluding ``item``.

    Args:
        item (int): the ItemID the impressions belong to; it is dropped from
            the result.
        joined (str): comma-separated impressions collected for ``item``.
    Returns:
        list[int]: each remaining ID exactly once, ordered by the position of
        its last occurrence in ``joined``.
    """
    others = [shown for shown in parse_impressions(joined) if shown != item]
    # dict.fromkeys de-duplicates in one pass. Walking the list backwards and
    # flipping the result keeps the *last* occurrence of every ID, which is
    # the order the previous remove()-based loop produced when it worked
    # (it could leave duplicates behind because it mutated the list while
    # iterating over it).
    return list(dict.fromkeys(reversed(others)))[::-1]


# The guard still runs when this is executed as a notebook cell or a script
# (where __name__ == "__main__") but lets the helpers above be imported
# without touching the filesystem.
if __name__ == "__main__":
    source_path = os.getenv('INTERACTIONS_AND_IMPRESSIONS_PATH')
    if source_path is None:
        raise RuntimeError(
            "Environment variable INTERACTIONS_AND_IMPRESSIONS_PATH is not set")

    df = pd.read_csv(
        filepath_or_buffer=source_path,
        sep=',',
        names=['UserID', 'ItemID', 'Impressions', 'Data'],
        header=0,
        # `object` replaces the alias np.object0, which NumPy 2.0 removed.
        dtype={'UserID': np.int32, 'ItemID': np.int32,
               'Impressions': object, 'Data': np.int32},
    )

    # Taken before dropna(), so items whose rows all lack impressions are
    # still iterated below (and simply produce no output rows).
    items = df['ItemID'].unique()

    df = df.drop(['UserID', 'Data'], axis=1).dropna()
    # Terminate every impression string with a comma so that concatenating
    # the strings of one item yields a single well-formed list.
    df['Impressions'] = df['Impressions'].astype(str) + ','

    # GroupBy.sum concatenates the strings of each item. Calling it directly
    # avoids `.apply(sum)`, which relied on pandas rewriting the builtin.
    impressions_per_item = df.groupby(
        ['ItemID'], as_index=False)['Impressions'].sum()

    # One dictionary lookup per item instead of a boolean-mask scan of the
    # whole frame on every iteration.
    joined_by_item = dict(zip(impressions_per_item['ItemID'].tolist(),
                              impressions_per_item['Impressions'].tolist()))

    # The output is opened only after the input was read successfully, and
    # the context manager guarantees it is flushed and closed.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.write("ItemID,FeatureID,Data\n")
        for item in items.tolist():
            joined = joined_by_item.get(item)
            if joined is None:
                continue
            # Binary matrix: every (item, co-impressed item) pair gets value 1.
            f.writelines(
                f"{item},{feature},1\n"
                for feature in distinct_co_impressions(item, joined))