In [14]:
import os
import numpy as np
from caserec.utils.process_data import ReadFile, ReadDataframe
import pandas as pd 

### Ratings 

In [9]:
# Location of the raw ratings file; it is parsed as tab-separated below.
train_file = './../../../../Datasets/MovieLens/100k_raw/u.data'
# Parse the file with caserec's ReadFile reader.
dict_ratings_from_file = ReadFile(train_file, sep='\t').read() 
# List the entries the reader returned.
dict_ratings_from_file.keys()

In [17]:
# Preview the first five entries produced by ReadFile.read_like_triple() for the same file.
ReadFile(train_file, sep='\t').read_like_triple()[:5] 

[(196, 242, 3.0),
 (186, 302, 3.0),
 (22, 377, 1.0),
 (244, 51, 2.0),
 (166, 346, 1.0)]

In [22]:
# Load the same file with pandas; header=None because the column labels are supplied here via `names`.
df_ratings = pd.read_csv(train_file, sep='\t', header=None, names = ['user', 'item', 'feedback_value', 'timestamp'])
df_ratings.head()

Unnamed: 0,user,item,feedback_value,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [24]:
# Same preview through caserec's DataFrame-backed reader (first ten entries).
ReadDataframe(df_ratings).read_like_triple()[:10]

[[196, 242, 3],
 [186, 302, 3],
 [22, 377, 1],
 [244, 51, 2],
 [166, 346, 1],
 [298, 474, 4],
 [115, 265, 2],
 [253, 465, 5],
 [305, 451, 3],
 [6, 86, 3]]

In [None]:
def read(df_feedback, as_binary=False):
    """
    Build the per-user / per-item feedback structures for an interaction DataFrame.

    :param df_feedback: interactions with the columns 'user_id', 'item_id' and 'feedback'
        (one row per interaction; identifiers are assumed to be non-null)
    :type df_feedback: pandas.DataFrame

    :param as_binary: if True, every feedback value is replaced by 1.0, both in the
        stored feedback and in the mean/max/min statistics
    :type as_binary: bool, default False

    :return: dictionary with the keys 'feedback' (user -> {item: value}), 'users' and
        'items' (sorted ids), 'sparsity' (percentage of empty user-item cells),
        'number_interactions', 'users_viewed_item', 'items_unobserved',
        'items_seen_by_user', 'mean_value', 'max_value' and 'min_value'
    :rtype: dict

    :raises ZeroDivisionError: if df_feedback has no rows (no users and no items)

    """
    dict_feedback = {}
    items_unobserved = {}
    items_seen_by_user = {}
    users_viewed_item = {}

    number_interactions = df_feedback.shape[0]

    # Values that are stored and summarised: the raw column, or all ones in binary mode
    feedback_values = [1.0] * number_interactions if as_binary else df_feedback['feedback']

    # One pass over the rows instead of re-filtering the whole frame for every user and item
    for user_id, item_id, value in zip(df_feedback['user_id'], df_feedback['item_id'], feedback_values):
        # A repeated (user, item) pair keeps the value of its last row
        dict_feedback.setdefault(user_id, {})[item_id] = value
        items_seen_by_user.setdefault(user_id, []).append(item_id)
        users_viewed_item.setdefault(item_id, []).append(user_id)

    list_users = sorted(df_feedback['user_id'].unique())
    list_items = sorted(df_feedback['item_id'].unique())

    # Create a dictionary with unobserved items for each user
    all_items = set(list_items)
    for user in list_users:
        items_unobserved[user] = list(all_items - set(items_seen_by_user[user]))

    # Calculate the sparsity of the set as a percentage: (1 - N / (nu * ni)) * 100
    sparsity = (1 - (number_interactions / float(len(list_users) * len(list_items)))) * 100

    dict_file = {
            'feedback': dict_feedback,
            'users': list_users,
            'items': list_items,
            'sparsity': sparsity,
            'number_interactions': number_interactions,
            'users_viewed_item': users_viewed_item,
            'items_unobserved': items_unobserved,
            'items_seen_by_user': items_seen_by_user,
            'mean_value': np.mean(feedback_values),
            'max_value': np.max(feedback_values),
            'min_value': np.min(feedback_values),
         }

    return dict_file

In [None]:
# read(df_feedback)

In [None]:
def read_like_triple(df_feedback):
    """
    Return the rows of the DataFrame as a list of lists. eg. [user, item, value]

    Rows are assembled column by column, so every cell keeps the type of its own
    column. Going through ``df_feedback.values`` instead would coerce the whole
    frame to one common dtype, turning integer ids into floats whenever the value
    column is float.

    :param df_feedback: frame whose rows are returned, in row order
    :type df_feedback: pandas.DataFrame

    :return: List with one list per row, holding that row's cells in column order
    :rtype: list

    """
    return [list(row) for row in df_feedback.itertuples(index=False, name=None)]

In [None]:
# `df_feedback` is not defined by any cell of this notebook; use the ratings frame loaded above,
# restricted to the (user, item, value) columns, and preview only the first rows.
read_like_triple(df_ratings[['user', 'item', 'feedback_value']])[:10]

### Metadata

In [40]:
# NOTE(review): despite the "genre" names, this path points at u.user and the columns
# named below are user attributes.
genre_file = './../../../../Datasets/MovieLens/100k_raw/u.user'
# '|'-separated file; column labels are supplied explicitly via `names`.
df_genre_from_file = pd.read_csv(genre_file, sep='|', header=None, names = ['user', 'age', 'sex', 'occupation', 'zipcode'])
df_genre_from_file.head()

Unnamed: 0,user,age,sex,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [44]:
# Inspect the first four rows: number of fields, then the first two fields of each row.
for _, row in df_genre_from_file.head(4).iterrows():
    # .iloc makes the positional lookup explicit; plain row[0] on a string-labelled
    # Series relies on the deprecated "integer key means position" fallback.
    print(len(row), row.iloc[0], row.iloc[1])

5 1 24
5 2 53
5 3 23
5 4 24


In [34]:
# Distinct values of the 'user' column. This needs 'user' to still be a regular column,
# i.e. it must run before the set_index cell moves it into the index.
list_users = df_genre_from_file['user'].unique()
list_users[:5]

array([1, 2, 3, 4, 5], dtype=int64)

In [37]:
# Move 'user' into the index; verify_integrity=True makes pandas raise if a user id repeats.
# NOTE(review): inplace=True mutates the frame shown by earlier cells, and re-running this
# cell fails because 'user' is no longer a column afterwards.
df_genre_from_file.set_index(keys=['user'], verify_integrity=True, inplace=True)
df_genre_from_file.head()

Unnamed: 0_level_0,age,sex,occupation,zipcode
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [38]:
# Accumulators filled by the row loop in the next cell.
dict_values = {}          # attr1 -> {attr2: value}
list_col_1 = set()        # distinct first-field values (a set, despite the name)
list_col_2 = set()        # distinct second-field values (a set, despite the name)
mean_value = 0            # running sum of the third field; not divided anywhere in the visible cells
number_interactions = 0   # overwritten with the row count in the next cell

In [None]:
# Fill the accumulators from the rows of df_genre_from_file:
#   dict_values[attr1][attr2] = value, with value fixed to 1.0 for two-field rows.
# The original read `self.as_binary`, but there is no `self` at cell scope, so the
# flag is a plain local here.
# NOTE(review): on df_genre_from_file the third field is a string ('sex' or 'occupation',
# depending on whether the set_index cell has run), so float() below raises ValueError;
# presumably this loop is meant for an (attr1, attr2[, value]) frame - confirm the input.
as_binary = False
number_interactions = df_genre_from_file.shape[0]

for _, row in df_genre_from_file.iterrows():
    if len(row) == 1:
        raise TypeError("Error: Space type (sep) is invalid!")

    # Positional access through .iloc: the row Series is labelled by column name
    attr1, attr2 = int(row.iloc[0]), row.iloc[1]

    if len(row) == 2:
        stored_value = 1.0
    else:
        value = float(row.iloc[2])
        stored_value = 1.0 if as_binary else value
        # The running sum uses the raw value even in binary mode, as before
        mean_value += value

    dict_values.setdefault(attr1, {}).update({attr2: stored_value})
    list_col_1.add(attr1)
    list_col_2.add(attr2)
        


In [None]:
def read_similarity_or_metadata(df, names=None):    
    """
    Unfinished reader for similarity / metadata frames.

    NOTE(review): in the lines visible here neither ``df`` nor ``names`` is used,
    the ``read_csv`` result is discarded and nothing is returned.
    """
    # NOTE(review): `self` is not a parameter of this function, so unless a global named
    # `self` exists this line raises NameError when the function is called.
    list_items = self.df_feedback['item'].unique()
    # Reads the module-level `genre_file` path; the resulting frame is not assigned to anything.
    pd.read_csv(genre_file, sep='|', header=None, names = ['user', 'age', 'sex', 'occupation', 'zipcode'])