In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the ratings table and discard its timestamp column.
data = pd.read_csv(
    '../Data/ml100/movieLens_ratings.csv',
    sep=',',
    header=0,
).drop(columns='timestamp')

In [3]:
# Keep only the rows whose rating is 3 or higher.
data = data.loc[data['rating'].ge(3)]

In [4]:
# Convert item ids to strings and collapse the table to one row per user,
# whose 'item_id' cell holds the list of that user's items.
# `assign` builds a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
data = (
    data
    .assign(item_id=data['item_id'].astype(str))
    .groupby('user_id')['item_id']
    .apply(list)
    .reset_index(name='item_id')
)

In [5]:
# Load the users table.
user_data = pd.read_csv(
    '../Data/ml100/movieLens_users.csv',
    sep=',',
    header=0,
)

In [6]:
# User ids grouped by gender
male_users = user_data.loc[user_data['gender'].eq('M'), 'user_id'].tolist()
female_users = user_data.loc[user_data['gender'].eq('F'), 'user_id'].tolist()

In [7]:
def lista_id_user_by_age(age, users=None):
    """Return the ids of the users whose 'age' column equals ``age``.

    Parameters
    ----------
    age : value compared for equality against the 'age' column.
    users : pandas.DataFrame, optional
        Frame with 'user_id' and 'age' columns. When omitted, the
        notebook-level ``user_data`` frame is used, as before.

    Returns
    -------
    list
        The matching 'user_id' values, in frame order (empty if none match).
    """
    frame = user_data if users is None else users
    return frame.loc[frame['age'] == age, 'user_id'].tolist()

In [8]:
# User ids grouped by age code; the keys of dict_age are the values matched
# against the 'age' column, visited in ascending order.
dict_age = {
    1: "Under_18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}
id_users_by_age = list(map(lista_id_user_by_age, sorted(dict_age)))

In [9]:
# Two independent, empty frames with the same pair of columns.
train, test = (
    pd.DataFrame(columns=['user_id', 'item_id']) for _ in range(2)
)

In [10]:
# Display the current contents of `data`.
data

Unnamed: 0,user_id,item_id
0,1,"[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
1,2,"[1357, 3068, 1537, 647, 2194, 648, 2268, 2628,..."
2,3,"[3421, 648, 1394, 3534, 104, 2735, 1210, 1431,..."
3,4,"[3468, 1210, 2951, 1214, 1036, 260, 2028, 480,..."
4,5,"[2987, 2333, 1175, 39, 2337, 1535, 1392, 1466,..."
...,...,...
6034,6036,"[571, 574, 2054, 589, 6, 3006, 3008, 1405, 140..."
6035,6037,"[589, 3006, 1407, 2064, 2065, 593, 3015, 903, ..."
6036,6038,"[1419, 920, 3088, 232, 1136, 1148, 1183, 2146,..."
6037,6039,"[588, 2067, 1416, 3022, 3028, 2080, 2083, 2087..."


In [11]:
# Split the dataset: first 90% of each user's item list -> train, rest -> test.
# The split is positional (no shuffling). Users with at most one item get
# that same list in both partitions.
#
# The rows are keyed by the real 'user_id' column, not by the frame's
# positional index: the later gender/age filters compare against the ids in
# the users table. Records are collected in lists and each frame is built
# once, because DataFrame.append was removed in pandas 2.0.
train_rows, test_rows = [], []
for user_id, items in zip(data['user_id'], data['item_id']):
    if len(items) <= 1:
        train_items = test_items = items
    else:
        cut = int(len(items) * 0.9)
        train_items, test_items = items[:cut], items[cut:]
    train_rows.append({'user_id': user_id, 'item_id': train_items})
    test_rows.append({'user_id': user_id, 'item_id': test_items})

train = pd.DataFrame(train_rows, columns=['user_id', 'item_id'])
test = pd.DataFrame(test_rows, columns=['user_id', 'item_id'])

In [12]:
# Split the test set by gender
test_M = test.loc[test['user_id'].isin(male_users)]
test_F = test.loc[test['user_id'].isin(female_users)]

In [13]:
# Split the test set by age group: one sub-frame per id list, in the same order
test_users_by_age = [
    test.loc[test['user_id'].isin(ids_del_grupo)]
    for ids_del_grupo in id_users_by_age
]

In [14]:
# Save partitions to disk
def _escribir_particion(ruta, particion):
    """Write ``particion`` to ``ruta``: a '0 0' first line, then one line per row."""
    # The context manager closes the file even if a row fails to serialize.
    with open(ruta, 'w') as archivo:
        archivo.write('0 0\n')
        for _, fila in particion.iterrows():
            items = ' '.join(map(str, fila['item_id']))
            # Format kept byte-for-byte from the original writer, including
            # the single space that precedes the newline.
            archivo.write(str(fila['user_id']) + ' ' + items + ' \n')


def guardar_archivo(nombre, data, train, tipo='test'):
    """Write ``data`` and ``train`` as text files under ``../Data/ml100-{nombre}/``.

    Two files are produced: ``{tipo}.txt`` from ``data`` and ``train.txt``
    from ``train``. Each starts with the line ``0 0`` and then has one line
    per row: the row's 'user_id' followed by its space-separated 'item_id'
    entries. When ``tipo`` is ``'train'`` both writes target the same file,
    so the contents of ``train`` are what remains.

    Parameters
    ----------
    nombre : str
        Suffix of the output directory name.
    data : pandas.DataFrame
        Frame with 'user_id' and 'item_id' (iterable) columns.
    train : pandas.DataFrame
        Frame with the same columns, written to ``train.txt``.
    tipo : str, default 'test'
        Base name of the file that receives ``data``.
    """
    carpeta = os.path.join('../Data/', f"ml100-{nombre}")
    # exist_ok avoids the check-then-create race; missing parents are created.
    os.makedirs(carpeta, exist_ok=True)
    _escribir_particion(os.path.join(carpeta, f"{tipo}.txt"), data)
    _escribir_particion(os.path.join(carpeta, 'train.txt'), train)

In [15]:
# Save the train partition
guardar_archivo('train_all', data=train, train=train, tipo='train')

In [16]:
# Save the full test partition and the per-gender test partitions
for nombre_salida, particion in (
    ('test_all', test),
    ('test_M', test_M),
    ('test_F', test_F),
):
    guardar_archivo(nombre_salida, particion, train)

In [17]:
# Save the per-age test partitions, pairing each age code with its sub-frame
codigos_edad = sorted(dict_age)
for posicion, codigo in enumerate(codigos_edad):
    if posicion >= len(test_users_by_age):
        break  # same truncation as zip() when the lists differ in length
    guardar_archivo(f"test_{codigo}_age", test_users_by_age[posicion], train)