In [1]:
import pandas as pd
from scipy.sparse import coo_matrix

# Load the raw tab-separated play-count file. The file carries no header row,
# so the column names are supplied explicitly.
lfm_columns = ["UserId", "ItemId", "Artist", "Count"]
lfm = pd.read_table(
    "usersha1-artmbid-artname-plays.tsv",  # tab is read_table's default separator
    header=None,
    names=lfm_columns,
)
lfm.head(3)

Unnamed: 0,UserId,ItemId,Artist,Count
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897


In [2]:
# Keep only rows with a positive play count and non-missing user/item ids,
# discard the artist-name column, then replace the string ids with the integer
# codes that pd.Categorical assigns to them (computed on the filtered rows).
lfm = (
    lfm
    .loc[lambda frame: frame["Count"].gt(0)
                       & frame["UserId"].notna()
                       & frame["ItemId"].notna()]
    .drop(columns="Artist")
    .assign(
        UserId=lambda frame: pd.Categorical(frame["UserId"]).codes,
        ItemId=lambda frame: pd.Categorical(frame["ItemId"]).codes,
    )
)
lfm.head(3)

Unnamed: 0,UserId,ItemId,Count
0,0,37425,2137
1,0,152039,1099
2,0,112365,897


In [3]:
# Assemble the user-by-item play-count matrix in COO format:
# values come from Count, row indices from UserId, column indices from ItemId.
# No explicit shape is passed, so scipy infers it from the largest indices.
X = coo_matrix(
    (
        lfm["Count"],
        (lfm["UserId"], lfm["ItemId"]),
    )
)
X

<358858x160112 sparse matrix of type '<class 'numpy.int64'>'
	with 17309518 stored elements in COOrdinate format>

In [4]:
import recometrics

def _triplets_frame(matrix):
    """Unpack a COO sparse matrix into a (user_id, item_id, counts) DataFrame."""
    return pd.DataFrame({
        "user_id" : matrix.row,
        "item_id" : matrix.col,
        "counts" : matrix.data,
    })


# Options forwarded to recometrics.split_reco_train_test; the seed is fixed so
# the split is repeatable. NOTE(review): the exact meaning of each option is
# defined by recometrics -- consult its documentation before changing values.
split_options = dict(
    split_type="joined",
    users_test_fraction=None,
    max_test_users=10000,
    items_test_fraction=0.3,
    min_pos_test=2,
    min_items_pool=10,
    seed=123,
)
X_train, X_test, users_test = recometrics.split_reco_train_test(X, **split_options)

# Convert both splits to COO so that .row / .col / .data are available.
X_train, X_test = X_train.tocoo(), X_test.tocoo()

df_train = _triplets_frame(X_train)
df_test = _triplets_frame(X_test)
df_all = _triplets_frame(X)

# Report the size of each table: full data first, then train, then test.
for frame in (df_all, df_train, df_test):
    print(frame.shape)

(17309518, 3)
(17164027, 3)
(145124, 3)


In [5]:
# Write the full, train and test interaction tables to CSV files (relative
# paths), leaving out the DataFrame index column.
for csv_name, frame in (
    ("lfm_all.csv", df_all),
    ("lfm_train.csv", df_train),
    ("lfm_test.csv", df_test),
):
    frame.to_csv(csv_name, index=False)