## Information Extractor

In [19]:
import pandas as pd

def extract(paths):
    """Build a summary table of dataset statistics, one row per ratings file.

    Every file is expected to hold ``user,item,rating,timestamp`` lines made of
    integers; blank lines are skipped.

    Parameters
    ----------
    paths : iterable of str
        Files to summarise. Both ``\\`` and ``/`` are accepted as directory
        separators when deriving the reported file name.

    Returns
    -------
    pandas.DataFrame
        Columns "Filename", "Size" (rows read), "Unique Rating" (percentage of
        rows that are distinct (user, item) pairs), "Users", "Last User ID",
        "Items", "Last Item ID", "Avg Rate", "Min Rate", "Max Rate" (number of
        rows per user) and "Sparsity" (percentage of user x item cells without
        a rating), sorted by "Size".

    Raises
    ------
    ValueError
        If a file contains no ratings, or a line does not hold four
        comma-separated integers.
    """
    columns = [
        "Filename", "Size", "Unique Rating", "Users", "Last User ID",
        "Items", "Last Item ID", "Avg Rate", "Min Rate", "Max Rate", "Sparsity",
    ]
    records = []

    for path in paths:
        users = []
        items = []
        ratings = []
        timestamps = []

        print('Reading file {}'.format(path))
        with open(path, 'r') as f:
            for line in f:
                line = line.replace("\n", "")
                if line == "":
                    continue
                user, item, rating, timestamp = line.split(",")
                users.append(int(user))
                items.append(int(item))
                ratings.append(int(rating))
                timestamps.append(int(timestamp))
        df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

        if df.empty:
            # Every statistic below is undefined without at least one row.
            raise ValueError("File {} contains no ratings".format(path))

        # Normalise the separators first so the base name is found on any OS.
        filename = path.replace("\\", "/").split("/")[-1]

        n_user = df["User"].nunique()
        n_item = df["Item"].nunique()

        # Number of rows per user, computed once and reused for min/max/mean.
        per_user = df.groupby("User").size()

        # Distinct (user, item) pairs; repeated interactions count once.
        unique_rating = df.groupby(["User", "Item"]).ngroups
        sparsity = (1 - unique_rating/(n_user*n_item)) * 100
        print("UR: {} EXP: {} User: {} Item: {} CR: {:.2f}%".format(unique_rating, n_user*n_item, n_user, n_item, sparsity))

        records.append({
            "Filename": filename,
            "Size": df.shape[0],
            "Unique Rating": unique_rating/df.shape[0]*100,
            "Users": n_user,
            "Last User ID": df["User"].max(),
            "Items": n_item,
            "Last Item ID": df["Item"].max(),
            "Avg Rate": int(per_user.mean()),
            "Min Rate": per_user.min(),
            "Max Rate": per_user.max(),
            "Sparsity": sparsity,
        })

    return pd.DataFrame(records, columns=columns).sort_values(by=["Size"])
    
# Preprocessed "user,item,rating,timestamp" files to summarise, listed
# relative to the dataset root.
dataset_root = "D:\\recsys\\datasets\\"
paths = [
    dataset_root + relative
    for relative in (
        "MovieLens\\ml-1m-gte.csv",
        "MovieLens\\ml-100k-gte.csv",
        "palco\\music_playlist.csv",
        "palco\\music_listen.csv",
        "CiaoDVD\\ciaodvd-gte.csv",
        "EachMovie\\eachmovie-gte.csv",
        "MovieTweetings\\movietweetings-gte.csv",
        "YELP\\yelp_dataset\\yelp-gte.csv",
        "LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K\\last-fm.csv",
        "Netflix\\netflix-gte.csv",
    )
]

# Kept as the last expression so the summary table is rendered as cell output.
extract(paths)

Reading file D:\recsys\datasets\MovieLens\ml-1m-gte.csv
UR: 226310 EXP: 19437248 User: 6014 Item: 3232 CR: 98.84%
Reading file D:\recsys\datasets\MovieLens\ml-100k-gte.csv
UR: 21201 EXP: 1087616 User: 928 Item: 1172 CR: 98.05%
Reading file D:\recsys\datasets\palco\music_playlist.csv
UR: 111927 EXP: 271407864 User: 10392 Item: 26117 CR: 99.96%
Reading file D:\recsys\datasets\palco\music_listen.csv
UR: 784360 EXP: 1023943619 User: 25463 Item: 40213 CR: 99.92%
Reading file D:\recsys\datasets\CiaoDVD\ciaodvd-gte.csv
UR: 32530 EXP: 105705108 User: 12508 Item: 8451 CR: 99.97%
Reading file D:\recsys\datasets\EachMovie\eachmovie-gte.csv
UR: 511614 EXP: 84075740 User: 54068 Item: 1555 CR: 99.39%
Reading file D:\recsys\datasets\MovieTweetings\movietweetings-gte.csv
UR: 14814 EXP: 20288368 User: 4856 Item: 4178 CR: 99.93%
Reading file D:\recsys\datasets\YELP\yelp_dataset\yelp-gte.csv
UR: 2641878 EXP: 172325625530 User: 1033561 Item: 166730 CR: 100.00%
Reading file D:\recsys\datasets\LastFM\lastfm

Unnamed: 0,Filename,Size,Unique Rating,Users,Last User ID,Items,Last Item ID,Avg Rate,Min Rate,Max Rate,Sparsity
6,movietweetings-gte.csv,14842,99.811346,4856,24899,4178,15131,3,1,83,99.926983
1,ml-100k-gte.csv,21201,100.0,928,943,1172,1656,22,1,172,98.050691
4,ciaodvd-gte.csv,32695,99.495336,12508,17615,8451,16119,2,1,306,99.969226
2,music_playlist.csv,111942,99.9866,10392,88921,26117,126623,10,1,2430,99.958761
0,ml-1m-gte.csv,226310,100.0,6014,6040,3232,3952,37,1,571,98.835689
5,eachmovie-gte.csv,511667,99.989642,54068,74424,1555,1648,9,1,448,99.391484
3,music_listen.csv,1466893,53.470839,25463,25462,40213,40212,57,1,61714,99.923398
7,yelp-gte.csv,2641880,99.999924,1033561,1518168,166730,188592,2,1,463,99.998467
8,last-fm.csv,19150868,24.115309,992,1000,1500661,1500660,19305,2,183103,99.689768
9,netflix-gte.csv,23168232,100.0,17755,17770,463616,480188,1304,1,96535,99.718542


## Optimize Data

In [22]:
# Since the framework uses the user index to access the matrix, ids are best kept contiguous and starting at zero, without user/item gaps.
# This function remaps every user and item id so that the ids are as close to zero as possible.

import numpy as np

def optimizer(paths, optimize_path="D:\\recsys\\datasets\\optimized\\{}"):
    """Rewrite rating files with dense, zero-based user and item ids.

    Each input file must hold ``user,item,rating,timestamp`` lines of integers
    (blank lines are skipped). Every user id is replaced by its rank among the
    sorted distinct user ids of that file, and likewise for item ids; ratings,
    timestamps and row order are written unchanged.

    Parameters
    ----------
    paths : iterable of str
        Files to convert.
    optimize_path : str, optional
        Output path template; ``{}`` is replaced by the base name of the input
        file. Defaults to the original hard-coded output directory. The target
        directory must already exist.
    """
    for path in paths:
        users = []
        items = []
        ratings = []
        timestamps = []

        print('Reading file {}'.format(path))
        with open(path, 'r') as f:
            for line in f:
                line = line.replace("\n", "")
                if line == "":
                    continue
                user, item, rating, timestamp = line.split(",")
                users.append(int(user))
                items.append(int(item))
                ratings.append(int(rating))
                timestamps.append(int(timestamp))
        df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

        # Normalise the separators first so the base name is found on any OS.
        filename = path.replace("\\", "/").split("/")[-1]

        print("Grouping users and items ... ", end="")
        # The groupby index holds the distinct ids in ascending order.
        items = df.groupby("Item").count().index.values
        users = df.groupby("User").count().index.values
        print("OK")

        print("Adding to a dictionary ... ", end="")
        item_dict = {item: index for index, item in enumerate(items)}
        user_dict = {user: index for index, user in enumerate(users)}
        print("OK")

        print('Writing csv file ... ', end="")
        # Map whole columns at once instead of formatting row by row.
        remapped = pd.DataFrame({
            "User": df["User"].map(user_dict),
            "Item": df["Item"].map(item_dict),
            "Rating": df["Rating"],
            "Timestamp": df["Timestamp"],
        })
        remapped.to_csv(optimize_path.format(filename), header=False, index=False)
        print("OK")
        
# Files whose user/item ids are to be remapped, grouped by dataset folder
# under the common dataset root.
datasets_dir = "D:\\recsys\\datasets"
paths = [
    "\\".join((datasets_dir, folder, name))
    for folder, name in (
        ("MovieLens", "ml-1m-gte.csv"),
        ("MovieLens", "ml-100k-gte.csv"),
        ("palco", "music_playlist.csv"),
        ("palco", "music_listen.csv"),
        ("CiaoDVD", "ciaodvd-gte.csv"),
        ("EachMovie", "eachmovie-gte.csv"),
        ("MovieTweetings", "movietweetings-gte.csv"),
        ("YELP\\yelp_dataset", "yelp-gte.csv"),
        ("LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K", "last-fm.csv"),
        ("Netflix", "netflix-gte.csv"),
    )
]

optimizer(paths)

Reading file D:\recsys\datasets\MovieLens\ml-1m-gte.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\MovieLens\ml-100k-gte.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\palco\music_playlist.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\palco\music_listen.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\CiaoDVD\ciaodvd-gte.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\EachMovie\eachmovie-gte.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\MovieTweetings\movietweetings-gte.csv
Grouping users and items ... OK
Adding to a dictionary ... O

## K-Sampling

In [None]:
def k_sampling(data, k):
    """Down-sample an interaction frame by a factor derived from ``k``.

    The procedure is:

    1. take every k-th row of ``data`` (by position);
    2. within that subset, rank users by number of rows (descending) and keep
       every k-th user; do the same for items;
    3. return all rows of ``data`` whose user AND item were both kept.

    Rows are always addressed by position, so the function works for any
    index (for example after ``sort_values`` or a filter), not only for a
    default 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "User", "Item" and "Timestamp".
    k : int
        Sampling step.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows, in their original order and with their
        original index labels.
    """
    def every_kth(length):
        # Positions 0, k, 2k, ... below `length`.
        return [e for e in range(length) if e % k == 0]

    # Positional selection: index labels are not assumed to be 0..n-1.
    st = data.iloc[every_kth(data.shape[0])]

    users = st.groupby("User").count().sort_values(by=["Timestamp"], ascending = False)["Item"]
    selected_users = users.iloc[every_kth(users.shape[0])].index.values

    items = st.groupby("Item").count().sort_values(by=["Timestamp"], ascending = False)["User"]
    selected_items = items.iloc[every_kth(items.shape[0])].index.values

    # A boolean mask avoids translating index labels into positions.
    keep = data["User"].isin(selected_users) & data["Item"].isin(selected_items)
    return data[keep].copy()

In [None]:
import pandas as pd

def generate_k_samples(path, k_values):
    """Write one K-sampled copy of a ratings file for every value in ``k_values``.

    The input must hold ``user,item,rating,timestamp`` lines of integers
    (blank lines are skipped). For each ``k`` the sample returned by
    ``k_sampling`` is written next to the input as ``<name>-K<k>.csv``.

    Parameters
    ----------
    path : str
        Ratings file to sample.
    k_values : iterable of int
        Sampling steps to generate.
    """
    import os  # local import keeps this cell runnable on its own

    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. in directory names) no longer truncate the output name.
    out = os.path.splitext(path)[0] + "-K{}.csv"

    users = []
    items = []
    ratings = []
    timestamps = []
    print('Reading file {} .. '.format(path), end="")
    with open(path, 'r') as f:
        for line in f:
            line = line.replace("\n", "")
            if line == "":
                continue
            user, item, rating, timestamp = line.split(",")
            users.append(int(user))
            items.append(int(item))
            ratings.append(int(rating))
            timestamps.append(int(timestamp))
    df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})
    print("OK")

    print("Starting K-sampling ..")
    for k_value in k_values:
        print("K{} data sampling ..".format(k_value), end="")
        st = k_sampling(df, k_value)
        print("OK")
        print("Writing file .. ", end="")
        st.to_csv(out.format(k_value), header=False, index=False,
                  columns=["User", "Item", "Rating", "Timestamp"])
        print("OK")
    
        

In [24]:
# Source files of the K-sampling experiment, all in one folder.
k_sampling_dir = "D:\\recsys\\datasets\\Experimentation Protocol\\K_Sampling\\"
data_paths = [
    k_sampling_dir + name
    for name in (
        "ciaodvd-gte.csv",
        "eachmovie-gte.csv",
        "ml-1m-gte.csv",
        "ml-100k-gte.csv",
        "movietweetings-gte.csv",
        "music-playlist.csv",
        "yelp-gte.csv",
        "last-fm.csv",
    )
]

k_values = list(range(2, 15))

# Sample generation is disabled here; only the id remapping below is run.
# for path in data_paths:
#     generate_k_samples(path, k_values)

# One "<stem>-K<k>.csv" path per (file, k) pair, files in the outer loop.
k_samples_path = [
    "{}-K{}.csv".format(path.split(".")[0], k_value)
    for path in data_paths
    for k_value in k_values
]

optimizer(k_samples_path)

Reading file D:\recsys\datasets\Experimentation Protocol\K_Sampling\ciaodvd-gte-K2.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\Experimentation Protocol\K_Sampling\ciaodvd-gte-K3.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\Experimentation Protocol\K_Sampling\ciaodvd-gte-K4.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\Experimentation Protocol\K_Sampling\ciaodvd-gte-K5.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\Experimentation Protocol\K_Sampling\ciaodvd-gte-K6.csv
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete
Reading file D:\recsys\datasets\Experimentation Protocol\K_Sampling\ciaodvd-gte-K7.csv
Grouping users and items ... OK
Adding

## MovieLens 1M

In [None]:
import pandas as pd

path = "D:\\recsys\\datasets\\MovieLens\\ml-1m\\ratings.dat"
output = "D:\\recsys\\datasets\\MovieLens\\ml-1m\\ml-1m-K{}.csv"
output_positive_only = "D:\\recsys\\datasets\\MovieLens\\ml-1m\\ml-1m-K{}-gte.csv"

users = []
items = []
ratings = []
timestamps = []
k_value = 2


# Each line of the input is parsed as "user::item::rating::timestamp".
print('Reading file {}'.format(path))
with open(path, 'r') as f:
    for line in f:
        line = line.replace("\n", "")
        if line == "":
            continue
        user, item, rating, timestamp = line.split("::")
        users.append(int(user))
        items.append(int(item))
        ratings.append(int(rating))
        timestamps.append(int(timestamp))
df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

print('Sorting by timestamp')
# Renumber the rows after sorting so that index labels match the
# chronological row positions used by the sampling step.
df = df.sort_values(by=["Timestamp"]).reset_index(drop=True)

print('K{}-Sampling'.format(k_value))
df = k_sampling(df, k_value)

# Full sample goes to `output`; rows rated exactly 5 are also written to the
# positive-only file with the rating replaced by 1.
print('Writing csv file')
with open(output.format(k_value), 'w+') as f, open(output_positive_only.format(k_value), "w+") as f_positive:
    for index, row in df.iterrows():
        f.write("{},{},{},{}\n".format(row['User'], row['Item'], row['Rating'], row['Timestamp']))
        if row['Rating'] == 5:
            f_positive.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
print("Complete")


## MovieLens 100k

In [None]:
import pandas as pd

path = "D:\\recsys\\datasets\\MovieLens\\ml-100k\\ml-100k\\u.data"
output = "D:\\recsys\\datasets\\MovieLens\\ml-100k\\ml-100k.csv"
output_positive_only = "D:\\recsys\\datasets\\MovieLens\\ml-100k\\ml-100k-gte.csv"

users, items, ratings, timestamps = [], [], [], []

# Each line of the input is parsed as tab-separated "user item rating timestamp".
print('Reading file {}'.format(path))
with open(path, 'r') as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        user, item, rating, timestamp = line.split("\t")
        users.append(int(user))
        items.append(int(item))
        ratings.append(int(rating))
        timestamps.append(int(timestamp))
df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print('Optimizing indexes')
# NOTE(review): no `optimize` function is defined in the visible notebook
# (only `optimizer(paths)`, which takes file paths) - confirm where it comes from.
df = optimize(df)

# Every row goes to `output`; rows rated exactly 5 are also written to the
# positive-only file with the rating replaced by 1.
print('Writing csv file')
with open(output, 'w+') as f, open(output_positive_only, "w+") as f_positive:
    for index, row in df.iterrows():
        f.write("{},{},{},{}\n".format(row['User'], row['Item'], row['Rating'], row['Timestamp']))
        if row['Rating'] == 5:
            f_positive.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
print("Complete")



## Palco Principal - Music-listen

In [9]:
import pandas as pd
from datetime import datetime


k_value = 2
path = "D:\\recsys\\datasets\\palco\\"
music_listen = path + "music_listen.csv"

users, items, timestamps = [], [], []

# Both listening logs are tab-separated "user item datetime" files whose
# first line is skipped.
for file in ('listenedtracks1.tsv', 'listenedtracks2.tsv'):
    print('Reading file {}'.format(path + file))
    with open(path + file, 'r') as f:
        next(f)
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            user, item, dt = line.split("\t")
            users.append(int(user))
            items.append(int(item))

            dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
            timestamps.append(int(dt.timestamp()))

df = pd.DataFrame({"User": users, "Item": items, "Timestamp": timestamps})

print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print("Grouping users and items ... ", end="")
# The groupby index holds the distinct ids in ascending order.
items = df.groupby("Item").count().index.values
users = df.groupby("User").count().index.values
print("OK")

print("Adding to a dictionary ... " , end="")
# Map every raw id to its rank among the sorted distinct ids.
item_dict = {item: index for index, item in enumerate(items)}
user_dict = {user: index for index, user in enumerate(users)}
print("OK")

del users, items, timestamps

# print('K{}-Sampling'.format(k_value))
# df = k_sampling(df,k_value)

# Every interaction is written with the remapped ids and a constant rating of 1.
print('Writing csv file')
with open(music_listen, 'w+') as f:
    f.writelines(
        "{},{},1,{}\n".format(user_dict[row['User']], item_dict[row['Item']], row['Timestamp'])
        for index, row in df.iterrows()
    )
print("Complete")

Reading file D:\recsys\datasets\palco\listenedtracks1.tsv
Reading file D:\recsys\datasets\palco\listenedtracks2.tsv
Sorting by timestamp
Grouping users and items ... OK
Adding to a dictionary ... OK
Writing csv file
Complete


In [4]:
# Distinct item and user ids of the current frame, taken from the groupby index.
items = df.groupby("Item").size().index.values
users = df.groupby("User").size().index.values

In [8]:
# Last entry of `users`; the array comes from a groupby index, which pandas
# sorts ascending by default, so this is the largest user id.
users[-1]

73031

## Palco Principal - Music-playlist

In [None]:
import pandas as pd
from datetime import datetime

path = "D:\\recsys\\datasets\\palco\\"
music_playlist = path + "music_playlist.csv"

users, items, timestamps = [], [], []

# Tab-separated "user item datetime" file; the first line is skipped.
print('Reading file {}'.format(path + "playlistedtracks.tsv"))
with open(path + "playlistedtracks.tsv", 'r') as f:
    next(f)
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        user, item, dt = line.split("\t")
        users.append(int(user))
        items.append(int(item))
        # Keep year, month, day, hour and minute; seconds are dropped.
        pieces = dt.split(" ")
        fields = [int(text) for text in pieces[0].split("-") + pieces[1].split(":")[:2]]
        dt = datetime(fields[0], fields[1], fields[2], fields[3], fields[4])
        ts = dt.timestamp()
        timestamps.append(int(ts))
df = pd.DataFrame({"User": users, "Item": items, "Timestamp": timestamps})

print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print('Optimizing indexes')
# NOTE(review): no `optimize` function is defined in the visible notebook
# (only `optimizer(paths)`, which takes file paths) - confirm where it comes from.
df = optimize(df)


# print('Writing csv file')
# with open(music_playlist, 'w+') as f:
#     for index, row in df.iterrows():
#         f.write("{},{},{},{}\n".format(row['User'], row['Item'], 1, row['Timestamp']))
# print("Complete")

## Netflix

In [18]:
from datetime import datetime
import pandas as pd
import numpy as np
from IPython.display import clear_output
from time import time 

# Converts the four Netflix "combined_data_*.txt" files into two CSV files:
# `output` with every rating and `output_positive` with the rows rated 5.
path = "D:\\recsys\\datasets\\netflix\\"
output = "D:\\recsys\\datasets\\netflix\\netflix.csv"
output_positive = "D:\\recsys\\datasets\\netflix\\netflix-gte.csv"

users = []
items = []
ratings = []
timestamps = []
counter = 0
# NOTE(review): `k_values` is not used anywhere in this cell.
k_values = [20, 15, 10]
print_timer = time()
for file in ['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt']:
    header = 'Reading file {}'.format(path + file)
    with open("{}{}".format(path, file), 'r') as f:
        for index, line in enumerate(f):
            counter += 1
            line = line.replace("\n", "")
            if line == "":
                continue
            # A line ending in ":" is a section header; its number becomes the
            # `user` of every following data line.
            # NOTE(review): in the Netflix Prize files these headers are
            # presumably movie ids, which would make "User" and "Item" swapped
            # here - confirm this is intended.
            if line[-1] == ":":
                user = int(line.replace(':',''))
                continue
            item, rating, date = line.split(',')
            # Refresh the progress display at most once every two seconds.
            if time() - print_timer > 2:
                print_timer = time()
                clear_output()
                print(header)
                # 100498277 is presumably the total number of input lines - TODO confirm.
                print("Index: {} Completed: {:.2f}%".format(counter, (counter + 1)/100498277 * 100))
            item = int(item)
            rating = int(rating)
            users.append(user)
            items.append(item)
            ratings.append(rating)
            # Dates are parsed as naive datetimes, so the timestamp depends on
            # the local timezone of the machine.
            timestamp = int(datetime.timestamp(datetime.strptime(date, "%Y-%m-%d")))
            timestamps.append(timestamp)

df = pd.DataFrame({"User": users, "Item": items, "Rating" : ratings, "Timestamp": timestamps})


clear_output()
print("Index: {} Completed: {:.2f}%".format(counter, (counter + 1)/100498277 * 100))
print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print("Grouping items ... ", end="")
# Distinct item ids, taken from the groupby index.
items = df.groupby("Item").count().index.values
item_dict = {}
print("OK")
        
print("Adding to a dictionary ... " , end="")
# Map every raw item id to its position in `items`; user ids are written as read.
for index, item in enumerate(items):
    item_dict[item] = index
print("OK")        

# Drop the intermediate containers; `df` holds the data from here on.
del users
del items
del timestamps
del ratings

print_timer = time()
counter = 0
print("Writing csv files ... ", end="")
with open(output, 'w+') as out:
    with open(output_positive, "w+") as f_positive:
        for index, row in df.iterrows():
            counter += 1
            if time() - print_timer > 2:
                print_timer = time()
                clear_output()
                print("Writing csv files ... ")
                print("Index: {} Completed: {:.2f}%".format(counter, (counter + 1)/100498277 * 100))

            item = item_dict[row["Item"]]
            out.write("{},{},{},{}\n".format(row["User"], item, row["Rating"], row["Timestamp"]))
            # Rows rated exactly 5 are also written to the positive-only file
            # with the rating replaced by 1.
            if row["Rating"] == 5:
                f_positive.write("{},{},1,{}\n".format(row["User"], item, row["Timestamp"]))
clear_output()
print("Writing csv files ... ")
print("Index: {} Completed: {:.2f}%".format(counter, (counter + 1)/100498277 * 100))
print("Completed")

Writing csv files ... 
Index: 100480507 Completed: 99.98%
Completed


In [7]:
# items = df.groupby("Item").count().index.values
# Distinct values of the "User" column, taken from the groupby index.
users = df.groupby("User").count().index.values

In [11]:
# Inspect the dtype of the "Item" column.
df["Item"].dtype

dtype('int64')

## LastFM

In [None]:
import pandas as pd
import hashlib
from datetime import datetime
from IPython.display import clear_output
from time import time

path = "D:\\recsys\\datasets\\LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K\\userid-timestamp-artid-artname-traid-traname.tsv"
output = "D:\\recsys\\datasets\\LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K\\last-fm.csv"

# Stable string key for an (artist, track) pair.
hash_it = lambda x : hashlib.sha224(x.encode('utf-8')).hexdigest()

users = []
items = []
timestamps = []
# Distinct users / item hashes seen so far; the progress lines report their
# sizes (these names were previously used without ever being defined).
all_users = set()
all_items = set()

# First pass: collect every (user, item hash, timestamp) triple.
print('Reading file {}'.format(path))
with open(path, 'r', encoding="utf8") as f:
    print_timer = time()
    for index, line in enumerate(f):
        line = line.replace("\n", "")
        if line == "":
            continue
        user, timestamp, _, artist, _, music = line.split("\t")
        # Refresh the progress display at most once every two seconds.
        if time() - print_timer > 2:
            print_timer = time()
            clear_output()
            print("Index: {} Completed: {:.2f}% Users: {} Items: {}".format(index, (index + 1)/19150868 * 100, len(all_users), len(all_items)))
        item = (artist, music)
        # The user id is the integer formed by the last four characters.
        user_id = int(user[-4:])
        item_hash = hash_it(str(item))
        users.append(user_id)
        items.append(item_hash)
        all_users.add(user_id)
        all_items.add(item_hash)
        dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
        timestamps.append(int(datetime.timestamp(dt)))
df = pd.DataFrame({"User": users, "Item": items, "Timestamp": timestamps})

clear_output()
print("Index: {} Completed: {:.2f}% Users: {} Items: {}".format(index, (index + 1)/19150868 * 100, len(all_users), len(all_items)))
print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

# Map every item hash to its rank among the sorted distinct hashes.
items = df.groupby("Item").count().index.values
item_dict = {}

for index, item in enumerate(items):
    item_dict[item] = index

# Free the large intermediates before the second pass.
del df
del users
del timestamps

# Second pass: stream the source file again and write the converted rows in
# file order, with a constant rating of 1.
with open(path, 'r', encoding="utf8") as f:
    with open(output, 'w+') as out:
        print_timer = time()
        for index, line in enumerate(f):
            line = line.replace("\n", "")
            if line == "":
                continue
            user, timestamp, _, artist, _, music = line.split("\t")
            if time() - print_timer > 2:
                print_timer = time()
                clear_output()
                print("Index: {} Completed: {:.2f}% Users: {} Items: {}".format(index, (index + 1)/19150868 * 100, len(all_users), len(all_items)))
            item = (artist, music)
            dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
            out.write("{},{},1,{}\n".format(int(user[-4:]), item_dict[hash_it(str(item))], int(datetime.timestamp(dt))))

In [None]:
import pandas as pd
import hashlib
from datetime import datetime
from IPython.display import clear_output
from time import time

path = "D:\\recsys\\datasets\\LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K\\last-fm-preprocessed.csv"
out = "D:\\recsys\\datasets\\LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K\\last-fm.csv"

hash_it = lambda x : hashlib.sha224(x.encode('utf-8')).hexdigest()

users = []
items = []
timestamps = []
# Distinct users / items seen so far; the progress lines report their sizes
# (these names were previously used without ever being defined).
all_users = set()
all_items = set()

# Each line is read as "user,item,<ignored>,timestamp".
print('Reading file {}'.format(path))
with open(path, 'r', encoding="utf8") as f:
    print_timer = time()
    for index, line in enumerate(f):
        line = line.replace("\n", "")
        if line == "":
            continue
        user, item, _, timestamp = line.split(",")
        # Refresh the progress display at most once every two seconds.
        if time() - print_timer > 2:
            print_timer = time()
            clear_output()
            print("Index: {} Completed: {:.2f}% Users: {} Items: {}".format(index, (index + 1)/19150868 * 100, len(all_users), len(all_items)))
        # Parse to int so the sort below is numeric rather than lexicographic.
        user = int(user)
        item = int(item)
        items.append(item)
        users.append(user)
        timestamps.append(int(timestamp))
        all_users.add(user)
        all_items.add(item)
df = pd.DataFrame({"User": users, "Item": items, "Timestamp": timestamps})

clear_output()
print("Index: {} Completed: {:.2f}% Users: {} Items: {}".format(index, (index + 1)/19150868 * 100, len(all_users), len(all_items)))
print('Sorting by timestamp')
# Renumber the rows after sorting so that index labels match the
# chronological row positions for any later sampling of `df`.
df = df.sort_values(by=["Timestamp"]).reset_index(drop=True)

print("Writing csv files")
# Write to the path defined in this cell (`out`), not to a variable left over
# from another cell.
with open(out, 'w+') as f:
    for index, row in df.iterrows():
        f.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
print("Completed")

In [None]:
import pandas as pd
import numpy as np

out = "D:\\recsys\\datasets\\LastFM\\lastfm-dataset-360K\\lastfm-dataset-1K\\last-fm-K{}.csv"

k_values = np.arange(2,15)

# One output file per K; the file is opened before the sample is computed.
for k in k_values:
    target = out.format(k)
    with open(target, 'w+') as f:
        print("K{}-Sampling.. ".format(k), end="")
        sampled = k_sampling(df, k)
        print("OK")
        print("Writing file.. ", end="")
        # Every sampled interaction is written with a constant rating of 1.
        f.writelines(
            "{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp'])
            for index, row in sampled.iterrows()
        )
        print("OK")

## CiaoDVD

In [None]:
import pandas as pd
from datetime import datetime
from time import time

path = "D:\\recsys\\datasets\\CiaoDVD\\movie-ratings.txt"
output = "D:\\recsys\\datasets\\CiaoDVD\\ciaodvd.csv"
output_positive = "D:\\recsys\\datasets\\CiaoDVD\\ciaodvd-gte.csv"


users, items, ratings, timestamps = [], [], [], []

# file = []

# Each line has six comma-separated fields; the third and fourth are ignored
# and the last one is a "YYYY-MM-DD" date.
print('Reading file {}'.format(path))
with open(path, 'r', encoding="utf8") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        user, item, _, _, rating, timestamp = line.split(",")
        users.append(int(user))
        items.append(int(item))
        ratings.append(int(rating))
        dt = datetime.strptime(timestamp, "%Y-%m-%d")
        timestamps.append(int(dt.timestamp()))
df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print('Optimizing indexes')
# NOTE(review): no `optimize` function is defined in the visible notebook
# (only `optimizer(paths)`, which takes file paths) - confirm where it comes from.
df = optimize(df)


# Every row goes to `output`; rows rated exactly 5 are also written to the
# positive-only file with the rating replaced by 1.
print("Writing csv files")
with open(output, 'w+') as f, open(output_positive, "w+") as f_positive:
    for index, row in df.iterrows():
        f.write("{},{},{},{}\n".format(row['User'], row['Item'], row['Rating'], row['Timestamp']))
        if row['Rating'] == 5:
            f_positive.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
print("Completed")

## EachMovie

In [None]:
import pandas as pd
from datetime import datetime
from time import time



# EachMovie input file and the two output templates; "{}" takes the K value.
path = "D:\\recsys\\datasets\\EachMovie\\Vote.txt"
output, output_positive = (
    "D:\\recsys\\datasets\\EachMovie\\eachmovie-K{}.csv",
    "D:\\recsys\\datasets\\EachMovie\\eachmovie-K{}-gte.csv",
)

def zero_padding(datetime):
    """Left-pad every numeric field of a "date time" string to two digits.

    The input must consist of a "/"-separated date and a ":"-separated time
    joined by a single space, e.g. "1/2/97 3:4:5" becomes "01/02/97 03:04:05".
    Raises ValueError if the string does not split into exactly two parts or
    a field is not an integer.
    """
    date_part, clock_part = datetime.split(" ")
    padded_date = "/".join("{0:02d}".format(int(field)) for field in date_part.split("/"))
    padded_clock = ":".join("{0:02d}".format(int(field)) for field in clock_part.split(":"))
    return "{} {}".format(padded_date, padded_clock)


users = []
items = []
ratings = []
timestamps = []
k_value = 5
# file = []

# Each line has five tab-separated fields; the fourth is ignored. The rating
# is multiplied by 5 and truncated to an integer.
print('Reading file {}'.format(path))
with open(path, 'r', encoding="utf8") as f:
    for line in f:
        line = line.replace("\n", "")
        if line == "":
            continue
        user, item, rating, _, timestamp = line.split("\t")
        users.append(int(user))
        items.append(int(item))
        ratings.append(int(float(rating)*5))
        # The source dates are zero-padded before being parsed with strptime.
        dt = datetime.strptime(zero_padding(timestamp), "%m/%d/%y %H:%M:%S")
        timestamps.append(int(datetime.timestamp(dt)))
df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

print('Sorting by timestamp')
# Renumber the rows after sorting so that index labels match the
# chronological row positions used by the sampling step.
df = df.sort_values(by=["Timestamp"]).reset_index(drop=True)

# This step performs K-sampling; the message previously said "Optimizing indexes".
print('K{}-Sampling'.format(k_value))
df = k_sampling(df, k_value)


# Full sample goes to `output`; rows rated exactly 5 are also written to the
# positive-only file with the rating replaced by 1.
print("Writing csv files")
with open(output.format(k_value), 'w+') as f, open(output_positive.format(k_value), "w+") as f_positive:
    for index, row in df.iterrows():
        f.write("{},{},{},{}\n".format(row['User'], row['Item'], row['Rating'], row['Timestamp']))
        if row['Rating'] == 5:
            f_positive.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
print("Completed")

## MovieTweetings

In [None]:
import pandas as pd
from time import time

# The three MovieTweetings splits are merged into a single dataset.
paths = [
    "D:\\recsys\\datasets\\MovieTweetings\\test.dat",
    "D:\\recsys\\datasets\\MovieTweetings\\training.dat",
    "D:\\recsys\\datasets\\MovieTweetings\\evaluation.dat",
]

output = "D:\\recsys\\datasets\\MovieTweetings\\movietweetings.csv"
output_positive = "D:\\recsys\\datasets\\MovieTweetings\\movietweetings-gte.csv"

# Module-level accumulators filled by process_base().
users, items, ratings, timestamps = [], [], [], []

def process_base(path):
    """Append the ratings stored in ``path`` to the module-level lists.

    The first line of the file is always skipped, as are blank lines. Of each
    remaining line only the text before the first "{" is used; it must split
    into five comma-separated fields: user, item, rating, timestamp and one
    ignored field. Lines whose rating is greater than 10 are dropped.
    """
    print('Reading file {}'.format(path))
    with open(path, 'r', encoding="utf8") as f:
        next(f, None)  # skip the first line
        for raw in f:
            record = raw.rstrip("\n")
            if not record:
                continue
            user, item, rating, timestamp, _ = record.split("{")[0].split(",")
            if int(rating) > 10:
                continue
            users.append(int(user))
            items.append(int(item))
            ratings.append(int(rating))
            timestamps.append(int(timestamp))

# Fill the shared accumulators from every split, in the order listed in `paths`.
for path in paths:
    process_base(path)

df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print('Optimizing indexes')
# NOTE(review): no `optimize` function is defined in the visible notebook
# (only `optimizer(paths)`, which takes file paths) - confirm where it comes from.
df = optimize(df)

# Every row goes to `output`; rows rated exactly 5 are also written to the
# positive-only file with the rating replaced by 1.
# NOTE(review): process_base accepts ratings up to 10, so "== 5" may not be
# the intended positive threshold for this dataset - confirm.
print("Writing csv files")
with open(output, 'w+') as f, open(output_positive, "w+") as f_positive:
    for index, row in df.iterrows():
        f.write("{},{},{},{}\n".format(row['User'], row['Item'], row['Rating'], row['Timestamp']))
        if row['Rating'] == 5:
            f_positive.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
print("Completed")

## YELP

In [None]:
import pandas as pd
from datetime import datetime
from time import time
from IPython.display import clear_output
import json


path = "D:\\recsys\\datasets\\YELP\\yelp_dataset\\yelp_academic_dataset_review.json"

output = "D:\\recsys\\datasets\\YELP\\yelp_dataset\\yelp.csv"
output_positive = "D:\\recsys\\datasets\\YELP\\yelp_dataset\\yelp-gte.csv"

k_values = [5, 2]

users = []
items = []
ratings = []
timestamps = []

print_timer = time()

# First pass: collect (user, item, rating, timestamp) from every review.
print('Reading file {}'.format(path))
with open(path, 'r', encoding="utf8") as f:
    for index, line in enumerate(f):
        line = line.replace("\n", "")
        if line == "":
            continue
        # Cut the record just before its "text" field and close the object
        # again, so only the leading fields are parsed as JSON.
        data = json.loads(line.split(',"text"')[0] + "}")
        user, item, rating, date = data["user_id"], data["business_id"], data["stars"], data["date"]
        users.append(str(user))
        items.append(str(item))
        ratings.append(int(rating))
        dt = datetime.strptime(date, "%Y-%m-%d")
        timestamps.append(int(datetime.timestamp(dt)))
        # Refresh the progress display at most once every two seconds.
        if time() - print_timer > 2:
            print_timer = time()
            clear_output()
            print("Index: {} Completed: {:.2f}%".format(index, (index + 1)/5996997 * 100))
clear_output()
print("Index: {} Completed: {:.2f}%".format(index, (index + 1)/5996997 * 100))

df = pd.DataFrame({"User": users, "Item": items, "Rating": ratings, "Timestamp": timestamps})

print('Sorting by timestamp')
df = df.sort_values(by=["Timestamp"])

print("Grouping users and items ... ", end="")
# The groupby index holds the distinct ids in ascending order.
items = df.groupby("Item").count().index.values
users = df.groupby("User").count().index.values
item_dict = {}
user_dict = {}
print("OK")


print("Adding to a dictionary ... " , end="")
# Map every string id to its rank among the sorted distinct ids.
for index, item in enumerate(items):
    item_dict[item] = index

for index, user in enumerate(users):
    user_dict[user] = index
print("OK")

# Free the large intermediates before the second pass.
del df
del users
del items
del timestamps

# Second pass: stream the source file again and write the converted rows in
# file order.
print("Writing csv files ... ", end="")
with open(path, 'r', encoding="utf8") as f:
    with open(output, 'w+') as out:
        with open(output_positive, "w+") as f_positive:
            for index, line in enumerate(f):
                if time() - print_timer > 2:
                    print_timer = time()
                    clear_output()
                    print("Writing csv files ... ")
                    print("Index: {} Completed: {:.2f}%".format(index, (index + 1)/5996997 * 100))
                line = line.replace("\n", "")
                # Skip blank lines exactly as the first pass does; parsing one
                # would raise a JSON decoding error.
                if line == "":
                    continue
                data = json.loads(line.split(',"text"')[0] + "}")
                user, item, date = user_dict[data["user_id"]], item_dict[data["business_id"]], data["date"]
                # Cast the rating as in the first pass so the CSV always holds
                # integers, whatever numeric type the JSON uses.
                rating = int(data["stars"])
                timestamp = int(datetime.timestamp(datetime.strptime(date, "%Y-%m-%d")))
                out.write("{},{},{},{}\n".format(user, item, rating, timestamp))
                # Rows rated exactly 5 are also written to the positive-only
                # file with the rating replaced by 1.
                if rating == 5:
                    f_positive.write("{},{},1,{}\n".format(user, item, timestamp))
clear_output()
print("Writing csv files ... ")
print("Index: {} Completed: {:.2f}%".format(index, (index + 1)/5996997 * 100))
print("Completed")
# for k in k_values:
#     print('K{}-Sampling'.format(k))
#     st = k_sampling(df, k)
#     print("Compression rate: {:.2f}% Ratings: {}".format(100 - st.shape[0]/df.shape[0]*100, st.shape[0]))

#     print("Writing K{} csv files".format(k))
#     with open(output.format(k), 'w+') as f:
#         with open(output_positive.format(k), "w+") as f_positive:
#             for index, row in st.iterrows():
#                 f.write("{},{},{},{}\n".format(row['User'], row['Item'], row['Rating'], row['Timestamp']))
#                 if row['Rating'] == 5:
#                     f_positive.write("{},{},1,{}\n".format(row['User'], row['Item'], row['Timestamp']))
# print("Completed")

In [None]:
# NOTE(review): the YELP cell above ends with `del df`, so on a fresh
# Restart & Run All this line fails unless `df` is rebuilt first - confirm
# which frame is meant to be sampled here.
df = k_sampling(df, 5)

In [None]:
# Display the current frame as the cell output.
df