In [1]:
using Distributed, CSV, LinearAlgebra, Random, Distributions, OnlineStats
using SparseArrays, SharedArrays, DistributedArrays
using HDF5, Tables, DataFrames, SQLite, Suppressor

In [2]:
# Ratings CSV used by the cells below (path relative to the working directory).
file = "ratings.csv"
# Absolute path to a second ratings CSV; not referenced by any other cell shown here.
bigfile = "/Users/guillaume/Downloads/ml-latest/ratings.csv"

"/Users/guillaume/Downloads/ml-latest/ratings.csv"

In [3]:
# Parse `file` with CSV.jl (requesting memory-mapped input); rows are later bulk-loaded
# into SQLite by `csv2db`.
csv = CSV.File(file, use_mmap=true)

CSV.File("ratings.csv", rows=100836):
Tables.Schema:
 :userId     Union{Missing, Int64}  
 :movieId    Union{Missing, Int64}  
 :rating     Union{Missing, Float64}
 :timestamp  Union{Missing, Int64}  

In [4]:
;head -n 5 $file

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815


In [5]:
# One rating record linking an item (movie) to a user.
mutable struct Rating
    # Item identifier (filled from the `movieId`/`itemId` column by the callers in this file).
    itemId::Int64
    # Identifier of the user who gave the rating.
    userId::Int64
    # Rating value; `alsbiased2`/`alsbiased3` store the residual
    # `rating - μ - item bias` here rather than the raw rating.
    value::Float64
end

# Per-item model parameters.
# NOTE(review): not referenced by the code visible in this file; presumably a latent
# factor vector plus a scalar bias term — confirm against callers.
mutable struct Item
    weights::Vector{Float64}
    bias::Float64
end

# Per-user model parameters (same layout as `Item`).
# NOTE(review): not referenced by the code visible in this file; presumably a latent
# factor vector plus a scalar bias term — confirm against callers.
mutable struct User
    weights::Vector{Float64}
    bias::Float64
end

In [6]:
"""
    create_h5_file(filename::String)

Make sure an HDF5 file exists at `filename` by opening it in `"cw"` mode and closing it
again straight away. Returns `nothing`.
"""
function create_h5_file(filename::String)
    handle = h5open(filename, "cw")
    close(handle)
    return nothing
end

"""
    initialize_h5_dataset(filename, dataset, dims; chunksize=1000)

Create the `Float64` dataset `dataset` with shape `dims` inside the existing HDF5 file
`filename` and fill it block by block, `chunksize` rows at a time.

The first column of every row is set to zero; the remaining `dims[2] - 1` columns are
drawn from `Normal(0.0, 1e-4)`.
"""
function initialize_h5_dataset(filename::String, dataset::String, dims::Tuple{Int,Int};
        chunksize::Int64=1000)
    nrows, ncols = dims
    libver = (HDF5.H5F_LIBVER_LATEST, HDF5.H5F_LIBVER_LATEST)
    noise = Normal(0.0, 1e-4)
    h5open(filename, "r+", "libver_bounds", libver) do fid
        dset = d_create(fid, dataset, datatype(Float64), dataspace(nrows, ncols))
        # Write one row block at a time so only `chunksize` rows are held in memory.
        for block in Iterators.partition(1:nrows, chunksize)
            rows = first(block):last(block)
            nblock = length(rows)
            dset[rows, :] = hcat(zeros(nblock), rand(noise, nblock, ncols - 1))
        end
    end
end

"""
    open_h5_dataset(fun::Function, filename::String, dataset::String)

Open the HDF5 file `filename` in `"r+"` mode, look up `dataset`, and call `fun` with it.
When the dataset is memory-mappable, `fun` receives the memory-mapped array instead of the
dataset handle. Returns whatever `fun` returns.

The file is closed when `fun` returns or throws, and also when the dataset lookup or the
memory-mapping step fails.
"""
function open_h5_dataset(fun::Function, filename::String, dataset::String)
    fid = h5open(filename, "r+")
    try
        # Lookup and mmap happen inside the `try` so a missing dataset cannot leak `fid`.
        dset = fid[dataset]
        if ismmappable(dset)
            dset = readmmap(dset)
        end
        return fun(dset)
    finally
        close(fid)
    end
end

open_h5_dataset (generic function with 1 method)

In [7]:
"""
    csv2db(f::Function, csv::CSV.File)

Load `csv` into a table of a temporary SQLite database and call `f(db, table)`.
Returns whatever `f` returns.

The table is named after the CSV file name without directory or extension. The temporary
database file is removed afterwards, including when loading the CSV or `f` throws.
"""
function csv2db(f::Function, csv::CSV.File)
    dbfile::String = tempname()
    try
        db::SQLite.DB = SQLite.DB(dbfile)
        # Strip any directory part: the name is interpolated into SQL by the callers, so
        # "/path/to/ratings.csv" must become "ratings", not "/path/to/ratings".
        table::String = first(splitext(basename(csv.name)))
        csv |> SQLite.load!(db, table)
        return f(db, table)
    finally
        # `force=true`: do not mask the original error if the file was never created.
        rm(dbfile; force=true)
    end
end

csv2db (generic function with 1 method)

In [None]:
# Job descriptor pairing an id with an input and an output channel.
# NOTE(review): not used by the code visible in this file; presumably `id` is a user id
# and the channels carry that user's ratings in and the fitted factor out — confirm.
mutable struct UserFactorJob
    id::Int64
    input::Channel
    output::Channel
end

# Job descriptor pairing an id with an input and an output channel (same layout as
# `UserFactorJob`).
# NOTE(review): not used by the code visible in this file; presumably the item-side
# counterpart of `UserFactorJob` — confirm.
mutable struct ItemFactorJob
    id::Int64
    input::Channel
    output::Channel
end

In [None]:
"""
    alsbiased2(db, table, k=10; nepochs=10, reg=0.0001, cb=nothing)

Run one user-factor pass of a biased ALS-style factorisation over the ratings stored in
`table` of `db` (columns `movieId`, `userId`, `rating`), streaming the data through channels.

Steps:
1. Query the global mean rating `μ`, the largest `movieId` (`m`) and the largest `userId` (`n`).
2. Materialise the tables `item_sorted` and `user_sorted` in `db`.
3. Create a temporary HDF5 file with datasets `P` (`m × (k+1)`) and `Q` (`n × (k+1)`).
4. A producer task pages through `user_sorted` and, for every user, streams
   `(rating - μ - P[item, 1], [1.0; P[item, 2:end]])` pairs through a per-user channel.
5. The calling task fits a `LinReg` per user and sends `coef(o, reg)` to a writer task,
   which stores it in row `userId` of `Q`.

# Arguments
- `db`, `table`: SQLite database and name of the ratings table.
- `k`: number of latent factors (the stored rows have `k + 1` entries).

# Keywords
- `reg`: regularisation value passed to `coef`.
- `nepochs`, `cb`: accepted but currently unused.

Returns `nothing`; the temporary HDF5 file is deleted before returning, so the fitted
factors are not persisted. Errors raised inside the producer or writer task are rethrown
to the caller.
"""
function alsbiased2(db::SQLite.DB, table::String, k::Int64=10;
        nepochs::Int64=10,
        reg::Float64=0.0001,
        cb::Union{Nothing, Function}=nothing)

    print("Computing metrics...")
    μ::Float64 = @suppress_err SQLite.query(db, "SELECT AVG(rating) as mean FROM $table")[1, :mean]
    m::Int64 = @suppress_err SQLite.query(db, "SELECT MAX(movieId) as max_item FROM $table")[1, :max_item]
    n::Int64 = @suppress_err SQLite.query(db, "SELECT MAX(userId) as max_user FROM $table")[1, :max_user]
    println("DONE")

    print("Sorting database...")
    @suppress_err SQLite.execute!(db, "CREATE TABLE item_sorted AS SELECT movieId as itemId, userId, rating as value FROM $table ORDER BY itemId ASC");
    @suppress_err SQLite.execute!(db, "CREATE TABLE user_sorted AS SELECT movieId as itemId, userId, rating as value FROM $table ORDER BY userId ASC");
    println("DONE")

    h5file::String = tempname()
    println(h5file)
    try
        print("Creating HDF5 latent factors...")
        create_h5_file(h5file)
        initialize_h5_dataset(h5file, "P", (m, k+1), chunksize=1000)
        initialize_h5_dataset(h5file, "Q", (n, k+1), chunksize=100)
        println("DONE")

        print("Creating jobs channel...")
        jobs = Channel{Channel}(0)
        println("DONE")

        print("Creating results channel...")
        results = Channel{NamedTuple{(:id, :factor), Tuple{Int64, Vector{Float64}}}}(0)
        println("DONE")

        # Writer: stores every fitted user factor in its row of Q.
        writer = @async open_h5_dataset(h5file, "Q") do Q
            for r in results
                Q[r.id,:] = r.factor
                IJulia.clear_output(true)
                println("user $(r.id) processed.")
            end
        end
        # If the writer dies, close `results` with its error so `put!` below cannot block forever.
        bind(results, writer)

        # Producer: pages through the user-sorted ratings and opens one channel per user.
        producer = @async open_h5_dataset(h5file, "P") do P
            limit = 1000
            offset = 0
            current_user = nothing
            user_chan = nothing
            try
                while true
                    df = @suppress_err SQLite.query(db, "SELECT itemId, userId, value FROM user_sorted LIMIT $limit OFFSET $offset")

                    for r in DataFrames.eachrow(df)
                        if r.userId != current_user
                            current_user = r.userId

                            # A new user starts: finish the previous user's stream first.
                            if user_chan !== nothing
                                close(user_chan)
                            end

                            user_chan = Channel{NamedTuple{(:rating, :factor),Tuple{Rating, Vector{Float64}}}}(0)
                            put!(jobs, user_chan)
                        end

                        data = (rating=Rating(r.itemId, r.userId, r.value - μ - P[r.itemId, 1]), factor=[1.0; P[r.itemId, 2:end]])
                        put!(user_chan, data)
                    end

                    # A short page means the table is exhausted.
                    if size(df, 1) < limit
                        break
                    end

                    offset += limit
                end
            finally
                # Always release the consumer, even when the producer fails half-way.
                if user_chan !== nothing
                    close(user_chan)
                end
                close(jobs)
            end
        end
        # Covers failures before the producer body runs (e.g. the HDF5 file cannot be opened).
        bind(jobs, producer)

        try
            for chan in jobs
                id = nothing
                o = LinReg()
                for data in chan
                    id = data.rating.userId
                    fit!(o, (data.factor, data.rating.value))
                end
                result = coef(o, reg)
                put!(results, (id=id, factor=result))
            end
        finally
            # Lets the writer loop terminate and release its file handle.
            close(results)
        end

        # Wait for both tasks so the file is not deleted under a pending write and so
        # their exceptions reach the caller.
        wait(producer)
        wait(writer)
    finally
        rm(h5file; force=true)
    end
    return nothing
end

# Load the CSV into a temporary SQLite database and run the factorisation with k = 100.
csv2db((db, table) -> alsbiased2(db, table, 100), csv)

In [None]:
# Start 4 additional worker processes for this cell.
addprocs(4)

"""
    alsbiased3(db, table, k=10; nepochs=10, reg=0.0001, cb=nothing)

Run one user-factor pass of a biased ALS-style factorisation over the ratings stored in
`table` of `db` (columns `movieId`, `userId`, `rating`), streaming the data through channels.
All work runs as tasks in the calling process; nothing is dispatched to worker processes.

Steps:
1. Query the global mean rating `μ`, the largest `movieId` (`m`) and the largest `userId` (`n`).
2. Materialise the tables `item_sorted` and `user_sorted` in `db`.
3. Create a temporary HDF5 file with datasets `P` (`m × (k+1)`) and `Q` (`n × (k+1)`).
4. A producer task pages through `user_sorted` and, for every user, streams
   `(rating - μ - P[item, 1], [1.0; P[item, 2:end]])` pairs through a per-user channel.
5. The calling task fits a `LinReg` per user and sends `coef(o, reg)` to a writer task,
   which stores it in row `userId` of `Q`.

# Arguments
- `db`, `table`: SQLite database and name of the ratings table.
- `k`: number of latent factors (the stored rows have `k + 1` entries).

# Keywords
- `reg`: regularisation value passed to `coef`.
- `nepochs`, `cb`: accepted but currently unused.

Returns `nothing`; the temporary HDF5 file is deleted before returning, so the fitted
factors are not persisted. Errors raised inside the producer or writer task are rethrown
to the caller.
"""
function alsbiased3(db::SQLite.DB, table::String, k::Int64=10;
        nepochs::Int64=10,
        reg::Float64=0.0001,
        cb::Union{Nothing, Function}=nothing)

    print("Computing metrics...")
    μ::Float64 = @suppress_err SQLite.query(db, "SELECT AVG(rating) as mean FROM $table")[1, :mean]
    m::Int64 = @suppress_err SQLite.query(db, "SELECT MAX(movieId) as max_item FROM $table")[1, :max_item]
    n::Int64 = @suppress_err SQLite.query(db, "SELECT MAX(userId) as max_user FROM $table")[1, :max_user]
    println("DONE")

    print("Sorting database...")
    @suppress_err SQLite.execute!(db, "CREATE TABLE item_sorted AS SELECT movieId as itemId, userId, rating as value FROM $table ORDER BY itemId ASC");
    @suppress_err SQLite.execute!(db, "CREATE TABLE user_sorted AS SELECT movieId as itemId, userId, rating as value FROM $table ORDER BY userId ASC");
    println("DONE")

    h5file::String = tempname()
    println(h5file)
    try
        print("Creating HDF5 latent factors...")
        create_h5_file(h5file)
        initialize_h5_dataset(h5file, "P", (m, k+1), chunksize=1000)
        initialize_h5_dataset(h5file, "Q", (n, k+1), chunksize=100)
        println("DONE")

        print("Creating jobs channel...")
        jobs = Channel{Channel}(0)
        println("DONE")

        print("Creating results channel...")
        results = Channel{NamedTuple{(:id, :factor), Tuple{Int64, Vector{Float64}}}}(0)
        println("DONE")

        # Writer: stores every fitted user factor in its row of Q.
        writer = @async open_h5_dataset(h5file, "Q") do Q
            for r in results
                Q[r.id,:] = r.factor
                IJulia.clear_output(true)
                println("user $(r.id) processed.")
            end
        end
        # If the writer dies, close `results` with its error so `put!` below cannot block forever.
        bind(results, writer)

        # Producer: pages through the user-sorted ratings and opens one channel per user.
        producer = @async open_h5_dataset(h5file, "P") do P
            limit = 1000
            offset = 0
            current_user = nothing
            user_chan = nothing
            try
                while true
                    df = @suppress_err SQLite.query(db, "SELECT itemId, userId, value FROM user_sorted LIMIT $limit OFFSET $offset")

                    for r in DataFrames.eachrow(df)
                        if r.userId != current_user
                            current_user = r.userId

                            # A new user starts: finish the previous user's stream first.
                            if user_chan !== nothing
                                close(user_chan)
                            end

                            user_chan = Channel{NamedTuple{(:rating, :factor),Tuple{Rating, Vector{Float64}}}}(0)
                            put!(jobs, user_chan)
                        end

                        data = (rating=Rating(r.itemId, r.userId, r.value - μ - P[r.itemId, 1]), factor=[1.0; P[r.itemId, 2:end]])
                        put!(user_chan, data)
                    end

                    # A short page means the table is exhausted.
                    if size(df, 1) < limit
                        break
                    end

                    offset += limit
                end
            finally
                # Always release the consumer, even when the producer fails half-way.
                if user_chan !== nothing
                    close(user_chan)
                end
                close(jobs)
            end
        end
        # Covers failures before the producer body runs (e.g. the HDF5 file cannot be opened).
        bind(jobs, producer)

        try
            for chan in jobs
                id = nothing
                o = LinReg()
                for data in chan
                    id = data.rating.userId
                    fit!(o, (data.factor, data.rating.value))
                end
                result = coef(o, reg)
                put!(results, (id=id, factor=result))
            end
        finally
            # Lets the writer loop terminate and release its file handle.
            close(results)
        end

        # Wait for both tasks so the file is not deleted under a pending write and so
        # their exceptions reach the caller.
        wait(producer)
        wait(writer)
    finally
        rm(h5file; force=true)
    end
    return nothing
end

# Run the `alsbiased3` defined in this cell against a temporary SQLite copy of the ratings.
csv2db(csv) do db, table
    alsbiased3(db, table, 100)
end

# `rmprocs` takes worker ids; `nworkers()` is only their count, so pass `workers()`.
rmprocs(workers())

In [None]:
# Start 4 worker processes and show their ids.
addprocs(4)

println(workers())

# Load the array and linear-algebra packages on every process.
@everywhere using SparseArrays, SharedArrays, LinearAlgebra

# Cost reported by the callback after each epoch is collected here.
costs = []
csv = CSV.File(file, use_mmap=true);
# Lazy generator turning each CSV row into a `Rating(itemId, userId, value)`.
ratings = (Rating(r.movieId, r.userId, r.rating) for r in csv);

# NOTE(review): this calls `alsbiased2(ratings, 100; ...)` and destructures a `(P, Q)` result,
# but the only `alsbiased2` method visible in this file takes `(db::SQLite.DB, table::String, k)`,
# never invokes `cb`, and ends with `rm(h5file)` — presumably this cell targets a different
# definition; confirm before running.
@time P, Q = alsbiased2(ratings, 100;
            nepochs=10,
            reg=0.001,
            cb=(epoch, cost)->begin
                println("epoch: $(epoch), cost: $(cost)")
                push!(costs, cost)
            end)

# Shut down all worker processes started above.
rmprocs(workers());