In [1]:
// Swift package installation must go in the first cell.
// Set install-location so that all notebooks in this directory
// will share packages without having to recompile them.
// I think this only needs to be run once per host, then in the future
// you can skip this and go to cell 2 which imports things that have been precompiled.
%install-location $cwd/swift-jupyter-install-location
%install '.package(url: "https://github.com/mxcl/Path.swift", from: "1.2.0")' Path
%install '.package(url: "https://github.com/swiftcsv/SwiftCSV", from: "0.5.6")' SwiftCSV

Installing packages:
	.package(url: "https://github.com/mxcl/Path.swift", from: "1.2.0")
		Path
	.package(url: "https://github.com/swiftcsv/SwiftCSV", from: "0.5.6")
		SwiftCSV
With SwiftPM flags: []
Working in: /tmp/tmptlwx94hl/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[2/3] Merging module jupyterInstalledPackages
Initializing Swift...
Installation complete!


In [2]:
// Imports and Jupyter boiler plate
import Foundation
import FoundationNetworking
import Path
import PythonKit
import SwiftCSV
import TensorFlow

// This cell is here to display the plots in a Jupyter Notebook.
// Do not copy it into another environment.
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

let plt = Python.import("matplotlib.pyplot")
let tarfile = Python.import("tarfile")
let zipfile = Python.import("zipfile")

In [3]:
// TODO: move this to a library or use SwiftAI
func download(from source: URL, to destination: Path, force: Bool = false) {
    if (destination.exists && !force) {
        return
    }
    let data = try! Data.init(contentsOf: source)
    if (!destination.parent.exists) {
        try! destination.parent.mkdir(.p)
    }
    try! data.write(to: destination)
}

In [4]:
let dataPath = Path.cwd/"data"
// Full data set: "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
// Sample data set: "http://files.fast.ai/data/examples/movie_lens_sample.tgz"
let movieLensURL = URL(string: "http://files.grouplens.org/datasets/movielens/ml-100k.zip")!
let downloadedArchive = dataPath/movieLensURL.lastPathComponent
download(from: movieLensURL, to: downloadedArchive)
print(downloadedArchive)

/home/garymm_gmail_com/src/fastai/course-v3/nbs/dl1-swift/data/ml-100k.zip


In [5]:
var archiveFile : PythonObject = 1
switch downloadedArchive.extension {
    case "tgz":
        archiveFile = tarfile.open(downloadedArchive.string)
    case "zip":
        archiveFile = zipfile.ZipFile(downloadedArchive.string)
    default:
        print("Unknown extension:", downloadedArchive.extension)
}
archiveFile.extractall(path: dataPath.string)
archiveFile.close()

// See extractedDir/README for details on the files and their contents.
let userIdColIndex = 0
let movieIdColIndex = 1
let ratingColIndex = 2

let extractedDir = dataPath/downloadedArchive.basename(dropExtension: true)
let trainTSV: CSV = CSV(url: URL(fileURLWithPath: (extractedDir/"ua.base").string), delimiter: "\t", loadColumns: false)
let validTSV: CSV = CSV(url: URL(fileURLWithPath: (extractedDir/"ua.test").string), delimiter: "\t", loadColumns: false)
print("loaded \(trainTSV.namedRows.count) training rows, \(validTSV.namedRows.count) validation rows")

loaded 90569 training rows, 9429 validation rows


In [6]:
/// Examples from the collaborative filtering dataset.
struct CollabData {
    // [batchSize, featureCount] tensor of features.
    let features: Tensor<Int32>

    // [batchSize] tensor of labels.
    let labels: Tensor<Float>
}

/// Conform `CollabData` to `Collatable` so that we can load it into a `TrainingEpoch`.
extension CollabData: Collatable {
    public init<BatchSamples: Collection>(collating samples: BatchSamples)
        where BatchSamples.Element == Self {
        // `CollabData`s are collated by stacking their feature and label tensors
        // along the batch axis to produce a single feature and label tensor
        features = Tensor<Int32>(stacking: samples.map{$0.features})
        labels = Tensor<Float>(stacking: samples.map{$0.labels})
    }
}

In [7]:
var trainRatingsArray: [Float] = [], validRatingsArray: [Float] = []
// UInt16 would be more efficient for this application, but Embedding takes only Int32.
var trainIdsArray: [[Int32]] = [], validIdsArray: [[Int32]] = []

var maxUserId: Int32 = 0
var maxMovieId: Int32 = 0

// TODO: this boolean is ugly. Is there a better way to do this?
func addToArray(row: [String], isTrain: Bool) {
    if row.count < ratingColIndex + 1 {
        return
    }
    guard let userId = Int32(row[userIdColIndex]) else {
        print("Failed to convert element \(userIdColIndex) of row to Int32: \(row)")
        return
    }
    guard let movieId = Int32(row[movieIdColIndex]) else {
        print("Failed to convert element \(movieIdColIndex) of row to Int32: \(row)")
        return
    }
    
    guard let rating = Float(row[ratingColIndex]) else {
        print("Failed to convert to Float: \(row[ratingColIndex])")
        return
    }
    maxUserId = max(maxUserId, userId)
    maxMovieId = max(maxMovieId, movieId)
    if isTrain {
        trainIdsArray.append([userId, movieId])
        trainRatingsArray.append(rating)
    } else {
        validIdsArray.append([userId, movieId])
        validRatingsArray.append(rating)
    }
}

try trainTSV.enumerateAsArray { addToArray(row: $0, isTrain: true) }
try validTSV.enumerateAsArray { addToArray(row: $0, isTrain: false) }

In [8]:
// trainDataSet is an array of CollabData, one per example.
let trainDataset = zip(trainIdsArray, trainRatingsArray).map {
    CollabData(features: Tensor<Int32>($0.0), labels: Tensor<Float>($0.1))
}
let validDataset = zip(validIdsArray, validRatingsArray).map {
    CollabData(features: Tensor<Int32>($0.0), labels: Tensor<Float>($0.1))
}

print("traingDataset:")
for i in 0 ... 5 {
    print("Row \(i):", trainDataset[i].features, trainDataset[i].labels)
}

let batchSize = 64
let trainEpochs = TrainingEpochs(samples: trainDataset, batchSize: batchSize)
let validEpochs = TrainingEpochs(samples: validDataset, batchSize: batchSize)

let firstTrainEpoch = trainEpochs.next()!
let firstTrainBatch = firstTrainEpoch.first!.collated
let firstTrainFeatures = firstTrainBatch.features
let firstTrainLabels = firstTrainBatch.labels

print("First batch of features: \(firstTrainFeatures)")
print("firstTrainFeatures.shape: \(firstTrainFeatures.shape)")
print("First batch of labels: \(firstTrainLabels)")
print("firstTrainLabels.shape: \(firstTrainLabels.shape)")

traingDataset:
Row 0: [1, 1] 5.0
Row 1: [1, 2] 3.0
Row 2: [1, 3] 4.0
Row 3: [1, 4] 3.0
Row 4: [1, 5] 3.0
Row 5: [1, 6] 5.0
First batch of features: [[ 785,  748],
 [ 594,   19],
 [ 661,  357],
 [ 271,   88],
 [ 184,  161],
 [   7,   69],
 [ 577,  549],
 [ 727,  635],
 [ 851,  109],
 [ 391,  288],
 [ 305,   11],
 [ 497,   73],
 [ 647,   82],
 [ 735,  332],
 [ 524,  928],
 [ 452,  154],
 [ 134,  294],
 [ 913,  210],
 [ 503,  385],
 [ 463,  887],
 [ 833,   28],
 [ 234, 1269],
 [ 757,  231],
 [ 350,  515],
 [   1,  254],
 [ 197,  550],
 [ 645,   96],
 [ 566,   97],
 [ 119,  182],
 [ 429,  629],
 [ 104,  222],
 [ 342,   25],
 [ 363,  423],
 [ 291,  393],
 [  18,  425],
 [ 809,  245],
 [ 407,  519],
 [ 346,   55],
 [ 306, 1514],
 [ 537,  745],
 [  82,  230],
 [ 833,  249],
 [   2,  303],
 [ 708,  756],
 [ 332,  682],
 [ 509,  294],
 [ 119,  931],
 [ 423,  924],
 [ 881,  472],
 [ 318, 1014],
 [ 808,  751],
 [ 249,  483],
 [ 585,  170],
 [ 280,  155],
 [ 545,  434],
 [ 345,    4],
 [  76,  513

In [9]:
struct EmbeddingDotBias: Module {
    var uWeight, iWeight, uBias, iBias: Embedding<Float>
    
    @noDerivative
    let yRange: (Float, Float)?
    
    init(nFactors: Int, nUsers: Int, nItems: Int, yRange: (Float, Float)?) {
        self.uWeight = Embedding<Float>(vocabularySize: nUsers, embeddingSize: nFactors)
        self.iWeight = Embedding<Float>(vocabularySize: nItems, embeddingSize: nFactors)
        self.uBias = Embedding<Float>(vocabularySize: nUsers, embeddingSize: 1)
        self.iBias = Embedding<Float>(vocabularySize: nItems, embeddingSize: 1)
        self.yRange = yRange
    }
    
    @differentiable
    func callAsFunction(_ input: Tensor<Int32>) -> Tensor<Float> {
        let inputParts = input.split(count: 2, alongAxis: 1)
        let users = inputParts[userIdColIndex].squeezingShape()
        let items = inputParts[movieIdColIndex].squeezingShape()
        let dotProds = (uWeight(users) * iWeight(items)).sum(alongAxes: 1).squeezingShape()
        let res = dotProds + uBias(users).squeezingShape() + iBias(items).squeezingShape()
        
        guard let (yMin, yMax) = yRange else {
            return res
        }
        return sigmoid(res) * (yMax - yMin) + yMin
    }
}

In [10]:
// Set the yRange slightly larger than the actual range of ratings because sigmoid approaches
// the limits asymptotically.
var model = EmbeddingDotBias(nFactors: 16, nUsers: Int(maxUserId), nItems: Int(maxMovieId), yRange: (-0.25, 5.25))
let optimizer = Adam(for: model, learningRate: 0.01)

In [20]:
let epochCount = 1
var trainLosses: [Tensor<Float>] = []
var validLosses: [Tensor<Float>] = []
var trainBatchCount, validBatchCount: Int

In [21]:
for (epochIndex, epoch) in trainEpochs.prefix(epochCount).enumerated() {
    trainBatchCount = 0; validBatchCount = 0
    for batchSamples in epoch {
        let batch = batchSamples.collated
        let (loss, grad) = valueWithGradient(at: model) { (model: EmbeddingDotBias) -> Tensor<Float> in
            return meanSquaredError(predicted: model(batch.features), expected: batch.labels)
        }
        optimizer.update(&model, along: grad)        
        trainLosses.append(loss)
        trainBatchCount += 1
    }
    let validSamples = validEpochs.next()!
    for batchSamples in validSamples {
        let batch = batchSamples.collated
        let loss = meanSquaredError(predicted: model(batch.features), expected: batch.labels)
        validLosses.append(loss)
        validBatchCount += 1
    }
    print("Done with epoch:", epochIndex)
}

Done with epoch: 0


In [25]:
var trainLossesPerEpoch: [Float] = []
var validLossesPerEpoch: [Float] = []
for i in 0..<epochCount {
    var trainLoss: Float = 0
    var validLoss: Float = 0
    for b in 0..<trainBatchCount {
        trainLoss += trainLosses[b].scalarized()
    }
    for b in 0..<validBatchCount {
        validLoss += validLosses[b].scalarized()
    }
    trainLossesPerEpoch.append(trainLoss / Float(trainBatchCount))
    validLossesPerEpoch.append(trainLoss / Float(validBatchCount))
}
print("trainLossesPerEpoch:", trainLossesPerEpoch)
print("validLossesPerEpoch:", validLossesPerEpoch)

trainLossesPerEpoch: [0.82688904]
validLossesPerEpoch: [7.95951]


In [None]:
let validDataset = zip(validIdsArray, validRatingsArray).map {
    CollabData(features: Tensor<Int32>($0.0), labels: Tensor<Float>($0.1))
}
print("validation dataset size:", validDataset.count)

In [None]:
// TODO: fastai has a nice way of computing the loss on the entire validation set.
// Figure out how that's done rather than re-inventing mean squared error here.
var squaredError: Double = 0

for batchSamples in validDataset.inBatches(of: batchSize) {
    let batch = batchSamples.collated
    squaredError += Double(squaredDifference(model(batch.features), batch.labels).sum().scalarized())
}

let meanSquaredError = squaredError / Double(validDataset.count)
print("meanSquaredError on validation set:", meanSquaredError)

In [None]:
// TODO:
// * Try this out on the full data set and see if performance is better
// * Get the movie names and inspect the movie biases
// * Visualize embeddings with principle component analysis