# FFM Model

## Imports

In [1]:
import random
import pandas as pd
import numpy as np
from hashlib import sha256

In [2]:
from pyspark.sql import SparkSession

app_name = "final_project_notebook"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

## Generate a small sample to work with

In [3]:
%%writefile sample.txt
1	10	ESPN	Nike
1	15	ESPN	Nike
0	2	ESPN	Gucci
1	10	ESPN	Adidas
1	10	ESPN	Adidas
0	3	Vogue	Nike
1	20	Vogue	Gucci
0	5	Vogue	Adidas
1	50	NBC	Nike
0	0	NBC	Gucci
0	4	NBC	Adidas
0	4	NBC	Adidas

Overwriting sample.txt


In [4]:
sample_RDD = sc.textFile('sample.txt')
split_RDD = sample_RDD.map(lambda line: line.split('\t')).cache()

In [5]:
sample_df = split_RDD.toDF()
sample_df.show()

+---+---+-----+------+
| _1| _2|   _3|    _4|
+---+---+-----+------+
|  1| 10| ESPN|  Nike|
|  1| 15| ESPN|  Nike|
|  0|  2| ESPN| Gucci|
|  1| 10| ESPN|Adidas|
|  1| 10| ESPN|Adidas|
|  0|  3|Vogue|  Nike|
|  1| 20|Vogue| Gucci|
|  0|  5|Vogue|Adidas|
|  1| 50|  NBC|  Nike|
|  0|  0|  NBC| Gucci|
|  0|  4|  NBC|Adidas|
|  0|  4|  NBC|Adidas|
+---+---+-----+------+



## Feature Hashing

In [6]:
def feature_hash(x, modulo=10**6):
    """
    Now we create a function that can be used to hash the features in each observation in the RDD. 
    We replace the label with 1, -1 and we hash all other features using sha256 
    and then we take modulo some power of 10. 
    """

    x[0] = 2*int(x[0]) - 1
    for i, value in enumerate(x[1:], 1):
        h = sha256("{i}-{val}".format(i=i,val=value).encode('ascii'))
        hashed_value = int(h.hexdigest(), base=16) 
        hashed_value_mod = hashed_value % modulo
        x[i] = hashed_value_mod
    return x

In [147]:
sample_hashed = split_RDD.map(lambda x: feature_hash(x, 25))

In [148]:
sample_hashed.collect()

[[1, 1, 16, 4],
 [1, 7, 16, 4],
 [-1, 2, 16, 18],
 [1, 1, 16, 11],
 [1, 1, 16, 11],
 [-1, 2, 9, 4],
 [1, 14, 9, 18],
 [-1, 12, 9, 11],
 [1, 18, 23, 4],
 [-1, 10, 23, 18],
 [-1, 22, 23, 11],
 [-1, 22, 23, 11]]

## Model

In [198]:
# class FFM:
#     def __init__(self, n_features, k = 10, eta = 0.1, reg_c = 0.1):
#         self.n_features = n_features
#         self.k = k
#         self.eta = eta
#         self.reg_c = reg_c
        
        
# ffm = FFM(25, k=3)
# ffm.n_features

In [199]:
def phi(x):
    return np.sum([np.dot(W[x[i], :], W[x[j], :]) for i in range(len(x) - 1) for j in range(i + 1, len(x))])

def kappa(y, features):
    return -y/(1 + np.exp(phi(features)))

def gradient(x):
    y = x[0]
    features = x[1:]
    k = kappa(y, features)
    
    gradients = np.zeros(shape=(25, 3))
    
    for i in range(len(features) - 1):
        for j in range(i+1, len(features)):
            gradients[features[i]] += k * W[features[j], :] #+ reg_c * W[features[i], :]
            
    for i in range(len(features) - 1):
        for j in range(i+1, len(features)):
            gradients[features[j]] += k * W[features[i], :] #+ reg_c * W[features[j], :]
            
    return gradients
    

In [200]:
# set the seed
np.random.seed(1)

# Initialize first random model
k = 3
n_features = 25
eta = 0.1
reg_c = 0.1
sc.broadcast(k)
sc.broadcast(n_features)
sc.broadcast(reg_c)
sc.broadcast(eta)

W = np.random.uniform(0, 1/np.sqrt(k), size=(n_features, k))

In [201]:
sample_hashed.take(1)

[[1, 1, 16, 4]]

In [202]:
wj0f1_wj1f0 = np.dot(W[1, :], W[16, :])
wj0f2_wj2f0 = np.dot(W[1, :], W[4, :])
wj1f2_wj2f1 = np.dot(W[16, :], W[4, :])
total = wj0f1_wj1f0 + wj0f2_wj2f0 + wj1f2_wj2f1
print(f"Expected value is the sum of these three: {total}")

Expected value is the sum of these three: 0.15821071145153956


In [203]:
sample_hashed.map(lambda x: phi(x[1:])).collect()[0]

0.15821071145153956

In [204]:
def log_loss(dataRDD, W):
    return dataRDD.map(lambda x: np.log(1 + np.exp(-x[0] * phi(x[1:])))).mean()

def gd_update(dataRDD, W):
    grad = dataRDD.map(lambda x: gradient(x)).mean()
    
    new_model = W - eta * grad
    return new_model

In [205]:
n_steps = 100
for i in range(n_steps):
    print("----------")
    print(f"STEP: {i+1}")
    W = gd_update(sample_hashed, W)
    loss = log_loss(sample_hashed, W)
    print(f"Loss: {loss}")
#     print(f"Model: {[round(w,3) for w in model]}")

----------
STEP: 1
Loss: 0.7952802105808737
----------
STEP: 2
Loss: 0.7897749304025715
----------
STEP: 3
Loss: 0.7843003785165544
----------
STEP: 4
Loss: 0.7788562566237199
----------
STEP: 5
Loss: 0.7734422786398728
----------
STEP: 6
Loss: 0.7680581706923254
----------
STEP: 7
Loss: 0.7627036711780439
----------
STEP: 8
Loss: 0.7573785308684284
----------
STEP: 9
Loss: 0.7520825130447141
----------
STEP: 10
Loss: 0.7468153936470999
----------
STEP: 11
Loss: 0.7415769614200466
----------
STEP: 12
Loss: 0.7363670180357952
----------
STEP: 13
Loss: 0.7311853781780426
----------
STEP: 14
Loss: 0.7260318695679198
----------
STEP: 15
Loss: 0.7209063329149344
----------
STEP: 16
Loss: 0.7158086217764081
----------
STEP: 17
Loss: 0.7107386023101187
----------
STEP: 18
Loss: 0.7056961529063783
----------
STEP: 19
Loss: 0.7006811636876074
----------
STEP: 20
Loss: 0.6956935358655668
----------
STEP: 21
Loss: 0.6907331809487837
----------
STEP: 22
Loss: 0.6858000197952805
----------
STEP: 23