# FFM Model

## Imports

In [1]:
import random
import pandas as pd
import numpy as np
from hashlib import sha256

from pyspark.sql import types
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
import numpy as np
import matplotlib.pyplot as plt

In [25]:
from pyspark.sql import SparkSession

app_name = "final_project_notebook"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

PWD = !pwd
PWD = PWD[0]

## Generate a small sample to work with

In [78]:
%%writefile data/sample.txt
1	10	ESPN	Nike
1	15	ESPN	Nike
0	2	ESPN	Gucci
1	10	ESPN	Adidas
1	10	ESPN	Adidas
0	3	Vogue	Nike
1	20	Vogue	Gucci
0	5	Vogue	Adidas
1	50	NBC	Nike
0	0	NBC	Gucci
0	4	NBC	Adidas
0	4	NBC	Adidas

Overwriting data/sample.txt


In [81]:
train_data = spark.read.csv(f"{PWD}/data/sample.txt", sep="\t")
train_data.write.format("parquet").save(f"{PWD}/data/sample.parquet")

In [4]:
sample_RDD = sc.textFile('sample.txt')
split_RDD = sample_RDD.map(lambda line: line.split('\t')).cache()

In [5]:
sample_df = split_RDD.toDF()
sample_df.show()

+---+---+-----+------+
| _1| _2|   _3|    _4|
+---+---+-----+------+
|  1| 10| ESPN|  Nike|
|  1| 15| ESPN|  Nike|
|  0|  2| ESPN| Gucci|
|  1| 10| ESPN|Adidas|
|  1| 10| ESPN|Adidas|
|  0|  3|Vogue|  Nike|
|  1| 20|Vogue| Gucci|
|  0|  5|Vogue|Adidas|
|  1| 50|  NBC|  Nike|
|  0|  0|  NBC| Gucci|
|  0|  4|  NBC|Adidas|
|  0|  4|  NBC|Adidas|
+---+---+-----+------+



## Feature Hashing

In [5]:
# train_parquet = spark.read.parquet(f"{PWD}/data/smallTrain.parquet")
train_parquet = spark.read.parquet(f"{PWD}/data/sample.parquet")

In [6]:
train_parquet.show()

+---+---+-----+------+
|_c0|_c1|  _c2|   _c3|
+---+---+-----+------+
|  1| 10| ESPN|  Nike|
|  1| 15| ESPN|  Nike|
|  0|  2| ESPN| Gucci|
|  1| 10| ESPN|Adidas|
|  1| 10| ESPN|Adidas|
|  0|  3|Vogue|  Nike|
|  1| 20|Vogue| Gucci|
|  0|  5|Vogue|Adidas|
|  1| 50|  NBC|  Nike|
|  0|  0|  NBC| Gucci|
|  0|  4|  NBC|Adidas|
|  0|  4|  NBC|Adidas|
+---+---+-----+------+



In [7]:
#rename files and recast integer types on the first 13 features

oldColNames = train_parquet.schema.names
cate_field_start = 2
cate_field_end = 4


train_parquet = train_parquet.withColumn("label", train_parquet["_c0"])
for colNum in range(1,cate_field_start): 
    colName = "_c" + str(colNum)
    train_parquet = train_parquet.withColumn("int_feature_"+ str(colNum), train_parquet[colName].cast(types.IntegerType()))
for colNum in range(cate_field_start,cate_field_end): 
    colName = "_c" + str(colNum)
    train_parquet = train_parquet.withColumn("cate_feature_"+ str(colNum-cate_field_start+1), train_parquet[colName])

#drop the old columns
train_parquet = train_parquet.drop(*oldColNames)

In [8]:
train_parquet.show()

+-----+-------------+--------------+--------------+
|label|int_feature_1|cate_feature_1|cate_feature_2|
+-----+-------------+--------------+--------------+
|    1|           10|          ESPN|          Nike|
|    1|           15|          ESPN|          Nike|
|    0|            2|          ESPN|         Gucci|
|    1|           10|          ESPN|        Adidas|
|    1|           10|          ESPN|        Adidas|
|    0|            3|         Vogue|          Nike|
|    1|           20|         Vogue|         Gucci|
|    0|            5|         Vogue|        Adidas|
|    1|           50|           NBC|          Nike|
|    0|            0|           NBC|         Gucci|
|    0|            4|           NBC|        Adidas|
|    0|            4|           NBC|        Adidas|
+-----+-------------+--------------+--------------+



In [9]:
#record feature names by feature type
intFieldNames = [colName for colName, dType in train_parquet.dtypes if dType == 'int']
cateFieldNames = [colName for colName, dType in train_parquet.dtypes if dType == 'string' and colName != 'label']

In [10]:
n_features = 100
n_fields = 3

In [11]:
from pyspark.ml.feature import FeatureHasher
hasher = FeatureHasher()
hasher.setCategoricalCols(intFieldNames)
hasher.setNumFeatures(n_features)

for col in intFieldNames + cateFieldNames:
    hasher.setInputCols([col])
    hasher.setOutputCol(col+"_hashed")
    train_parquet = hasher.transform(train_parquet)
#     train_parquet.show()

In [12]:
train_parquet.show()

+-----+-------------+--------------+--------------+--------------------+---------------------+---------------------+
|label|int_feature_1|cate_feature_1|cate_feature_2|int_feature_1_hashed|cate_feature_1_hashed|cate_feature_2_hashed|
+-----+-------------+--------------+--------------+--------------------+---------------------+---------------------+
|    1|           10|          ESPN|          Nike|    (100,[24],[1.0])|      (100,[7],[1.0])|     (100,[36],[1.0])|
|    1|           15|          ESPN|          Nike|    (100,[83],[1.0])|      (100,[7],[1.0])|     (100,[36],[1.0])|
|    0|            2|          ESPN|         Gucci|    (100,[15],[1.0])|      (100,[7],[1.0])|     (100,[50],[1.0])|
|    1|           10|          ESPN|        Adidas|    (100,[24],[1.0])|      (100,[7],[1.0])|     (100,[15],[1.0])|
|    1|           10|          ESPN|        Adidas|    (100,[24],[1.0])|      (100,[7],[1.0])|     (100,[15],[1.0])|
|    0|            3|         Vogue|          Nike|    (100,[79]

In [13]:
def parse(row):
    print(row[0])
    output = [int(row[0])]
    hashed_values = row[-n_fields:]
    
    for field, h_val in enumerate(hashed_values):
        if h_val.indices.size > 0:
            output.append((field, h_val.indices[0]))
    
    return output

sample_hashed = train_parquet.rdd.map(parse).cache()

In [14]:
sample_hashed.take(1)

[[1, (0, 24), (1, 7), (2, 36)]]

## Model

In [31]:
def GDUpdate_wReg(dataRDD, W, learningRate = 0.1, regParam = 0.1):    
    
    
    def phi(x):
        total = 0
        for i in range(len(x) - 1):
            for j in range(i+1, len(x)):
                total += np.dot(w_old.value[x[i][1], x[j][0], :], w_old.value[x[j][1], x[i][0], :])

        return total

    def kappa(y, features):
        return -y/(1 + np.exp(y*phi(features)))


    def G_j_f(x):
        y = x[0]
        features = x[1:]
        k = kappa(y, features)
        for i in range(len(features) - 1):
            for j in range(i+1, len(features)):
                yield ((features[i][1], features[j][0]), (k * w_old.value[features[i][1], features[j][0], :], 1))
                yield ((features[j][1], features[i][0]), (k * w_old.value[features[j][1], features[i][0], :], 1))
    
    
    
    new_model = None
    
    w_old = sc.broadcast(W)
    gradWOReg = dataRDD.flatMap(G_j_f).reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1])).mapValues(lambda x: x[0]/x[1]).collect()
    
    grad = regParam * W
    
    for indices, vector in gradWOReg:
        feature_index = indices[0]
        field_index = indices[1]
        
        grad[feature_index, field_index, :] = grad[feature_index, field_index, :] + vector
    
    new_model = W - learningRate * grad
    
    return new_model            
            
def GradientDescent_wReg(trainRDD, wInit, nSteps = 20, learningRate = 0.1, regParam = 0.1):

    model = wInit
    for idx in range(nSteps):  
        # update the model
        model = GDUpdate_wReg(trainRDD, model, learningRate, regParam)
        
    return model
    

In [16]:
#set params
k = 3
n_features = n_features
n_fields = n_fields


In [30]:
np.random.seed(1)
wInit = np.random.uniform(0, 1/np.sqrt(k), size=(n_features, n_fields, k))
GradientDescent_wReg(sample_hashed, wInit, nSteps=2)

array([[[0.26068079, 0.01392432, 0.05496281],
        [0.05740934, 0.22024511, 0.40513716],
        [0.0299239 , 0.00706644, 0.0834475 ]],

       [[0.53002801, 0.4898857 , 0.22777021],
        [0.09127   , 0.20078123, 0.5289853 ],
        [0.36415896, 0.26655924, 0.22768423]],

       [[0.50732491, 0.55085028, 0.44062204],
        [0.47155648, 0.17376988, 0.53163636],
        [0.25616454, 0.1014621 , 0.33742419]],

       [[0.38710743, 0.24747567, 0.21126124],
        [0.39854429, 0.14697735, 0.38837724],
        [0.32532518, 0.18956745, 0.3829127 ]],

       [[0.02427307, 0.31055884, 0.33702649],
        [0.16252641, 0.20160592, 0.48834851],
        [0.35047784, 0.36211814, 0.05919528]],

       [[0.13462838, 0.40701285, 0.39580268],
        [0.2527259 , 0.02221133, 0.02851732],
        [0.05008071, 0.48908864, 0.07701083]],

       [[0.37248464, 0.32648264, 0.41115208],
        [0.329653  , 0.10152559, 0.41538928],
        [0.33693854, 0.30906475, 0.41145069]],

       [[0.18717215,