# FFM Model

## Imports

In [1]:
import random
import pandas as pd
import numpy as np
from hashlib import sha256

from pyspark.sql import types
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

import time
import numpy as np
import matplotlib.pyplot as plt

# set the seed
np.random.seed(1)

In [38]:
import pyspark
# sc = pyspark.SparkContext()
train_parquet = pyspark.read.parquet("data/smallTrain.parquet")

AttributeError: module 'pyspark' has no attribute 'read'

In [2]:
from pyspark.sql import SparkSession

app_name = "final_project_notebook"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [3]:
PWD = !pwd
PWD = PWD[0]

## Generate a small sample to work with

In [4]:
%%writefile sample.txt
1	10	ESPN	Nike
1	15	ESPN	Nike
0	2	ESPN	Gucci
1	10	ESPN	Adidas
1	10	ESPN	Adidas
0	3	Vogue	Nike
1	20	Vogue	Gucci
0	5	Vogue	Adidas
1	50	NBC	Nike
0	0	NBC	Gucci
0	4	NBC	Adidas
0	4	NBC	Adidas

Overwriting sample.txt


In [5]:
sample_RDD = sc.textFile('sample.txt')
split_RDD = sample_RDD.map(lambda line: line.split('\t')).cache()

In [6]:
sample_df = split_RDD.toDF()
sample_df.show()

+---+---+-----+------+
| _1| _2|   _3|    _4|
+---+---+-----+------+
|  1| 10| ESPN|  Nike|
|  1| 15| ESPN|  Nike|
|  0|  2| ESPN| Gucci|
|  1| 10| ESPN|Adidas|
|  1| 10| ESPN|Adidas|
|  0|  3|Vogue|  Nike|
|  1| 20|Vogue| Gucci|
|  0|  5|Vogue|Adidas|
|  1| 50|  NBC|  Nike|
|  0|  0|  NBC| Gucci|
|  0|  4|  NBC|Adidas|
|  0|  4|  NBC|Adidas|
+---+---+-----+------+



# Load Dataset

## 1. Only run this if you haven't generated a train.parquet file

In [23]:
train_data = spark.read.csv(f"{PWD}/data/train.txt", sep="\t")
train_data.write.format("parquet").save(f"{PWD}/data/train.parquet")

AnalysisException: 'path file:/media/notebooks/f19-final-project-f19-team-15/data/train.parquet already exists.;'

## 2. Adjust this value to match the desired level of data to work with

In [8]:
# select which data to load:
# 1->sample.parquet
# 2->smallTrain.parquet
# 3->mediumTrain.parquet
# 4->train.parquet (full dataset)

DATA_TO_LOAD = 2

In [9]:
if DATA_TO_LOAD == 1:
    train_parquet = spark.read.parquet(f"{PWD}/data/sample.parquet")
    cate_field_start = 2
    cate_field_end = 4
else:
    if DATA_TO_LOAD == 2:
        train_parquet = spark.read.parquet(f"{PWD}/data/smallTrain.parquet")
    elif DATA_TO_LOAD == 3:
        train_parquet = spark.read.parquet(f"{PWD}/data/mediumTrain.parquet")
    else:
#         train_parquet = spark.read.parquet(f"{PWD}/data/train.parquet")
        train_parquet = spark.read.parquet(f"{PWD}/data/train.parquet")
    cate_field_start = 14
    cate_field_end = 40

In [10]:
#rename files and recast integer types on the numeric features

oldColNames = train_parquet.schema.names

train_parquet = train_parquet.withColumn("label", train_parquet["_c0"])
for colNum in range(1,cate_field_start): 
    colName = "_c" + str(colNum)
    train_parquet = train_parquet.withColumn("int_feature_"+ str(colNum), train_parquet[colName].cast(types.IntegerType()))
for colNum in range(cate_field_start,cate_field_end): 
    colName = "_c" + str(colNum)
    train_parquet = train_parquet.withColumn("cate_feature_"+ str(colNum-cate_field_start+1), train_parquet[colName])

#drop the old columns
adjusted_labels_train_parquet = train_parquet.drop(*oldColNames)

In [11]:
intFieldNames = [colName for colName, dType in adjusted_labels_train_parquet.dtypes if dType == 'int']
cateFieldNames = [colName for colName, dType in adjusted_labels_train_parquet.dtypes if dType == 'string' and colName != 'label']

In [12]:
adjusted_labels_train_parquet.show(1)

+-----+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+
|label|int_feature_1|int_feature_2|int_feature_3|int_feature_4|int_feature_5|int_feature_6|int_feature_7|int_feature_8|int_feature_9|int_feature_10|int_feature_11|int_feature_12|int_feature_13|cate_feature_1|cate_feature_2|cate_feature_3|cate_feature_4|cate_feature_5|cate_feature_6|cate_feature_7|cate_feature_8|cate_feature_9|cate_feature_10|cate_feature_11|cate_feature_12|cate_feature_13|cate_fe

# Feature Engineering
## Categorical Variables

In [13]:
threshold = 10

train_parquet_MD = adjusted_labels_train_parquet

for col in cateFieldNames:
    valuesToKeep = adjusted_labels_train_parquet.groupBy(col).count().filter(f"count >= {threshold}").select(col)
    valuesToKeep = valuesToKeep.withColumn("_"+col, adjusted_labels_train_parquet[col])
    valuesToKeep = valuesToKeep.drop(col)

    train_parquet_MD = train_parquet_MD.join(F.broadcast(valuesToKeep), train_parquet_MD[col] == valuesToKeep["_"+col], 'leftouter')
    train_parquet_MD = train_parquet_MD.withColumn(col, F.when(F.col("_"+col).isNull(), "***").otherwise(F.col("_"+col)))
    train_parquet_MD = train_parquet_MD.drop("_"+col)

In [14]:
# view data after the replacement
start = time.time()
train_parquet_reduced_dimensions = train_parquet_MD
train_parquet_reduced_dimensions.show(5)
print(f'categorical columns processed in {time.time() - start} seconds.')

+-----+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+
|label|int_feature_1|int_feature_2|int_feature_3|int_feature_4|int_feature_5|int_feature_6|int_feature_7|int_feature_8|int_feature_9|int_feature_10|int_feature_11|int_feature_12|int_feature_13|cate_feature_1|cate_feature_2|cate_feature_3|cate_feature_4|cate_feature_5|cate_feature_6|cate_feature_7|cate_feature_8|cate_feature_9|cate_feature_10|cate_feature_11|cate_feature_12|cate_feature_13|cate_fe

## Numeric Variables

In [15]:
for col in intFieldNames:
    train_parquet_reduced_dimensions = train_parquet_reduced_dimensions.withColumn(col, F.floor(F.log(F.col(col) + 1)))

In [16]:
start = time.time()
train_parquet_reduced_dimensions.show()
print(f'... completed job in {time.time() - start} seconds.')

+-----+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+
|label|int_feature_1|int_feature_2|int_feature_3|int_feature_4|int_feature_5|int_feature_6|int_feature_7|int_feature_8|int_feature_9|int_feature_10|int_feature_11|int_feature_12|int_feature_13|cate_feature_1|cate_feature_2|cate_feature_3|cate_feature_4|cate_feature_5|cate_feature_6|cate_feature_7|cate_feature_8|cate_feature_9|cate_feature_10|cate_feature_11|cate_feature_12|cate_feature_13|cate_fe

## Feature Hashing

In [17]:
n_features = 50000
n_fields = len(intFieldNames) + len(cateFieldNames)

In [18]:
from pyspark.ml.feature import FeatureHasher
hasher = FeatureHasher()
hasher.setCategoricalCols(intFieldNames)
hasher.setNumFeatures(n_features)

for col in intFieldNames + cateFieldNames:
    hasher.setInputCols([col])
    hasher.setOutputCol(col+"_hashed")
    train_parquet_reduced_dimensions = hasher.transform(train_parquet_reduced_dimensions)

In [19]:
start = time.time()
train_parquet_reduced_dimensions.show(2)
print(f'... completed job in {time.time() - start} seconds.')

+-----+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------

In [20]:
hashed_columns = train_parquet_reduced_dimensions.schema.names[-n_fields:]

## Adjust the dataframe to only contain the hashed value

In [21]:
def parse_sparse_vectors(vector, field_ind):
    if vector.indices.size > 0:
        return int(vector.indices[0])
    else:
        return None

vector_parser = F.udf(parse_sparse_vectors, types.IntegerType())

In [22]:
train_parquet_hashed = train_parquet_reduced_dimensions
for field_ind, col in enumerate(hashed_columns):
    
    train_parquet_hashed = train_parquet_hashed.withColumn(col, vector_parser(col, F.lit(field_ind)))

train_parquet_hashed = train_parquet_hashed.drop(*(intFieldNames + cateFieldNames))

In [23]:
start = time.time()
train_parquet_hashed.show(1)
print(f'... completed job in {time.time() - start} seconds.')

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+
|label|int_feature_1_hashed|int_feature_2_hashed|int_feature_3_hashed|int_feature_4_hashed|int_feature_5_hashed|int_feature_6_

In [24]:
# Change labels to be -1 and 1
train_parquet_hashed = train_parquet_hashed.withColumn("label", F.when(F.col("label") == 0, -1).otherwise(F.col("label")))

In [135]:
train_parquet_hashed.take(1)

[Row(label='-1', int_feature_1_hashed=None, int_feature_2_hashed=170820, int_feature_3_hashed=None, int_feature_4_hashed=100679, int_feature_5_hashed=127024, int_feature_6_hashed=None, int_feature_7_hashed=42737, int_feature_8_hashed=59529, int_feature_9_hashed=81135, int_feature_10_hashed=None, int_feature_11_hashed=128649, int_feature_12_hashed=187498, int_feature_13_hashed=52881, cate_feature_1_hashed=188201, cate_feature_2_hashed=191359, cate_feature_3_hashed=169906, cate_feature_4_hashed=199376, cate_feature_5_hashed=184640, cate_feature_6_hashed=175660, cate_feature_7_hashed=85091, cate_feature_8_hashed=27171, cate_feature_9_hashed=86563, cate_feature_10_hashed=69215, cate_feature_11_hashed=128094, cate_feature_12_hashed=187894, cate_feature_13_hashed=71219, cate_feature_14_hashed=111095, cate_feature_15_hashed=140079, cate_feature_16_hashed=197102, cate_feature_17_hashed=20385, cate_feature_18_hashed=90672, cate_feature_19_hashed=172179, cate_feature_20_hashed=175247, cate_featu

## Model

### FFM
Mathematically, FMM can be expressed as:

$ \phi_{FFM}(w, x) = \sum\limits^{n}_{j_1=1} \sum\limits^{n}_{j_2=j_1+1}(w_{j_1} \cdot w_{j_2})x_{j_1}x_{j_2}$

However, in the models considered, either all variables are categorical or all integer values are binned effectively making them categorical and the $x_{j_1}$ and $x_{j_2}$ are both equal to 1. This reduces the formula to:

$ \phi_{FFM}(w, x) = \sum\limits^{n}_{j_1=1} \sum\limits^{n}_{j_2=j_1+1}(w_{j_1} \cdot w_{j_2})$

The optimization function considered for this model is log loss with regularization and the following formula is to be minimized.

$\underset{w}{min}$   $\dfrac{\lambda}{2}||w||_2^2 + \sum\limits^{m}_{i=1}log(1 + exp(-y_i\phi_{FFM}(w,x_i)))$

Currently, a closed-form solution for minimizing log loss is not known and therefore gradient descent is applied. The gradients for $\phi_{FFM}(w, x)$ are:

$g_{j_1,f_2} = \triangledown_{w_{j_1,f_2}} f(w) = \lambda \cdot w_{j_1,f_2} + \kappa \cdot w_{j_2,f_1}$

$g_{j_2,f_1} = \triangledown_{w_{j_2,f_1}} f(w) = \lambda \cdot w_{j_2,f_1} + \kappa \cdot w_{j_1,f_2}$

where,

$\kappa = \dfrac{\partial log(1 + exp(-y\phi_{FFM}(w,x)))}{\partial \phi_{FFM}(w, x)} = \dfrac{-y}{1 + exp(y\phi_{FFM}(w,x))}$

Initially we define two helper function for $\phi_{FFM}$ and $\kappa$

In [25]:
def phi(x):
    total = 0
    for i in range(len(x) - 1):
        if not x[i]:
            continue
            
        for j in range(i + 1, len(x)):
            if x[j]:
                total += np.dot(W[x[i], j, :], W[x[j], i, :])
                            
    return total

def kappa(y, features):
    return -int(y)/(1 + np.exp(y*phi(features)))

### Gradient Descent
This initial model is simply FFM with gradient descent without the regularization term in the optimization function.

In [26]:
train_parquet_hashed = train_parquet_hashed.cache()

In [27]:
# Initialize model parameters
k = 10
n_features = 50000
n_fields = 39
eta = 0.4
reg_c = 0.1
sc.broadcast(k)
sc.broadcast(n_features)
sc.broadcast(n_fields)
sc.broadcast(reg_c)
sc.broadcast(eta)

# initialize 
np.random.seed(1)
W = np.random.uniform(0, 1/np.sqrt(k), size=(n_features, n_fields, k))

In [31]:
def gradient(x):
    y = int(x[0])
    features = x[1:]
    kap = kappa(y, features)
    
#     gradients = np.zeros(shape=(n_features, n_fields, k))
    
    for i in range(len(features) - 1):
        if not features[i]:
            continue
            
        for j in range(i+1, len(features)):
            if features[j]:
                yield ((features[i], j), (kap * W_1.value[features[j], i, :], 1))
                yield ((features[j], i), (kap * W[features[i], j, :], 1))
            
    

In [32]:
import sys
def log_loss(dataRDD, W):
    return dataRDD.map(lambda x: np.log(1 + np.exp(-int(x[0]) * phi(x[1:])))).mean()

def gd_update(dataRDD, W):
    W_1 = sc.broadcast(W)
    grad = dataRDD.flatMap(lambda x: gradient(x))\
                .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
                .map(lambda x: ((x[0][0], x[0][1]), x[1][0] / x[1][1]))\
                .collect()
    
    print(f"The size of grad is: {sys.getsizeof(grad)}")
    
    grad_update = np.zeros(shape=(n_features, n_fields, k))
    print(f"The size of W is: {sys.getsizeof(W)}")
    print(f"The size of grad_update is: {sys.getsizeof(grad_update)}")
    
    for indices, vector in grad:
        feature_index = indices[0]
        field_index = indices[1]
        
        grad_update[feature_index, field_index, :] += vector
    
    new_model = W - eta * grad_update
    print(f"The size of new_model is: {sys.getsizeof(new_model)}")
    
    return new_model

In [33]:
train_rdd = train_parquet_hashed.rdd
n_steps = 10
start = time.time()
for i in range(n_steps):
    print("----------")
    print(f"STEP: {i+1}")
    W = gd_update(train_rdd, W)
    loss = log_loss(train_rdd, W)
    print(f"Loss: {loss}")
print(f"\n... trained {n_steps} iterations in {time.time() - start} seconds")

----------
STEP: 1


KeyboardInterrupt: 

In [26]:
import sys
sys.getsizeof(np.zeros(shape=(200000,39,10)))

624000128

# Gradient Descent with Regularization

In [120]:
def gradient(x):
    y = int(x[0])
    features = x[1:]
    kap = kappa(y, features)
    
    for i in range(len(features) - 1):
        if not features[i]:
            continue
            
        for j in range(i+1, len(features)):
            if features[j]:
                yield ((features[i], j), (kap * W[features[j], i, :], 1))
                yield ((features[j], i), (kap * W[features[i], j, :], 1))

def gd_update(dataRDD, W):
    grad = dataRDD.flatMap(lambda x: gradient(x))\
                .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\
                .map(lambda x: ((x[0][0], x[0][1]), x[1][0] / x[1][1]))\
                .collect()
    
    grad_wReg = reg_c * W
    
    for indices, vector in grad:
        feature_index = indices[0]
        field_index = indices[1]
        
        grad_wReg[feature_index, field_index, :] = grad_wReg[feature_index, field_index, :] + vector
    
    new_model = W - eta * grad_wReg
    
    return new_model

n_steps = 10
for i in range(n_steps):
    print("----------")
    print(f"STEP: {i+1}")
    W = gd_update(train_rdd, W)
    loss = log_loss(train_rdd, W)
    print(f"Loss: {loss}")

----------
STEP: 1
Loss: 36.63388183416249
----------
STEP: 2
Loss: 0.9662416266789928
----------
STEP: 3
Loss: 2.728638743381767
----------
STEP: 4
Loss: 6.242327608563378
----------
STEP: 5
Loss: 2.0142464418798687
----------
STEP: 6
Loss: 0.23500716835922963
----------
STEP: 7
Loss: 0.17160648606285098
----------
STEP: 8
Loss: 0.14438658466387488
----------
STEP: 9
Loss: 0.1314368482610588
----------
STEP: 10
Loss: 0.12581668399350515


# Cross Validation

# Playground

In [3]:
sample_data = spark.read.csv("sample.txt", sep="\t")
sample_data.write.format("parquet").save("sample.parquet")

In [6]:
sample_df = spark.read.parquet("sample.parquet")

In [7]:
sample_df.show()

+---+---+-----+------+
|_c0|_c1|  _c2|   _c3|
+---+---+-----+------+
|  1| 10| ESPN|  Nike|
|  1| 15| ESPN|  Nike|
|  0|  2| ESPN| Gucci|
|  1| 10| ESPN|Adidas|
|  1| 10| ESPN|Adidas|
|  0|  3|Vogue|  Nike|
|  1| 20|Vogue| Gucci|
|  0|  5|Vogue|Adidas|
|  1| 50|  NBC|  Nike|
|  0|  0|  NBC| Gucci|
|  0|  4|  NBC|Adidas|
|  0|  4|  NBC|Adidas|
+---+---+-----+------+



In [17]:
def feature_hash(x, modulo=10**6):
    """
    A function that can be used to hash the features in each observation in the RDD. 
    We replace the label with 1, -1 and we hash all other features using sha256 
    and then we take modulo some power of 10. 
    """
    print(x)
#     x[0] = 2*int(x[0]) - 1
#     for i, value in enumerate(x[1:], 1):
#         h = sha256("{i}-{val}".format(i=i,val=value).encode('ascii'))
#         hashed_value = int(h.hexdigest(), base=16) 
#         hashed_value_mod = hashed_value % modulo
#         x[i] = hashed_value_mod
#     return x

# from pyspark.sql.functions import col
# sample_df.select(*(feature_hash(col(c)).alias(c) for c in sample_df.columns)).show()
sample_df.withColumn("_c0", feature_hash(sample_df["_c0"]))

Column<b'_c0'>


AssertionError: col should be Column

# Running Full Models

In [311]:
train_data = spark.read.csv("data/dac/train.txt", sep="\t")
train_data.write.format("parquet").save(f"data/dac/train.parquet")
full_rdd = sc.textFile('data/dac/train.txt')
train_rdd, test_rdd = full_rdd.randomSplit([0.8,0.2], seed = 2018)

In [47]:
train_parquet = spark.read.parquet("data/dac/train.parquet")

In [48]:
from pyspark.sql import types

oldColNames = train_parquet.schema.names
train_parquet = train_parquet.withColumn("label", train_parquet["_c0"])
for colNum in range(1,14): 
    colName = "_c" + str(colNum)
    train_parquet = train_parquet.withColumn("int_feature_"+ str(colNum), train_parquet[colName].cast(types.IntegerType()))
for colNum in range(14,40): 
    colName = "_c" + str(colNum)
    train_parquet = train_parquet.withColumn("cate_feature_"+ str(colNum-13), train_parquet[colName])

#drop the old columns
train_parquet = train_parquet.drop(*oldColNames)

In [49]:
n_features = 100
n_fields = 3

In [50]:
intFieldNames = [colName for colName, dType in train_parquet.dtypes if dType == 'int']
cateFieldNames = [colName for colName, dType in train_parquet.dtypes if dType == 'string' and colName != 'label']

In [51]:
from pyspark.ml.feature import FeatureHasher
hasher = FeatureHasher()
hasher.setCategoricalCols(intFieldNames)
hasher.setNumFeatures(n_features)

# for col in intFieldNames + cateFieldNames:
hasher.setInputCols(intFieldNames + cateFieldNames)
hasher.setOutputCol("hashed_features")
train_parquet = hasher.transform(train_parquet)

In [53]:
train_parquet.show(1)

+-----+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+--------------------+
|label|int_feature_1|int_feature_2|int_feature_3|int_feature_4|int_feature_5|int_feature_6|int_feature_7|int_feature_8|int_feature_9|int_feature_10|int_feature_11|int_feature_12|int_feature_13|cate_feature_1|cate_feature_2|cate_feature_3|cate_feature_4|cate_feature_5|cate_feature_6|cate_feature_7|cate_feature_8|cate_feature_9|cate_feature_10|cate_feature_11|cate_feature_12|ca

In [56]:
train_parquet.select("int_feature_1").collect()

Py4JJavaError: An error occurred while calling o2531.collectToPython.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:282)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:276)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.foreach(SparkPlan.scala:276)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeCollect$1.apply(SparkPlan.scala:298)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeCollect$1.apply(SparkPlan.scala:297)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3195)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3192)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [338]:
for col in intFieldNames + cateFieldNames:

['cate_feature_1',
 'cate_feature_2',
 'cate_feature_3',
 'cate_feature_4',
 'cate_feature_5',
 'cate_feature_6',
 'cate_feature_7',
 'cate_feature_8',
 'cate_feature_9',
 'cate_feature_10',
 'cate_feature_11',
 'cate_feature_12',
 'cate_feature_13',
 'cate_feature_14',
 'cate_feature_15',
 'cate_feature_16',
 'cate_feature_17',
 'cate_feature_18',
 'cate_feature_19',
 'cate_feature_20',
 'cate_feature_21',
 'cate_feature_22',
 'cate_feature_23',
 'cate_feature_24',
 'cate_feature_25',
 'cate_feature_26']

In [309]:
sample_hashed = train_rdd.map(lambda x: feature_hash(x, 100000))
sample_hashed.take(1)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2599.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2599.0 (TID 5158, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
    process()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda/lib/python3.6/site-packages/pyspark-2.3.1-py3.6.egg/pyspark/rdd.py", line 1371, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-309-ecdd36739116>", line 1, in <lambda>
  File "<ipython-input-6-601d2890c953>", line 8, in feature_hash
TypeError: 'str' object does not support item assignment

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:149)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:149)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:149)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor296.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
    process()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda/lib/python3.6/site-packages/pyspark-2.3.1-py3.6.egg/pyspark/rdd.py", line 1371, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-309-ecdd36739116>", line 1, in <lambda>
  File "<ipython-input-6-601d2890c953>", line 8, in feature_hash
TypeError: 'str' object does not support item assignment

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:149)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:149)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


# Model Assessment
We then consider how well the model is performing on the training vs test set to check if the model is tending to overfit.

In [240]:
sample_hashed.take(1)

[[1, 1, 16, 4]]

In [241]:
import itertools
list(itertools.combinations([1, 16, 41], 2))

[(1, 16), (1, 41), (16, 41)]

In [242]:
wj0f1_wj1f0 = np.dot(W[1, 1, :], W[16, 0, :])
wj0f2_wj2f0 = np.dot(W[1, 2, :], W[4, 0, :])
wj1f2_wj2f1 = np.dot(W[16, 2, :], W[4, 1, :])
total = wj0f1_wj1f0 + wj0f2_wj2f0 + wj1f2_wj2f1
print(f"Expected value is the sum of these three: {total}")

Expected value is the sum of these three: 0.7879988184229724


In [243]:
sample_hashed.map(lambda x: phi(x[1:])).collect()[0]

0.7879988184229724

In [256]:
sys.getsizeof(np.random.uniform(0,1,size=(10000000,10)))

800000112

# Extras to consider
## Develop classes for the model

In [None]:
# class FFM:
#     def __init__(self, n_features, k = 10, eta = 0.1, reg_c = 0.1):
#         self.n_features = n_features
#         self.k = k
#         self.eta = eta
#         self.reg_c = reg_c
        
        
# ffm = FFM(25, k=3)
# ffm.n_features

### Notes
- We develop the paper this way to mimic homework from throughout the semester and build the model up sequentially
- Should we write tests and show them in the presentation for simple function like $\phi_{FFM}$