In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import pandas as pd

In [19]:
# Import the dataset
df = pd.read_csv('data/collab_filter_100k.csv')
collab = df.drop(columns=['streamId'])
collab = collab.rename(columns={'streamerId':'item_id','interactionTime':'rating', 'userId': 'user_id'})
collab.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  5 non-null      int64  
 1   rating   5 non-null      float64
 2   item_id  5 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 248.0 bytes


In [13]:
# Create into spark dataframe and split
conf = SparkConf().setAppName('app').setMaster('local[4]')
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

ratings = sql_context.createDataFrame(collab)
#(training, test) = ratings.randomSplit([0.8,0.2], 38)



In [22]:
# Need to transform columns so that they can work with ALS limits to int64
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='item_id', outputCol='item_id_str')

In [23]:
# Transform dataframe with indexer
from pyspark.sql.functions import col

transformed = (indexer.fit(ratings)
               .transform(ratings)
               .withColumn("item_id_str", col("item_id_str").cast("string"))
              .select("user_id", "item_id_str", "rating"))

In [25]:
# Covert to RDD
from pyspark.mllib.recommendation import Rating
ratings_rdd = transformed.rdd.map(lambda r: Rating(r.user_id, r.item_id_str, r.rating))

In [31]:
# Split the rdd
(training, test) = transformed.randomSplit([0.8,0.2], 38)

In [32]:
training

DataFrame[user_id: bigint, item_id_str: string, rating: double]

In [33]:
# Build model using ALS
als = ALS(maxIter=5, regParam=0.01, userCol='user_id', itemCol="item_id_str", ratingCol="rating", coldStartStrategy='drop')

In [34]:
# Train the model
model = als.fit(training)

IllegalArgumentException: requirement failed: Column item_id_str must be of type numeric but was actually of type string.