In [28]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


import sys 
sys.path.insert(1, "../")
from workloads.util import use_results, use_dataset, read_config, log_dataset

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
# Resolve the "ml-100k" dataset location through the project helper; the result
# is used below as the directory that contains the raw `u.data` ratings file.
dataset_dir = use_dataset("ml-100k")

/data/wooders/ralf-vldb//datasets/ml-100k


In [76]:
# Raw MovieLens ratings: tab-separated, no header row, so column names are
# supplied explicitly.
ratings_path = f"{dataset_dir}/u.data"
df = pd.read_csv(
    ratings_path,
    sep='\t',
    names=["user_id", "movie_id", "rating", "timestamp"],
)
# Order the ratings chronologically; later cells rely on this ordering when
# they replace the timestamps with position-based buckets.
df = df.sort_values('timestamp')
df

Unnamed: 0,user_id,movie_id,rating,timestamp
214,259,255,4,874724710
83965,259,286,4,874724727
43027,259,298,4,874724754
21396,259,185,4,874724781
82655,259,173,4,874724843
...,...,...,...,...
46773,729,689,4,893286638
73008,729,313,3,893286638
46574,729,328,3,893286638
64312,729,748,4,893286638


In [77]:
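# Disabled experiment (kept for reference): shift each user's timestamps so
# that they start at 0, i.e. subtract the user's earliest timestamp.
#users = list(set(df.user_id.tolist()))
#user_start_ts = {user: df[df["user_id"] == user].timestamp.min() for user in users}
#df.timestamp = df.apply(lambda x: x["timestamp"] - user_start_ts[x["user_id"]], axis=1)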

In [78]:
# Total span covered by the raw timestamps (latest minus earliest).
df["timestamp"].max() - df["timestamp"].min()

18561928

In [79]:
# Span of raw timestamps for a single example user (id 148); the user's rows
# are selected once and the range is computed on that selection.
df.loc[df["user_id"] == 148, "timestamp"].pipe(lambda ts: ts.max() - ts.min())

385088

In [9]:
# Integer divisor used when timestamps are replaced by `position // squash_amount`:
# roughly this many consecutive ratings end up sharing one timestamp value.
squash_amount = 100

In [39]:
# One bucket id per row of `df`, derived from the 1-based row position.
new_timestamp = [(position + 1) // squash_amount for position in range(len(df))]

In [17]:
# Replace the raw timestamps with the position-based buckets computed above.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`df.drop('timestamp', 1)`) was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 on.
new_df = df.drop(columns='timestamp')
new_df['timestamp'] = new_timestamp

In [59]:
# Inspect the squashed timestamps for the example user (id 148).
new_df.loc[new_df["user_id"] == 148]

Unnamed: 0,user_id,movie_id,rating,timestamp
99336,148,194,5,125
33589,148,496,3,125
72245,148,174,5,125
12982,148,1039,2,125
46538,148,168,5,125
...,...,...,...,...
7784,148,473,5,142
99511,148,127,1,142
32257,148,151,4,142
7203,148,1012,4,142


## Train PySpark ALS Model 

In [19]:
def train_stream_test_split(df):
    """Split ratings per user into an ALS training half and a streaming half.

    For every user (taken in order of first appearance in ``df``) the ratings
    are ordered by ``timestamp``; the first ``ceil(n / 2)`` of them go to the
    training set and the remainder to the stream. Stream rows whose movie
    never occurs in the training set are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``movie_id``, ``rating`` and
        ``timestamp``. Any other columns are ignored.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(als_df, stream_df, test_df)``, each with the four columns above and
        sorted by ``timestamp``. Index labels are the row positions in the
        per-user concatenation (before the final sort and before unseen
        movies are filtered out of the stream). ``test_df`` is an independent
        copy of ``stream_df``.
    """
    columns = ["user_id", "movie_id", "rating", "timestamp"]

    als_parts = []
    stream_parts = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # user; sort=False keeps the users in first-appearance order.
    for _, user_df in df[columns].groupby("user_id", sort=False):
        # Stable sort: rows that share a timestamp keep their incoming order.
        # The default (unstable) sort could reorder such ties arbitrarily and
        # thereby change which half a rating lands in.
        user_df = user_df.sort_values("timestamp", kind="stable")
        n_train = (len(user_df) + 1) // 2  # ceil(n / 2) rows go to training
        als_parts.append(user_df.iloc[:n_train])
        stream_parts.append(user_df.iloc[n_train:])

    def _stack(parts):
        # Concatenate the per-user slices and order them by timestamp;
        # pd.concat rejects an empty list, so that case is handled explicitly.
        if not parts:
            return pd.DataFrame(columns=columns)
        stacked = pd.concat(parts, ignore_index=True)
        return stacked.sort_values("timestamp", kind="stable")

    als_df = _stack(als_parts)
    stream_df = _stack(stream_parts)

    # Drop movies not seen before from stream
    movie_list = set(als_df["movie_id"])
    stream_df = stream_df[stream_df["movie_id"].isin(movie_list)]

    test_df = stream_df.copy()
    return als_df, stream_df, test_df

In [20]:
# Split the timestamp-squashed ratings: `als_df` is used to fit the ALS model,
# `stream_df` is later evaluated against and exported as ratings.csv.
als_df, stream_df, test_df = train_stream_test_split(new_df)

In [21]:
# Local Spark session named 'als' (reused if one already exists).
spark = (
    SparkSession.builder
    .master('local')
    .appName('als')
    .getOrCreate()
)

In [22]:
# Convert both pandas splits into Spark DataFrames for fitting / scoring.
spark_als_df, spark_stream_df = (
    spark.createDataFrame(als_df),
    spark.createDataFrame(stream_df),
)

In [24]:
# Explicit-feedback ALS estimator (implicitPrefs=False) over the
# user_id / movie_id / rating columns of the training split.
als = ALS(
    # column mapping
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="rating",
    # model behaviour
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
    # hyper-parameters
    rank=150,
    maxIter=10,
    regParam=0.1,
)

In [25]:
# Fit the ALS estimator on the training split only; the resulting model's
# user/item factors are exported as feature CSVs further down.
model=als.fit(spark_als_df)

In [26]:
# Score the streaming split with the fitted model and report the RMSE between
# the "rating" column and the model's "prediction" column.
predictions = model.transform(spark_stream_df)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
rmse

0.9815395865331437

## Save Features CSV

In [34]:
# Output dataset: a sibling directory under the configured dataset root.
# Note that this rebinds `dataset_dir`, which previously pointed at the raw
# "ml-100k" input directory.
name = "ml-100k-features"
dataset_dir = f'{read_config()["dataset_dir"]}/{name}'
dataset_dir

'/data/wooders/ralf-vldb//datasets/ml-100k-features'

In [36]:
# Export the learned latent factors, ordered by id, without the pandas index.
(
    model.itemFactors
    .toPandas()
    .sort_values('id')
    .to_csv(f'{dataset_dir}/movie_features.csv', index=False)
)
(
    model.userFactors
    .toPandas()
    .sort_values('id')
    .to_csv(f'{dataset_dir}/user_features.csv', index=False)
)

In [37]:
# Re-bucket the stream's timestamps by row position (the stream is only about
# half of the ratings, so the earlier bucket ids are no longer contiguous),
# then write the result out as ratings.csv.
stream_df = stream_df.sort_values(by='timestamp')
new_timestamp = [(position + 1) // squash_amount for position in range(len(stream_df))]
stream_df['timestamp'] = new_timestamp
print(len(stream_df))
stream_df.to_csv(f'{dataset_dir}/ratings.csv', index=False)

49442


In [38]:
# Hand the finished dataset to the project helper; judging by the logged
# output below it uploads each CSV in the dataset directory.
log_dataset(name)

vldb/datasets ml-100k-features /ml-100k-features/ratings.csv
uploading /data/wooders/ralf-vldb//datasets/ml-100k-features vldb/datasets vldb/datasets/ml-100k-features/ratings.csv
vldb/datasets ml-100k-features /ml-100k-features/user_features.csv
uploading /data/wooders/ralf-vldb//datasets/ml-100k-features vldb/datasets vldb/datasets/ml-100k-features/user_features.csv
vldb/datasets ml-100k-features /ml-100k-features/movie_features.csv
uploading /data/wooders/ralf-vldb//datasets/ml-100k-features vldb/datasets vldb/datasets/ml-100k-features/movie_features.csv


'vldb/datasets/ml-100k-features'

In [41]:
# Spot-check the exported stream rows for the example user (id 148).
stream_df.loc[stream_df["user_id"] == 148]

Unnamed: 0,user_id,movie_id,rating,timestamp
8936,148,191,1,48
8938,148,169,5,48
8940,148,8,4,48
8941,148,501,4,48
8937,148,234,3,48
8939,148,596,5,48
8942,148,132,4,49
8963,148,181,5,55
8946,148,408,5,55
8962,148,1012,4,55
