<a href="https://colab.research.google.com/github/dinarsadykow/cv_hw/blob/main/kaggle_OTTO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OTTO – Multi-Objective Recommender System
https://www.kaggle.com/competitions/otto-recommender-system/data

In [15]:
# # ------------------------------------------------------
# # Kaggle Download Data

# import json

# !mkdir ~/.kaggle
# !touch ~/.kaggle/kaggle.json

# # api_token = {"username":"123","key":"456"}
# api_token = json.load( open('/content/drive/MyDrive/kaggle.json','r') )

# with open('/root/.kaggle/kaggle.json', 'w') as file:
#     json.dump(api_token, file)

# !chmod 600 ~/.kaggle/kaggle.json

# !kaggle competitions download -c otto-recommender-system
# !unzip /content/otto-recommender-system.zip

# # ------------------------------------------------------
# # pySpark Tutorial from https://towardsdatascience.com/pyspark-on-google-colab-101-d31830b238be

# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
# !tar xf spark-3.1.2-bin-hadoop2.7.tgz
# !pip install -q findspark

# !pip install pickle5
# !pip install pyspark_dist_explore

In [1]:
import os

# Point PySpark at the JDK and at the Spark distribution unpacked under /content,
# and hand the executor/driver memory flags to the pyspark shell launcher.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop2.7",
    "PYSPARK_SUBMIT_ARGS": '--num-executors 4 --executor-memory 4g --driver-memory 3g pyspark-shell',
})

import findspark
findspark.init()  # presumably resolves Spark through SPARK_HOME set above — keep this after the env block

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

# Single shared session for the whole notebook.
# NOTE(review): per the Spark master-URL docs, "local" runs with one worker thread,
# so the --num-executors flag above likely has no effect; "local[*]" may be what
# was intended — confirm before changing, since it also raises memory pressure.
spark = (
    SparkSession.builder
    .master("local")
    .appName("sdr")
    .config('spark.ui.port', '4050')
    .config("spark.driver.maxResultSize", "4g")
    # .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import VectorUDT

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

import gc
import re
from datetime import datetime

import pickle5 as pickle
import json

from pyspark_dist_explore import hist
import matplotlib.pyplot as plt

# Files

### - train.jsonl - the training data, which contains full session data

- session - the unique session id
- events - the time-ordered sequence of events in the session
  - aid - the article id (product code) of the associated event
  - ts - the Unix timestamp of the event
  - type - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session


### - test.jsonl - the test data, which contains truncated session data
Your task is to predict the next aid clicked after the session truncation, as well as the remaining aids that are added to carts and orders; you may predict up to 20 values for each session type.


### - sample_submission.csv - a sample submission file in the correct format

# Load Train Data

In [None]:
# Read the raw train sessions and flatten the `events` struct array into three
# parallel array columns (aid, ts, type), one row per session.
sDF_train = (
    spark.read.json( 'train.jsonl' )
    .select( 'session', 'events.aid', 'events.ts', 'events.type' )
)

sDF_train.printSchema()
sDF_train.show()

root
 |-- session: long (nullable = true)
 |-- aid: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ts: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- type: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+--------------------+--------------------+--------------------+
|session|                 aid|                  ts|                type|
+-------+--------------------+--------------------+--------------------+
|      0|[1517085, 1563459...|[1659304800025, 1...|[clicks, clicks, ...|
|      1|[424964, 1492293,...|[1659304800025, 1...|[carts, clicks, c...|
|      2|[763743, 137492, ...|[1659304800038, 1...|[clicks, clicks, ...|
|      3|[1425967, 1425967...|[1659304800095, 1...|[carts, clicks, c...|
|      4|[613619, 298827, ...|[1659304800119, 1...|[clicks, clicks, ...|
|      5|[1098089, 1354785...|[1659304800133, 1...|[clicks, clicks, ...|
|      6|[137164, 956148, ...|[1659304800134, 1...|[clicks

In [None]:
def SaveReadParquet( sparkDF_input
                    , f_name = 'sDF_example_name'
                    #, my_dir = '/content/drive/MyDrive/Colab Notebooks/Kaggle/' 
                    , my_dir = '/content/'
                    ):
    """Write a Spark DataFrame to parquet and return it re-read from disk.

    The frame is written to ``<my_dir>/<f_name>.parquet`` in ``overwrite`` mode,
    the resulting path is printed, and a new DataFrame backed by that parquet
    directory is returned (read through the notebook-level ``spark`` session).

    Args:
        sparkDF_input: DataFrame to persist.
        f_name: File name without the ``.parquet`` extension.
        my_dir: Target directory, with or without a trailing slash.

    Returns:
        A DataFrame read back from the written parquet path.
    """
    import os  # local import keeps the helper usable regardless of cell order

    # os.path.join yields the same path as the former plain concatenation when
    # my_dir ends with '/', and also works when the trailing slash is missing.
    write_name = os.path.join( my_dir, f'{f_name}.parquet' )
    sparkDF_input.write.mode('overwrite').parquet( write_name )
    print( write_name )
    return spark.read.parquet( write_name )

In [None]:
%%time

# One row per (session, position) carrying the article id at that position.
sDF_train2 = sDF_train.select(
    'session',
    F.posexplode('aid').alias('pos', 'aid'),
)

sDF_train2.show()

+-------+---+-------+
|session|pos|    aid|
+-------+---+-------+
|      0|  0|1517085|
|      0|  1|1563459|
|      0|  2|1309446|
|      0|  3|  16246|
|      0|  4|1781822|
|      0|  5|1152674|
|      0|  6|1649869|
|      0|  7| 461689|
|      0|  8| 305831|
|      0|  9| 461689|
|      0| 10| 362233|
|      0| 11|1649869|
|      0| 12|1649869|
|      0| 13| 984597|
|      0| 14|1649869|
|      0| 15| 803544|
|      0| 16|1110941|
|      0| 17|1190046|
|      0| 18|1760685|
|      0| 19| 631008|
+-------+---+-------+
only showing top 20 rows

CPU times: user 4.69 ms, sys: 2.18 ms, total: 6.87 ms
Wall time: 737 ms


In [None]:
%%time

# Materialise the exploded aid frame and continue from the parquet copy.
sDF_train2 = SaveReadParquet( sDF_train2, f_name='sDF_train2' )

/content/sDF_train2.parquet
CPU times: user 1.42 s, sys: 150 ms, total: 1.57 s
Wall time: 8min 3s


In [None]:
type_labels_dic = {'clicks':0, 'carts':1, 'orders':2}

def type_labels(x):
    """Map an event-type column ('clicks'/'carts'/'orders') to its label.

    Builds a native Spark ``when`` chain from ``type_labels_dic`` instead of a
    Python UDF, so rows are not shipped to a Python worker for a three-value
    lookup. The result is a string column ('0', '1', '2'), matching the
    StringType the previous UDF declared, so the downstream schema is unchanged.

    Args:
        x: A Column, or a column name, holding the event type.

    Returns:
        A string Column with the label; null when the type is not a key of
        ``type_labels_dic`` (the former UDF failed the job with KeyError).
    """
    type_col = F.col(x) if isinstance(x, str) else x
    labelled = None
    for type_name, label in type_labels_dic.items():
        matches = ( type_col == type_name )
        if labelled is None:
            labelled = F.when( matches, F.lit( str(label) ) )
        else:
            labelled = labelled.when( matches, F.lit( str(label) ) )
    return labelled

# One row per (session, position) carrying the encoded event type.
sDF_train4 = sDF_train\
    .select( 'session' , F.posexplode('type') ) \
    .withColumnRenamed( 'col', 'type' ) \
    .withColumn( 'type', type_labels(F.col('type') ) )

sDF_train4.show()

+-------+---+----+
|session|pos|type|
+-------+---+----+
|      0|  0|   0|
|      0|  1|   0|
|      0|  2|   0|
|      0|  3|   0|
|      0|  4|   0|
|      0|  5|   0|
|      0|  6|   1|
|      0|  7|   1|
|      0|  8|   2|
|      0|  9|   2|
|      0| 10|   0|
|      0| 11|   0|
|      0| 12|   0|
|      0| 13|   0|
|      0| 14|   0|
|      0| 15|   0|
|      0| 16|   0|
|      0| 17|   0|
|      0| 18|   0|
|      0| 19|   0|
+-------+---+----+
only showing top 20 rows



In [None]:
%%time

# Materialise the encoded event-type frame and continue from the parquet copy.
sDF_train4 = SaveReadParquet( sDF_train4, f_name='sDF_train4' )

/content/sDF_train4.parquet
CPU times: user 2.74 s, sys: 278 ms, total: 3.02 s
Wall time: 16min 43s


In [None]:
%%time

# One row per (session, position) with the timestamp divided by 1000 and rounded
# (the result stays a double column).
sDF_train3 = (
    sDF_train
    .select( 'session', F.posexplode('ts').alias('pos', 'ts') )
    .withColumn( 'ts', F.round( F.col('ts') / F.lit(1000) ) )
)

sDF_train3.printSchema()

root
 |-- session: long (nullable = true)
 |-- pos: integer (nullable = false)
 |-- ts: double (nullable = true)

CPU times: user 4.7 ms, sys: 2.36 ms, total: 7.05 ms
Wall time: 126 ms


In [None]:
%%time

# Materialise the timestamp frame and continue from the parquet copy.
sDF_train3 = SaveReadParquet( sDF_train3, f_name='sDF_train3' )

/content/sDF_train3.parquet
CPU times: user 1.63 s, sys: 148 ms, total: 1.78 s
Wall time: 9min 34s


In [None]:
%%time

# Re-assemble aid, ts and type into one long frame keyed by (session, pos).
sDF_train5 = (
    sDF_train2
    .join( sDF_train3, on=['session', 'pos'], how='inner' )
    .join( sDF_train4, on=['session', 'pos'], how='inner' )
)

sDF_train5.printSchema()

root
 |-- session: long (nullable = true)
 |-- pos: integer (nullable = true)
 |-- aid: long (nullable = true)
 |-- ts: double (nullable = true)
 |-- type: string (nullable = true)

CPU times: user 9.4 ms, sys: 0 ns, total: 9.4 ms
Wall time: 144 ms


In [None]:
%%time
    
# Materialise the joined train frame and continue from the parquet copy.
sDF_train5 = SaveReadParquet( sDF_train5, f_name='sDF_train5' )

In [None]:
# Preview the joined train frame (columns: session, pos, aid, ts, type).
sDF_train5.show()

+-------+---+-------+-------------+----+
|session|pos|    aid|           ts|type|
+-------+---+-------+-------------+----+
|      0|  6|1649869|1.659369894E9|   1|
|      0|112|1443747|1.660628405E9|   0|
|      0|203| 974651|1.661550342E9|   1|
|      0|268| 543308|1.661682228E9|   0|
|      3| 16|1089061| 1.65939068E9|   0|
|      6| 47|1479126|    1.65935E9|   0|
|      6| 73|1747070|1.659352644E9|   0|
|      6|100|  60590|1.659391566E9|   0|
|      7|  4|1727444|1.659304888E9|   0|
|     14| 15| 354050|1.659306059E9|   0|
|     14| 84| 588838|1.659531835E9|   0|
|     14|180| 311992|1.659568261E9|   0|
|     14|349| 494715|1.661025531E9|   0|
|     19| 91|1312778|1.659646045E9|   0|
|     21| 21| 482707|1.659308077E9|   1|
|     21|204| 484647|1.659709127E9|   0|
|     21|305|1781373|1.660486968E9|   0|
|     21|312|1842105|1.660566916E9|   0|
|     21|363| 927708| 1.66076356E9|   0|
|     23|  8| 826995|1.659842156E9|   1|
+-------+---+-------+-------------+----+
only showing top

In [25]:
# Total number of (session, pos) event rows in the joined train frame.
sDF_train5.count()

216716096

# Load Test Data

In [27]:
# Read the raw test sessions and flatten the `events` struct array into three
# parallel array columns (aid, ts, type), one row per session.
# The read must stay enabled: with it commented out, `sDF_test` is undefined on
# a fresh kernel and this cell fails with NameError.
sDF_test = spark.read.json( 'test.jsonl' )
sDF_test = ( sDF_test.select(
                        'session' 
                        , 'events.aid'
                        , 'events.ts'
                        , 'events.type'
                      )
              )

sDF_test.printSchema()
sDF_test.show()

root
 |-- session: long (nullable = true)
 |-- aid: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ts: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- type: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------+--------------------+--------------------+--------------------+
| session|                 aid|                  ts|                type|
+--------+--------------------+--------------------+--------------------+
|12899779|             [59625]|     [1661724000278]|            [clicks]|
|12899780|[1142000, 582732,...|[1661724000378, 1...|[clicks, clicks, ...|
|12899781|[141736, 199008, ...|[1661724000559, 1...|[clicks, clicks, ...|
|12899782|[1669402, 1494780...|[1661724000568, 1...|[clicks, clicks, ...|
|12899783|[255297, 1114789,...|[1661724000572, 1...|[clicks, clicks, ...|
|12899784|[1036375, 1269952...|[1661724000604, 1...|[clicks, clicks, ...|
|12899785|[1784451, 1169631...|[1661724000809, 1.

In [28]:
%%time

# One row per (session, position) carrying the article id at that position.
sDF_test2 = sDF_test.select(
    'session',
    F.posexplode('aid').alias('pos', 'aid'),
)

sDF_test2.show()

+--------+---+-------+
| session|pos|    aid|
+--------+---+-------+
|12899779|  0|  59625|
|12899780|  0|1142000|
|12899780|  1| 582732|
|12899780|  2| 973453|
|12899780|  3| 736515|
|12899780|  4|1142000|
|12899781|  0| 141736|
|12899781|  1| 199008|
|12899781|  2|  57315|
|12899781|  3| 194067|
|12899781|  4| 199008|
|12899781|  5| 199008|
|12899781|  6| 199008|
|12899781|  7| 199008|
|12899781|  8| 199008|
|12899781|  9| 199008|
|12899781| 10| 918667|
|12899782|  0|1669402|
|12899782|  1|1494780|
|12899782|  2|1494780|
+--------+---+-------+
only showing top 20 rows

CPU times: user 3.35 ms, sys: 2.16 ms, total: 5.51 ms
Wall time: 295 ms


In [29]:
%%time
    
# Materialise the exploded aid frame and continue from the parquet copy.
sDF_test2 = SaveReadParquet( sDF_test2, f_name='sDF_test2' )

/content/sDF_test2.parquet
CPU times: user 115 ms, sys: 15.5 ms, total: 130 ms
Wall time: 18.6 s


In [30]:
%%time

# One row per (session, position) with the timestamp divided by 1000 and rounded
# (the result stays a double column).
sDF_test3 = (
    sDF_test
    .select( 'session', F.posexplode('ts').alias('pos', 'ts') )
    .withColumn( 'ts', F.round( F.col('ts') / F.lit(1000) ) )
)

sDF_test3.printSchema()

root
 |-- session: long (nullable = true)
 |-- pos: integer (nullable = false)
 |-- ts: double (nullable = true)

CPU times: user 8.24 ms, sys: 3.52 ms, total: 11.8 ms
Wall time: 74 ms


In [31]:
%%time
    
# Materialise the timestamp frame and continue from the parquet copy.
sDF_test3 = SaveReadParquet( sDF_test3, f_name='sDF_test3' )

/content/sDF_test3.parquet
CPU times: user 152 ms, sys: 22.4 ms, total: 174 ms
Wall time: 25.3 s


In [32]:
# `type_labels` (and `type_labels_dic`) are already defined in the
# "Load Train Data" section; reuse that single definition here rather than
# redefining an identical copy that silently shadows it.
sDF_test4 = (
    sDF_test
    .select( 'session', F.posexplode('type') )
    .withColumnRenamed( 'col', 'type' )
    .withColumn( 'type', type_labels( F.col('type') ) )
)

sDF_test4.show()

+--------+---+----+
| session|pos|type|
+--------+---+----+
|12899779|  0|   0|
|12899780|  0|   0|
|12899780|  1|   0|
|12899780|  2|   0|
|12899780|  3|   0|
|12899780|  4|   0|
|12899781|  0|   0|
|12899781|  1|   0|
|12899781|  2|   0|
|12899781|  3|   0|
|12899781|  4|   0|
|12899781|  5|   0|
|12899781|  6|   1|
|12899781|  7|   0|
|12899781|  8|   0|
|12899781|  9|   0|
|12899781| 10|   0|
|12899782|  0|   0|
|12899782|  1|   0|
|12899782|  2|   0|
+--------+---+----+
only showing top 20 rows



In [33]:
%%time
    
# Materialise the encoded event-type frame and continue from the parquet copy.
sDF_test4 = SaveReadParquet( sDF_test4, f_name='sDF_test4' )

/content/sDF_test4.parquet
CPU times: user 220 ms, sys: 24.3 ms, total: 245 ms
Wall time: 36.2 s


In [34]:
%%time

# Re-assemble aid, ts and type into one long frame keyed by (session, pos).
sDF_test5 = (
    sDF_test2
    .join( sDF_test3, on=['session', 'pos'], how='inner' )
    .join( sDF_test4, on=['session', 'pos'], how='inner' )
)

sDF_test5.printSchema()

root
 |-- session: long (nullable = true)
 |-- pos: integer (nullable = true)
 |-- aid: long (nullable = true)
 |-- ts: double (nullable = true)
 |-- type: string (nullable = true)

CPU times: user 5.91 ms, sys: 1.09 ms, total: 7 ms
Wall time: 54.9 ms


In [35]:
%%time
    
# Materialise the joined test frame and continue from the parquet copy.
sDF_test5 = SaveReadParquet( sDF_test5, f_name='sDF_test5' )

/content/sDF_test5.parquet
CPU times: user 302 ms, sys: 33.4 ms, total: 335 ms
Wall time: 49.4 s


In [36]:
%%time
    
# Second copy of the joined test frame, this time under the Drive folder;
# `sDF_test5` is rebound to the frame read back from that location.
sDF_test5 = SaveReadParquet(
    sDF_test5,
    f_name='sDF_test5',
    my_dir='/content/drive/MyDrive/Colab Notebooks/Kaggle/',
)

/content/drive/MyDrive/Colab Notebooks/Kaggle/sDF_test5.parquet
CPU times: user 74.6 ms, sys: 18.8 ms, total: 93.4 ms
Wall time: 13.3 s


In [37]:
# Preview the joined test frame (columns: session, pos, aid, ts, type).
sDF_test5.show()

+--------+---+-------+-------------+----+
| session|pos|    aid|           ts|type|
+--------+---+-------+-------------+----+
|12899789|  1| 631398|1.661724204E9|   0|
|12899791|  3|1365651| 1.66172411E9|   0|
|12899793|  2|1792644|1.661724057E9|   0|
|12899832|  8|1509329|1.661724668E9|   0|
|12899842|  0|  46318|1.661724013E9|   0|
|12899850| 98|1004556|1.661791542E9|   0|
|12899852| 19| 701401|1.661730833E9|   0|
|12899852|112|  90712|1.661811416E9|   0|
|12899860|  4|1293588|1.661776478E9|   2|
|12899906| 13| 938051|1.661724525E9|   1|
|12899907| 20| 443295|1.661725332E9|   1|
|12899919| 48|1637761|1.662297442E9|   0|
|12899919| 52|1488793|1.662297631E9|   1|
|12899946| 22|1490773|1.661724294E9|   0|
|12899950| 31| 560261|1.661725337E9|   1|
|12900014|  4|1722760|1.661724179E9|   0|
|12900016|  5|1102414|1.661724139E9|   0|
|12900016| 31| 244618|1.661768665E9|   0|
|12900021| 10|1587914|1.661724221E9|   0|
|12900026|  0|1705606|1.661724067E9|   0|
+--------+---+-------+------------

# Load Data from Parquet

In [4]:
%%time

# Restore the joined frames from parquet. The paths are relative to the current
# working directory — presumably /content, where SaveReadParquet writes by
# default; confirm when running outside Colab.
sDF_train5, sDF_test5 = (
    spark.read.parquet( 'sDF_train5.parquet' ),
    spark.read.parquet( 'sDF_test5.parquet' ),
)

sDF_train5.show()
sDF_test5.show()

+-------+---+-------+-------------+----+
|session|pos|    aid|           ts|type|
+-------+---+-------+-------------+----+
|      0|  6|1649869|1.659369894E9|   1|
|      0|112|1443747|1.660628405E9|   0|
|      0|203| 974651|1.661550342E9|   1|
|      0|268| 543308|1.661682228E9|   0|
|      3| 16|1089061| 1.65939068E9|   0|
|      6| 47|1479126|    1.65935E9|   0|
|      6| 73|1747070|1.659352644E9|   0|
|      6|100|  60590|1.659391566E9|   0|
|      7|  4|1727444|1.659304888E9|   0|
|     14| 15| 354050|1.659306059E9|   0|
|     14| 84| 588838|1.659531835E9|   0|
|     14|180| 311992|1.659568261E9|   0|
|     14|349| 494715|1.661025531E9|   0|
|     19| 91|1312778|1.659646045E9|   0|
|     21| 21| 482707|1.659308077E9|   1|
|     21|204| 484647|1.659709127E9|   0|
|     21|305|1781373|1.660486968E9|   0|
|     21|312|1842105|1.660566916E9|   0|
|     21|363| 927708| 1.66076356E9|   0|
|     23|  8| 826995|1.659842156E9|   1|
+-------+---+-------+-------------+----+
only showing top

# View Data

In [61]:
# Train Data
# Share of sessions that have an event at each position `pos`.
# Uses sDF_train5 (the frame restored by the "Load Data from Parquet" cell and
# used by the rest of this section) instead of sDF_train4, which only exists if
# the full preprocessing was run in this kernel. sDF_train5 is the inner join on
# (session, pos) of the exploded frames, so it should hold the same pairs.
val_count = sDF_train5.agg(  F.countDistinct('session').alias('session_cnt') ).collect()[0][0]

(
    sDF_train5
    .groupBy('pos').agg( F.countDistinct('session').alias('session_cnt') )
    .orderBy( F.col("session_cnt").desc() )
    # percentage of all sessions, rounded to a whole number
    .withColumn( 'session_share', F.col('session_cnt') / F.lit(val_count) * F.lit(100) )
    .withColumn( 'session_share', F.round( F.col('session_share') ) )
    .show()
)

+---+-----------+-------------+
|pos|session_cnt|session_share|
+---+-----------+-------------+
|  1|   12899779|        100.0|
|  0|   12899779|        100.0|
|  2|   10405594|         81.0|
|  3|    8791396|         68.0|
|  4|    7624389|         59.0|
|  5|    6751602|         52.0|
|  6|    6061859|         47.0|
|  7|    5506372|         43.0|
|  8|    5046120|         39.0|
|  9|    4655041|         36.0|
| 10|    4321310|         33.0|
| 11|    4030948|         31.0|
| 12|    3775810|         29.0|
| 13|    3549973|         28.0|
| 14|    3347229|         26.0|
| 15|    3164777|         25.0|
| 16|    3000164|         23.0|
| 17|    2850652|         22.0|
| 18|    2714134|         21.0|
| 19|    2588977|         20.0|
+---+-----------+-------------+
only showing top 20 rows



In [59]:
# Test Data
# Share of sessions that have an event at each position `pos`.
# Uses sDF_test5 (the frame restored by the "Load Data from Parquet" cell and
# used by the rest of this section) instead of sDF_test4, which only exists if
# the full preprocessing was run in this kernel. sDF_test5 is the inner join on
# (session, pos) of the exploded frames, so it should hold the same pairs.
val_count = sDF_test5.agg(  F.countDistinct('session').alias('session_cnt') ).collect()[0][0]

(
    sDF_test5
    .groupBy('pos').agg( F.countDistinct('session').alias('session_cnt') )
    .orderBy( F.col("session_cnt").desc() )
    # percentage of all sessions, rounded to a whole number
    .withColumn( 'session_share', F.col('session_cnt') / F.lit(val_count) * F.lit(100) )
    .withColumn( 'session_share', F.round( F.col('session_share') ) )
    .show()
)

+---+-----------+-------------+
|pos|session_cnt|session_share|
+---+-----------+-------------+
|  0|    1671803|        100.0|
|  1|     921704|         55.0|
|  2|     627261|         38.0|
|  3|     466825|         28.0|
|  4|     365488|         22.0|
|  5|     296124|         18.0|
|  6|     246182|         15.0|
|  7|     208236|         12.0|
|  8|     178631|         11.0|
|  9|     155220|          9.0|
| 10|     135808|          8.0|
| 11|     120195|          7.0|
| 12|     106772|          6.0|
| 13|      95584|          6.0|
| 14|      85956|          5.0|
| 15|      77691|          5.0|
| 16|      70553|          4.0|
| 17|      64330|          4.0|
| 18|      58737|          4.0|
| 19|      53963|          3.0|
+---+-----------+-------------+
only showing top 20 rows



In [7]:
# Event-row counts of both frames and the test/train size ratio.
shape_train, shape_test = sDF_train5.count(), sDF_test5.count()

( shape_train, shape_test, shape_test / shape_train )

(216716096, 6928123, 0.03196865912534711)

In [8]:
# Distinct-session counts of both frames and the test/train ratio.
# The aggregation yields exactly one row, so first() picks the same value.
session_train = sDF_train5.agg( F.countDistinct('session').alias('session_cnt') ).first()[0]
session_test = sDF_test5.agg( F.countDistinct('session').alias('session_cnt') ).first()[0]

( session_train, session_test, session_test / session_train )

(12899779, 1671803, 0.12959935205091497)

In [None]:
%%time

# Histogram of log(number of events per session) for the train frame.
fig, ax = plt.subplots()
hist(
    ax,
    sDF_train5
        .groupBy('session')
        .agg( F.count('aid').alias('aid_cnt') )
        .select( F.log( F.col('aid_cnt') ).alias('aid_cnt_log') ),
    bins=20,
    color=['red'],
)

In [None]:
%%time

# Histogram of log(number of events per session) for the test frame.
fig, ax = plt.subplots()
hist(
    ax,
    sDF_test5
        .groupBy('session')
        .agg( F.count('aid').alias('aid_cnt') )
        .select( F.log( F.col('aid_cnt') ).alias('aid_cnt_log') ),
    bins=20,
    color=['red'],
)

In [None]:
# Number of events per train session (first 20 rows).
(
    sDF_train5
    .groupBy('session')
    .agg( F.count('aid').alias('aid_cnt') )
    .show()
)