In [1]:
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark 2 client, and
# request two executors for the shell that is bootstrapped below.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap inside this namespace (it announces a
# `spark` session, see the cell output). The file is opened with a context
# manager so the handle is closed deterministically, and compiled with its
# real path so tracebacks point at shell.py instead of "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Obtain a session named "test" built from a default SparkConf.
# NOTE(review): the shell bootstrap above already announced a `spark`
# session, so getOrCreate() presumably hands back that one - confirm.
conf = SparkConf()
session_builder = SparkSession.builder.config(conf=conf).appName("test")
spark = session_builder.getOrCreate()

In [3]:
from numpy import array
from numpy.linalg import norm

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import CountVectorizer, VectorAssembler
from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT
from pyspark.ml.stat import Summarizer
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType, FloatType, StringType, ArrayType

In [4]:
# Lab inputs. Each file has a header row; no schema is supplied, so the
# columns keep whatever type the CSV reader assigns by default.
items = spark.read.options(sep='\t', header=True).csv('/labs/slaba03/laba03_items.csv')
# views = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_views_programmes.csv')

train = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_train.csv')
test = spark.read.options(sep=',', header=True).csv('/labs/slaba03/laba03_test.csv')

In [7]:
@f.pandas_udf(ArrayType(StringType()), f.PandasUDFType.SCALAR)
def split(series):
    """Split every comma-separated string of `series` into a list of strings.

    Values that are not `str` (for example missing values) become an empty list.
    """
    def to_parts(value):
        if not isinstance(value, str):
            return list()
        return value.split(',')

    return series.apply(to_parts)

# Turn the comma-separated `genres` string into a list column.
items0 = items.withColumn('genre_list', split(items.genres))

# Fit a genre vocabulary and encode every item as a count vector; the
# encoded frame is cached because later cells read it several times.
count_vectorizer = CountVectorizer(inputCol='genre_list', outputCol='genre_vector')
genre_model = count_vectorizer.fit(items0)
items1 = genre_model.transform(items0).cache()

# Preview a few items that carry more than one genre.
has_several_genres = f.size(f.col('genre_list')) > 1
items1.select('genres', 'genre_list', 'genre_vector').where(has_several_genres).show(3)

+--------------------+--------------------+--------------------+
|              genres|          genre_list|        genre_vector|
+--------------------+--------------------+--------------------+
|   Комедии,Мелодрамы|[Комедии, Мелодрамы]|(84,[3,8],[1.0,1.0])|
|Ужасы,Триллеры,Др...|[Ужасы, Триллеры,...|(84,[1,2,4,11,13]...|
|Ужасы,Комедии,Фан...|[Ужасы, Комедии, ...|(84,[1,3,11,13],[...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [10]:
def _response_rate(frame, key, out_col):
    """Group `frame` by `key` and return `key` plus `out_col`.

    `out_col` is sum(purchase) divided by count(item_id) within each group.
    """
    totals = frame.groupBy(key).agg(
        f.sum('purchase').alias('purchase_cnt'),
        f.count('item_id').alias('item_cnt'),
    )
    rate = f.col('purchase_cnt') / f.col('item_cnt')
    return totals.withColumn(out_col, rate).select(key, out_col)


# Per-user purchase rate over the training rows.
user_response = _response_rate(train, 'user_id', 'user_response')

# Per-user genre profile: mean genre vector over the purchased items whose
# content_type is '1', joined with the user's purchase rate.
purchased_rows = train.where(train.purchase == '1')
typed_item_vectors = items1.select('item_id', 'genre_vector') \
                           .where(items1.content_type == '1')
user_data = (
    purchased_rows
    .join(typed_item_vectors, on='item_id', how='inner')
    .groupBy('user_id')
    .agg(Summarizer.mean(items1.genre_vector).alias('genre_pref'))
    .join(user_response, on='user_id', how='inner')
)

# Per-item purchase rate over the training rows.
item_response = _response_rate(train, 'item_id', 'item_response')

# Item side: genre vector and purchase rate for items with content_type '1'.
typed_items = items1.where(items1.content_type == '1')
item_data = (
    typed_items
    .join(item_response, on='item_id', how='inner')
    .select('item_id', 'genre_vector', 'item_response')
)

# Training rows enriched with both sides. The inner joins keep only rows
# whose user and item are present in user_data and item_data.
data = (
    train
    .join(user_data, on='user_id', how='inner')
    .join(item_data, on='item_id', how='inner')
    .cache()
)

data.show(3)

+-------+-------+--------+--------------------+--------------------+--------------+-------------+
|item_id|user_id|purchase|          genre_pref|       user_response|  genre_vector|item_response|
+-------+-------+--------+--------------------+--------------------+--------------+-------------+
| 100140| 766847|       0|[0.0,0.0,0.0,0.0,...|0.001920122887864...|(84,[3],[1.0])|          0.0|
| 100140| 779642|       0|[0.0,0.3333333333...|0.001166861143523...|(84,[3],[1.0])|          0.0|
| 100140| 924533|       0|[0.0,0.4,0.2,0.0,...|0.001933488012374...|(84,[3],[1.0])|          0.0|
+-------+-------+--------+--------------------+--------------------+--------------+-------------+
only showing top 3 rows



In [11]:
@f.udf(FloatType())
def cos_sim(a, b):
    """Cosine similarity of two vectors, as a Spark UDF returning a float.

    Returns 0.0 when either argument is None (a null column value) or when
    the product of the two L2 norms is zero, so no division by zero occurs.
    """
    # Null columns reach a Python UDF as None. Spark does not short-circuit
    # UDF calls inside conditional expressions, so the guard has to live here
    # rather than in a surrounding `when(...)`.
    if a is None or b is None:
        return 0.0
    a, b = DenseVector(a), DenseVector(b)
    n = norm(a, 2) * norm(b, 2)
    if n != 0:
        return float(a.dot(b)/n)
    else:
        return 0.0

@f.pandas_udf(IntegerType(), f.PandasUDFType.SCALAR)
def get_label(series):
    """Map each value of `series` to 1 if it equals the string '1', else 0."""
    def to_flag(value):
        if value == '1':
            return 1
        return 0

    return series.apply(to_flag)

# Add the user/item genre similarity and the integer training label.
data = (
    data
    .withColumn('user_item_sim', cos_sim(data.genre_pref, data.genre_vector))
    .withColumn('label', get_label(data.purchase))
)

# Pack the three scalar features into the single vector column the
# classifier reads, keeping only label and features.
feature_assembler = VectorAssembler(
    inputCols=['user_response', 'item_response', 'user_item_sim'],
    outputCol='features',
)
dataset = feature_assembler.transform(data).select('label', 'features')

cached_dataset = dataset.cache()
cached_dataset.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[0.00192012288786...|
|    0|[0.00116686114352...|
|    0|[0.00193348801237...|
+-----+--------------------+
only showing top 3 rows



Обучение модели

In [None]:
# Gradient-boosted trees with library defaults; the column names are spelled
# out (they equal the defaults) to show what the training frame must contain.
estimator = GBTClassifier(featuresCol='features', labelCol='label')
model = estimator.fit(cached_dataset)

Сборка тестового датасета

In [None]:
# Size of the genre vectors, read from an already vectorized item.
# `items1` is defined earlier in the notebook, so this cell no longer depends
# on `test_data`, which is only built in the next cell.
length = len(items1.select('genre_vector').first().genre_vector)
# All-zero sparse vector of that size, used as a stand-in for missing vectors.
zero_vec = SparseVector(length, [])
print(length)

In [10]:
# Left joins keep every test pair; a user or item missing from
# user_data / item_data yields nulls in the joined columns.
test_data = test.join(user_data, on='user_id', how='left') \
                .join(item_data, on='item_id', how='left')

# DataFrame.na.fill only accepts int/float/string/bool/dict values, so the
# missing genre vectors are replaced with `zero_vec` through a UDF instead.
fill_vec = f.udf(lambda v: zero_vec if v is None else v, VectorUDT())
test_data = test_data \
    .withColumn('genre_pref', fill_vec('genre_pref')) \
    .withColumn('genre_vector', fill_vec('genre_vector'))

# Missing response rates become 0.0 so that VectorAssembler does not receive
# null inputs. NOTE(review): 0.0 is a modelling choice - confirm it is the
# intended default for unseen users/items.
test_data = test_data.na.fill(0.0, subset=['user_response', 'item_response'])

# With zero vectors in place, cos_sim returns 0.0 for the filled rows.
test_data = test_data.withColumn('user_item_sim', cos_sim(test_data.genre_pref, test_data.genre_vector))

# Same three features as for training; the ids are kept for the final output.
test_dataset = VectorAssembler(
    inputCols=['user_response', 'item_response', 'user_item_sim'],
    outputCol='features'
).transform(test_data).select('user_id', 'item_id', 'features')
cached_test_dataset = test_dataset.cache()
cached_test_dataset.show(3)

+-------+-------+--------------------+
|user_id|item_id|            features|
+-------+-------+--------------------+
| 748042| 100140|[0.00153080750095...|
| 855465| 100140|[0.00155279503105...|
| 878352| 100140|[0.00115517905275...|
+-------+-------+--------------------+
only showing top 3 rows



Получение предсказаний

In [29]:
# Score the assembled test features with the fitted model and cache the
# result. cache() returns the same DataFrame, so both names refer to it.
cached_predict_data = model.transform(test_dataset).cache()
predict_data = cached_predict_data
cached_predict_data.show(n=20)

In [34]:
@f.udf(FloatType())
def get_prob(v):
    """Return element 1 of the vector `v` as a float.

    NOTE(review): presumably index 1 is the positive ("purchase") class of
    the classifier's probability vector - confirm.
    """
    positive_index = 1
    return float(v[positive_index])

# Replace the prediction columns with a single `purchase` probability.
scored = predict_data.withColumn('purchase', get_prob(predict_data.probability))

# Keep the output columns, collapse to one partition and sort by the ids.
predict_data_final = (
    scored
    .select('user_id', 'item_id', 'purchase')
    .repartition(1)
    .orderBy('user_id', 'item_id')
)

# Collect to the driver and write with pandas (to_csv defaults apply).
submission = predict_data_final.toPandas()
submission.to_csv('/data/home/margarita.cherentsova/lab03_unsorted.csv')

In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()