In [2]:
import os
import sys

# Configure the environment PySpark reads at start-up: the Python interpreter
# for the workers, the Spark installation directory, and the submit arguments
# (a single executor).
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 1 pyspark-shell'

# SPARK_HOME was assigned just above, so a plain lookup is sufficient.
spark_home = os.environ["SPARK_HOME"]

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap script in this namespace.
# The script is read inside a `with` block so the file handle is closed, and
# compiled with its real path so tracebacks point at shell.py.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Default Spark configuration; nothing is overridden here.
conf = SparkConf()

# getOrCreate() hands back an already-running session when one exists,
# otherwise it starts a new one named "test".
spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [4]:
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import Tokenizer, HashingTF, RegexTokenizer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.functions import udf, col
from pyspark.sql.types import FloatType
from json import dumps

In [5]:
# List the lab directory with human-readable sizes.
# NOTE(review): this lists /labs/laba02/, while the data-loading cells read
# from /labs/slaba02/ - confirm which directory is the intended one.
!hdfs dfs -ls -h /labs/laba02/

Found 2 items
-rw-r--r--   3 hdfs hdfs     81.0 K 2022-01-06 18:46 /labs/laba02/autousers.json
drwxr-xr-x   - hdfs hdfs          0 2022-01-06 18:46 /labs/laba02/logs


In [6]:
# Load the JSON records into a DataFrame, then inspect the inferred schema
# and the first three rows.
df = spark.read.format("json").load("/labs/slaba02/DO_record_per_line.json")
df.printSchema()
df.show(n=3)

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 3 rows



In [9]:
# Read the raw records again, keep only rows whose `lang` is en, es or ru,
# drop the `cat`, `provider` and `name` columns, and cache the result.
data0 = spark.read.json('/labs/slaba02/DO_record_per_line.json')
data1 = (
    data0
    .filter(col('lang').isin('en', 'es', 'ru'))
    .drop('cat', 'provider', 'name')
    .cache()
)

In [13]:
# Tokenization: split `desc` on commas, whitespace, dots and hyphens.
regex_tokenizer = RegexTokenizer(
    inputCol='desc',
    outputCol='words',
    pattern=r'[,\s\.\-]',
)
data2 = regex_tokenizer.transform(data1)

# Term frequencies, hashed into 10000 features.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='tf',
    numFeatures=10000,
)
data3 = hashing_tf.transform(data2).cache()

# Fit the IDF model on the term frequencies, then compute tf*idf.
tf_idf_model = IDF(
    inputCol='tf',
    outputCol='tfIdf',
    minDocFreq=2,
).fit(data3)
data4 = tf_idf_model.transform(data3).cache()
data4.show(3)

+--------------------+---+----+--------------------+--------------------+--------------------+
|                desc| id|lang|               words|                  tf|               tfIdf|
+--------------------+---+----+--------------------+--------------------+--------------------+
|This course intro...|  4|  en|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|
|This online cours...|  5|  en|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|
|We live in a digi...|  7|  en|[we, live, in, a,...|(10000,[493,572,7...|(10000,[493,572,7...|
+--------------------+---+----+--------------------+--------------------+--------------------+
only showing top 3 rows



In [17]:
def get_similar_courses(data, course_id, top_n=10, include_self=False):
    """Return the ids of the courses most similar to the given course.

    Similarity is the cosine between the ``tfIdf`` vector of ``course_id``
    and the ``tfIdf`` vector of each row of ``data``.

    Args:
        data: DataFrame with an ``id`` column and a ``tfIdf`` vector column.
        course_id: id of the reference course; it must be present in ``data``.
        top_n: maximum number of course ids to return (default 10).
        include_self: when False (default) the reference course is left out
            of its own recommendations; pass True to keep it in the ranking.

    Returns:
        dict ``{course_id: [id, ...]}`` with the ids ordered by decreasing
        cosine similarity; ties are broken by ascending id so the result is
        deterministic.

    Raises:
        IndexError: if ``data`` contains no row with ``id == course_id``.
    """
    # first() fetches a single row (or None) instead of collecting every match.
    target_row = data.where(data.id == course_id).first()
    if target_row is None:
        # IndexError is what the previous `collect()[0]` raised; keep the type.
        raise IndexError(f'course id {course_id!r} not found in data')

    # Densify only the reference vector, once, on the driver.
    u = DenseVector(target_row.tfIdf.toArray())
    u_norm = u.norm(2)

    @udf(returnType=FloatType())
    def cos_similarity(v):
        # `v` is used as stored (typically sparse): dot() and norm() are
        # available on both vector types, so no per-row densification is needed.
        denominator = v.norm(2) * u_norm
        if denominator == 0:
            # A zero vector has no direction; treat it as not similar.
            return 0.0
        return float(v.dot(u) / denominator)

    # Score every course against the reference course.
    scored = data.select('id', cos_similarity(data.tfIdf).alias('cosSimilarity'))
    if not include_self:
        # The reference course always matches itself with similarity 1.0,
        # so it would otherwise take the first slot of its own recommendations.
        scored = scored.where(col('id') != course_id)

    # Top-N by similarity; the secondary key makes the order of ties stable.
    top_courses = (
        scored
        .orderBy(col('cosSimilarity').desc(), col('id').asc())
        .limit(top_n)
    )
    return {course_id: [row.id for row in top_courses.collect()]}

In [19]:
# Reference course ids, keyed by the value of their `lang` column.
target_courses = {
    'en': [21617, 23126],
    'es': [11556, 16627],
    'ru': [13702, 16704],
}

# Accumulates one {course_id: [similar ids]} entry per reference course.
similar_courses = {}

for lang, course_id_list in target_courses.items():
    # The language filter depends only on `lang`, so build it once per language.
    data5 = data4.where(data4.lang == lang)
    for course_id in course_id_list:
        print(f'Calculating for lang {lang}, course {course_id}')
        similar_courses.update(get_similar_courses(data5, course_id))
print('done')

Calculating for lang en, course 21617
Calculating for lang en, course 23126
Calculating for lang es, course 11556
Calculating for lang es, course 16627
Calculating for lang ru, course 13702
Calculating for lang ru, course 16704
done


**Загрузка результата**

In [None]:
from json import dumps

# Serialize the recommendations as JSON text with a 4-space indent and show it.
json = dumps(similar_courses, indent=4)
print(json)

# Write the same JSON text to the result file.
with open('/data/home/margarita.cherentsova/lab02.json', 'w') as result_file:
    result_file.write(json)