# preparations

## imports

In [1]:
import os
import sys
import json

## spark 

In [None]:
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

## lab conditions

In [3]:
# condition structure id|language|course_name
given_courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']
]

# data analysis

## data load

In [5]:
courses_path = '/labs/slaba02/DO_record_per_line.json'
courses_info = spark.read.json(courses_path)
print(f'number of observations: {courses_info.count()}')
courses_info.show(5)

number of observations: 28153
+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



## recommendation pipeline

In [6]:
# imports
import re                                                  # for regular expressions
from pyspark.ml.feature import StopWordsRemover            # for stopwords
from pyspark.ml.feature import HashingTF                   # for hashing term frequences
from pyspark.ml.feature import IDF                         # IDF
from pyspark.ml import Pipeline                            # pipeline consturctor
from pyspark.ml.feature import Tokenizer                   # for tokenization
           
from pyspark.sql.functions import pandas_udf               # for regular expressions
from pyspark.sql.functions import regexp_replace           # for regular expressions
import pyspark.sql.types as T
from pyspark.sql.window import Window                      # for selecting top-10
from pyspark.sql.functions import col, row_number          ##############################
  
from pyspark.ml.feature import Normalizer                  # for L2 norm computation
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix                         

In [7]:
# start with combine all information at one column `full_info`
# .1) separate categories by '/'
courses_info = courses_info.withColumn('cat_cleaned', regexp_replace('cat', '/', ' '))
courses_info = courses_info.withColumn('cat_cleaned', regexp_replace('cat_cleaned', '_', ' '))
courses_info = courses_info.withColumn('cat_cleaned', regexp_replace('cat_cleaned', '\|', ' '))

courses_info = courses_info.withColumn(
    'full_info', F.concat_ws(' ', F.col('cat_cleaned'), F.col('desc'), F.col('name'))
)

In [8]:
# 1) remove stopwords and punctuation from description...
# 1.1) remove punсtuation firstly
regex_punctuations = '[!@"“’«»#$%&\'()*+,—/:;<=>?^_`{|}~\[\]]'
courses_info = courses_info.withColumn('cleaned_desc', regexp_replace('full_info', regex_punctuations, ''))

# 1.2) lower string values
courses_info = courses_info.withColumn('cleaned_desc', F.lower(F.col('cleaned_desc')))

# 1.3) removing number either
courses_info = courses_info.withColumn('cleaned_desc', regexp_replace('cleaned_desc', r'[0-9]', ''))

# 1.4) delete stopWords
# 1.4.1) tokenize description
tokenizer = Tokenizer(inputCol='cleaned_desc', outputCol='tokenz')
tokenized = tokenizer.transform(courses_info)

# 1.4.2) fit StopWordsRemover
remover = StopWordsRemover(inputCol='tokenz', outputCol='filtered_tokenz')
filtered_tokenized = remover.transform(tokenized)

In [9]:
# calculate metric
# 1) calculate term frequency (TF)
hashingTF = HashingTF(inputCol='filtered_tokenz', outputCol='features')
hashingTF.setNumFeatures(10000)
TF = hashingTF.transform(filtered_tokenized)

# 2) calculate IDF
IDF_ = IDF(inputCol='features', outputCol='IDF').fit(TF)
TF_IDF = IDF_.transform(TF)

In [30]:
# 3) get all candidates for recommendations
columns = ['id', 'lang', 'IDF', 'name']
course_candidates_ids = [values[0] for values in given_courses]
course_candidates = TF_IDF.filter(F.col('id').isin(course_candidates_ids)).select(columns)
course_candidates_spark = course_candidates.select('id').distinct().rdd.flatMap(lambda value: value).collect()
assert set(course_candidates_ids) - set(course_candidates_spark) == set([])
assert set(course_candidates_spark) - set(course_candidates_ids) == set([])

# 3.1) join to them, all other courses on candidates' language
course_recommended = TF_IDF.filter(
    ~F.col('id').isin(course_candidates_ids)
).select([F.col(column).alias(column + '_recommended') for column in columns])
course_candidates = course_candidates.crossJoin(course_recommended)
course_candidates = course_candidates.filter(F.col('lang') == F.col('lang_recommended'))

In [31]:
# 3.2) cosine similarity calculation
@udf(returnType=T.DoubleType())
def cosine_similarity(v1, v2):
    return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))

course_candidates = course_candidates.withColumn(
    'cosine_similarity', cosine_similarity(F.col('IDF'), F.col('IDF_recommended'))
)

In [32]:
# 4) result preparation
# 4.1) check the same languages
lab02 = course_candidates.filter(F.col('lang') == F.col('lang_recommended'))

# 4.2) select only top-10
window_ = Window.partitionBy("id").orderBy(
    F.asc('id'), F.desc('cosine_similarity'), F.asc('name_recommended'), F.asc('id_recommended')
)
lab02 = lab02.withColumn("row", row_number().over(window_))
lab02 = lab02.filter(col("row") <= 10).orderBy(
    F.asc('id'), F.desc('cosine_similarity'), F.asc('name_recommended'), F.asc('id_recommended')
)

In [33]:
# 4.3) compress into .json
result_json = {
    row['id']: row['collect_set(id_recommended)'] for row in \
    lab02.select('id', 'id_recommended').groupby("id").agg(F.collect_set("id_recommended")).collect()
} 

# save json
with open('lab02.json', 'w') as file:
    json.dump(result_json, file)

In [34]:
result_json

{23126: [25782, 13348, 14760, 2724, 24419, 13665, 13782, 23756, 15909, 20638],
 16627: [13529, 13021, 12247, 11431, 25010, 12863, 12660, 11575, 5687, 17964],
 13702: [795, 948, 8082, 956, 1052, 1110, 853, 1216, 8313, 864],
 16704: [1376, 1236, 18331, 8154, 1426, 1229, 1164, 1365, 20105, 8203],
 11556: [23357, 9289, 10447, 16488, 19330, 22710, 13461, 10384, 7833, 468],
 21617: [21616, 21703, 21506, 21608, 21492, 21609, 21624, 21675, 21508, 21700]}

# spark context stop

In [35]:
spark.stop()