# Initializing and getting session

In [1]:
import findspark
# Called before any pyspark import in this notebook.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable - confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf

In [3]:
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, DateType

In [4]:
import os
from pyspark.sql import SparkSession

# Request the spark-xml package through the submit arguments. This is set
# before the session is built below; the posts file is later read with
# format("xml").
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.13.0 pyspark-shell'

# Reuse an already running session if there is one, otherwise start a new one.
spark_session = SparkSession.builder.getOrCreate()

In [5]:
spark_session

In [6]:
# Use the public accessor for the underlying SparkContext rather than the
# private `_sc` attribute, which is an implementation detail of SparkSession.
sc = spark_session.sparkContext

# Opening files

## programming languages

In [7]:
# Load the list of programming languages: the first CSV line is treated as a
# header and column types are inferred from the data.
lang_df = (
    spark_session.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/workspace/data/programming-languages.csv")
)

In [8]:
lang_df

DataFrame[name: string, wikipedia_url: string]

In [9]:
# Pull the single selected column to the driver as a plain Python list of names.
langs_list = [name for (name,) in lang_df.select('name').collect()]

In [10]:
langs_list[:20]

['A# .NET',
 'A# (Axiom)',
 'A-0 System',
 'A+',
 'A++',
 'ABAP',
 'ABC',
 'ABC ALGOL',
 'ABSET',
 'ABSYS',
 'ACC',
 'Accent',
 'Ace DASL',
 'ACL2',
 'ACT-III',
 'Action!',
 'ActionScript',
 'Ada',
 'Adenine',
 'Agda']

## posts

In [11]:
# Read the posts sample with the XML data source; every <row> element becomes
# one DataFrame row.
posts_df = (
    spark_session.read
    .format("xml")
    .option("rowTag", "row")
    .load('file:///home/workspace/data/posts_sample.xml')
)

In [12]:
posts_df.head()

Row(_AcceptedAnswerId=7, _AnswerCount=13, _Body="<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code></p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n", _ClosedDate=None, _CommentCount=2, _CommunityOwnedDate=datetime.datetime(2012, 10, 31, 16, 42, 47, 213000), _CreationDate=datetime.datetime(2008, 7, 31, 21, 42, 52, 667000), _FavoriteCount=48, _Id=4, _LastActivityDate=datetime.datetime(2019, 7, 19, 1, 39, 54, 173000), _LastEditDate=datetime.datetime(2019, 7, 19, 1, 39, 54, 173000), _LastEditorDisplayName='Rich B', _LastEditorUserId=3641067, _OwnerDisplayName=None, _OwnerUse

In [13]:
posts_df.head()._Tags

'<c#><floating-point><type-conversion><double><decimal>'

# Helper functions

In [14]:
def get_id_lang_year_by_row(row, languages=None):
    """Map a post row to ``(post id, language, creation year)``.

    The row's ``_Tags`` string is searched, case-insensitively, for a tag of
    the form ``<language>``. Languages are tried in the order given and the
    first match wins, so that order decides which language is reported for a
    post tagged with several of them.

    Args:
        row: Object exposing ``_Id``, ``_Tags`` and ``_CreationDate``
            attributes; ``_CreationDate`` must have a ``year`` attribute.
        languages: Iterable of language names to look for. When omitted, the
            notebook-level ``langs_list`` is used.

    Returns:
        ``(row._Id, language, row._CreationDate.year)`` for the first matching
        language, or ``None`` when the row has no tags or none of the
        languages appears among them.
    """
    if languages is None:
        languages = langs_list
    if row._Tags is None:
        # Untagged post: nothing to match against.
        return None

    # Lower-case the tag string once instead of once per candidate language.
    tags = row._Tags.lower()
    for lang in languages:
        if f'<{lang.lower()}>' in tags:
            return (row._Id, lang, row._CreationDate.year)
    return None

# Solution

In [15]:
# Keep only tagged posts, turn each into an (id, language, year) triple and
# drop the posts for which no known language tag was found.
id_lang_year_rdd = (
    posts_df.rdd
    .filter(lambda post: post._Tags is not None)
    .map(get_id_lang_year_by_row)
    .filter(lambda triple: triple is not None)
)

In [16]:
# Count posts per (year, language) pair and flatten the result into
# (year, language, count) triples.
year_lang_count_rdd = (
    id_lang_year_rdd
    .map(lambda triple: ((triple[2], triple[1]), triple))
    .aggregateByKey(
        0,
        lambda count, _post: count + 1,        # one more post within a partition
        lambda left, right: left + right,      # merge partial counts across partitions
    )
    .map(lambda pair: (*pair[0], pair[1]))
)

In [17]:
# Keep the aggregated counts cached: the top-10 loop in the next cell filters
# this same RDD once per year, so without caching it would be recomputed for
# each of those jobs.
year_lang_count_rdd.cache()

PythonRDD[42] at RDD at PythonRDD.scala:53

In [18]:
# Collect, year by year, the ten (year, language, count) triples with the
# highest count. `top` is an action, so each year's lambda is evaluated while
# `year` still holds that iteration's value.
result = []

for year in range(2010, 2021):
    counts_for_year = year_lang_count_rdd.filter(lambda rec: rec[0] == year)
    result += counts_for_year.top(10, key=lambda rec: rec[2])

In [19]:
# Turn the collected triples into a DataFrame and give the columns readable names.
column_names = ("Year", "Language", "Count")
top_languages = sc.parallelize(result).toDF().toDF(*column_names)

In [20]:
# 110 = 11 years (2010-2020) x at most 10 languages per year, i.e. enough to
# print every row of the result.
top_languages.show(n=110)

+----+-----------+-----+
|Year|   Language|Count|
+----+-----------+-----+
|2010|       Java|   52|
|2010| JavaScript|   44|
|2010|        PHP|   42|
|2010|     Python|   25|
|2010|Objective-C|   23|
|2010|          C|   20|
|2010|       Ruby|   11|
|2010|     Delphi|    7|
|2010|          R|    3|
|2010|       Perl|    3|
|2011|        PHP|   97|
|2011|       Java|   92|
|2011| JavaScript|   82|
|2011|     Python|   35|
|2011|Objective-C|   33|
|2011|          C|   24|
|2011|       Ruby|   17|
|2011|     Delphi|    8|
|2011|       Perl|    8|
|2011|       Bash|    7|
|2012|        PHP|  136|
|2012| JavaScript|  129|
|2012|       Java|  124|
|2012|     Python|   65|
|2012|Objective-C|   45|
|2012|          C|   27|
|2012|       Ruby|   25|
|2012|       Bash|    9|
|2012|          R|    9|
|2012|     MATLAB|    6|
|2013| JavaScript|  196|
|2013|       Java|  191|
|2013|        PHP|  173|
|2013|     Python|   87|
|2013|Objective-C|   40|
|2013|          C|   36|
|2013|       Ruby|   30|


In [21]:
# temp = year_lang_count_rdd\
# .keyBy(lambda row: row[0])\
# .groupByKey()\
# .mapValues(lambda one_year_table: sc.parallelize(one_year_table).top(10, key=lambda row: row[2]))

# Something like this is prohibited: SparkContext cannot be used inside a transformation like this.
# The error says:
# "Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063."

In [22]:
# Task solution without the requirement to select only the top 10

# sorted_languages = posts_df.rdd\
#     .filter(lambda row: row._Tags is not None)\
#     .map(get_id_lang_year_by_row)\
#     .filter(lambda row: row is not None)\
#     .keyBy(lambda row: (row[2], row[1]))\
#     .aggregateByKey( 
#         0,
#         lambda acc, value: acc + 1,
#         lambda acc1, acc2: acc1 + acc2,
#     )\
#     .map(lambda x: (x[0][0], x[0][1], x[1]))\
#     .sortBy(lambda row: (row[0], row[2]), ascending=False)\
#     .toDF()

# sorted_languages.collect()

# Rename cols and save DataFrame to .parquet

In [23]:
!hadoop fs -ls

Found 4 items
drwxr-xr-x   - root root          4 2021-12-14 18:01 MegaBestLanguages.parquet
drwxr-xr-x   - root root          1 2021-12-14 18:23 rail
drwxr-xr-x   - root root          0 2021-12-14 18:06 spark-warehouse
drwxr-xr-x   - root root          3 2021-12-23 18:32 top_languages.parquet


In [24]:
!hadoop fs -rm -R top_languages.parquet

Deleted top_languages.parquet


In [25]:
# Overwrite any previous export: with the default save mode the write raises
# if the target already exists, which makes this cell fail on a re-run unless
# the directory was removed by hand first.
top_languages.write.mode("overwrite").parquet("top_languages.parquet")

In [26]:
!hadoop fs -ls top_languages.parquet

Found 3 items
-rwxr-xr-x   3 root root          0 2021-12-23 18:36 top_languages.parquet/_SUCCESS
-rwxr-xr-x   3 root root       1318 2021-12-23 18:36 top_languages.parquet/part-00000-1563ba61-c6e4-4dbf-8455-2cd97c78b8ec-c000.snappy.parquet
-rwxr-xr-x   3 root root       1343 2021-12-23 18:36 top_languages.parquet/part-00001-1563ba61-c6e4-4dbf-8455-2cd97c78b8ec-c000.snappy.parquet
