In [24]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [25]:
import os
from pyspark import SparkContext, SparkConf
from typing import NamedTuple
from datetime import datetime
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql import functions as func
from pyspark.sql.window import Window
import numpy as np
import re
import findspark
from pyspark.sql import SparkSession
# Export the JDK and Spark locations BEFORE calling findspark.init():
# init() consults SPARK_HOME to locate the Spark installation, so setting it
# afterwards would leave findspark guessing at a different location.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

findspark.init()

In [26]:
# Create (or reuse) the notebook's Spark session. It runs with the local[*]
# master and asks Spark to fetch the spark-xml package, which provides the
# 'xml' data source used to read the posts file.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("LR2")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.14.0")
    .getOrCreate()
)

In [27]:
# Input locations inside the runtime's /content directory.
prog_lang_path: str = '/content/programming-languages.csv'  # CSV of language names
posts_path: str = '/content/posts_sample.xml'               # XML dump of posts

In [28]:
# Load the posts dump with the XML data source; every <row> element becomes
# one DataFrame row. Then print the inferred schema for inspection.
posts_data = (
    spark.read
    .format('xml')
    .option('rowTag', 'row')
    .load(posts_path)
)
print("Posts")
posts_data.printSchema()

Posts
root
 |-- _AcceptedAnswerId: long (nullable = true)
 |-- _AnswerCount: long (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: long (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: long (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: long (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: long (nullable = true)
 |-- _ParentId: long (nullable = true)
 |-- _PostTypeId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)



In [29]:
# Load the programming-language list from CSV: the first line is the header
# and column types are inferred. Then print the resulting schema.
prog_lang_data = spark.read.csv(
    prog_lang_path,
    header=True,
    inferSchema=True,
    timestampFormat='M/d/y H:m',
)
print("Programming languages")
prog_lang_data.printSchema()

Programming languages
root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullable = true)



In [30]:
# Lower-cased names of all known programming languages, collected to the driver.
# Rows whose name is null (e.g. an empty CSV cell) are skipped so that .lower()
# is never called on None. Fields are read by key rather than by attribute.
language_names = [
    row["name"].lower()
    for row in prog_lang_data.select("name").collect()
    if row["name"] is not None
]

# Identify programming languages among a post's tags.
def extract_programming_languages(tag_string, known_languages=None):
    """Return the tags of a post that are known programming languages.

    Args:
        tag_string: Raw tag string in the form ``"<tag1><tag2>..."``; may be
            ``None`` or empty.
        known_languages: Optional container of lower-cased language names to
            match against. Defaults to the module-level ``language_names``.
            Passing a ``set``/``frozenset`` gives constant-time lookups.

    Returns:
        A list of matching tags, lower-cased, in the order they appear in
        ``tag_string``. Empty when ``tag_string`` is empty/``None`` or when
        nothing matches.
    """
    if not tag_string:
        return []
    if known_languages is None:
        # Resolved at call time so the default follows the notebook's global list.
        known_languages = language_names
    # The reference names are lower-cased, so normalise the tags the same way;
    # otherwise a tag such as "<Java>" would never match "java".
    found_tags = re.findall(r'<([^<>]*)>', tag_string.lower())
    return [tag for tag in found_tags if tag in known_languages]


# Wrap the tag parser as a Spark UDF that returns an array of strings.
extract_langs_udf = func.udf(
    f=extract_programming_languages,
    returnType=ArrayType(StringType()),
)

# Inclusive range of post-creation years covered by the report. Defined once so
# the filter and the reporting loop below cannot drift apart.
FIRST_YEAR, LAST_YEAR = 2010, 2020

# Post processing: one output row per (post, matched language), carrying the
# post's creation year and its view count.
processed_posts = (posts_data
                  .withColumn("languages", extract_langs_udf(posts_data._Tags))
                  .withColumn("post_year", func.year(posts_data._CreationDate))
                  .select("post_year", func.explode("languages").alias("language"), "_ViewCount"))

# Keep only posts created within the reporting period.
filtered_posts = processed_posts.filter(processed_posts.post_year.between(FIRST_YEAR, LAST_YEAR))

# Popularity of a language in a year = sum of the view counts of its posts.
language_popularity = (filtered_posts
                      .groupBy("post_year", "language")
                      .agg(func.sum("_ViewCount").alias("total_views")))

# Rank languages within each year by total views, highest first. The language
# name is a secondary sort key: row_number() assigns arbitrary ranks to ties,
# so without it the ranking could change between runs.
ranking_spec = (Window
                .partitionBy('post_year')
                .orderBy(func.desc('total_views'), func.asc('language')))
ranked_languages = language_popularity.withColumn('position', func.row_number().over(ranking_spec))

# Keep the ten top-ranked languages of every year.
top_10_languages = ranked_languages.where(ranked_languages['position'] <= 10)

# Persist the report, replacing any previous output at the same path.
top_10_languages.write.mode('overwrite').format('parquet').save('top_10_languages_yearly.parquet')

# Read the saved report back and print one table per year, ordered by rank.
saved_results = spark.read.parquet('top_10_languages_yearly.parquet')
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    print(f"\n=== ТОП-10 языков программирования за {year} год ===")
    (saved_results
        .filter(saved_results.post_year == year)
        .orderBy("position")
        .show(truncate=False))


=== ТОП-10 языков программирования за 2010 год ===
+---------+-----------+-----------+--------+
|post_year|language   |total_views|position|
+---------+-----------+-----------+--------+
|2010     |php        |1189629    |1       |
|2010     |java       |563211     |2       |
|2010     |javascript |316131     |3       |
|2010     |objective-c|97009      |4       |
|2010     |ruby       |76215      |5       |
|2010     |c          |66587      |6       |
|2010     |python     |60672      |7       |
|2010     |matlab     |51865      |8       |
|2010     |applescript|32305      |9       |
|2010     |delphi     |13065      |10      |
+---------+-----------+-----------+--------+


=== ТОП-10 языков программирования за 2011 год ===
+---------+-----------+-----------+--------+
|post_year|language   |total_views|position|
+---------+-----------+-----------+--------+
|2011     |javascript |809078     |1       |
|2011     |java       |389834     |2       |
|2011     |php        |246770     |3    