In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, regexp_replace, year, count, row_number, lower, to_timestamp
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType
import xml.etree.ElementTree as ET

spark = SparkSession.builder \
    .appName("LR2") \
    .getOrCreate()


In [42]:
xml_file = 'posts_sample.xml'
languages_file = "programming-languages.csv"


In [43]:
posts_schema = StructType([
    StructField("CreationDate", StringType(), True),
    StructField("Tags", StringType(), True)
])

In [44]:
# Функция для парсинга XML и создания RDD
def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    data = []
    for row in root.findall('row'):
        creation_date = row.attrib.get('CreationDate')
        tags = row.attrib.get('Tags')
        if creation_date and tags:
            tags_clean = regexp_replace(col("Tags"), "[<>]", " ").alias("tags_clean")
            tags_list = tags.replace('<', ' ').replace('>', ' ').strip().split() #tags.split(">")
            for tag in tags_list:
                data.append((creation_date, tag))
    return data

In [47]:
# Создание DataFrame из XML данных
posts_data = parse_xml(xml_file)
posts_df = spark.createDataFrame(posts_data, schema=StructType([StructField("CreationDate", StringType(), True),StructField("Tag", StringType(), True)]))
posts_df = posts_df.withColumn("Year", year(to_timestamp(col("CreationDate"))))
posts_df.show(5)

+--------------------+---------------+----+
|        CreationDate|            Tag|Year|
+--------------------+---------------+----+
|2008-07-31T21:42:...|             c#|2008|
|2008-07-31T21:42:...| floating-point|2008|
|2008-07-31T21:42:...|type-conversion|2008|
|2008-07-31T21:42:...|         double|2008|
|2008-07-31T21:42:...|        decimal|2008|
+--------------------+---------------+----+
only showing top 5 rows



In [48]:
languages_df = spark.read.csv(languages_file, header=True)
languages_df = languages_df.withColumn("name", lower(col("name")))
posts_df = posts_df.withColumn("Tag", lower(col("Tag")))

# Выполнение join для фильтрации только тех тегов, которые есть в списке языков
filtered_df = posts_df.join(languages_df, posts_df["Tag"] == languages_df["name"], "inner")
filtered_df.show(5)

+--------------------+----+----+----+--------------------+
|        CreationDate| Tag|Year|name|       wikipedia_url|
+--------------------+----+----+----+--------------------+
|2010-09-23T12:13:...|java|2010|java|https://en.wikipe...|
|2010-09-26T17:07:...| php|2010| php|https://en.wikipe...|
|2010-09-30T18:27:...|ruby|2010|ruby|https://en.wikipe...|
|2010-10-01T11:52:...|   c|2010|   c|https://en.wikipe...|
|2010-10-04T21:05:...| php|2010| php|https://en.wikipe...|
+--------------------+----+----+----+--------------------+
only showing top 5 rows



In [49]:
tag_counts = filtered_df.groupBy("Year", "Tag").agg(count("*").alias("Count"))

# Определение окна для ранжирования языков по популярности в каждом году
window_spec = Window.partitionBy("Year").orderBy(col("Count").desc())

# Вычисление ранга для каждого языка в каждом году и фильтрация топ-10
top10_df = tag_counts.withColumn("Rank", row_number().over(window_spec)) \
                     .filter(col("Rank") <= 10) \
                     .orderBy("Year", "Rank")
top10_df.show(15)

+----+----------+-----+----+
|Year|       Tag|Count|Rank|
+----+----------+-----+----+
|2008|      java|    5|   1|
|2008|      ruby|    4|   2|
|2008|         c|    2|   3|
|2008|javascript|    2|   4|
|2008|       x++|    1|   5|
|2008|    python|    1|   6|
|2008|        io|    1|   7|
|2008|    groovy|    1|   8|
|2008|       php|    1|   9|
|2009|      java|   28|   1|
|2009|    python|   23|   2|
|2009|       php|   22|   3|
|2009|javascript|   12|   4|
|2009|      ruby|    8|   5|
|2009|    delphi|    7|   6|
+----+----------+-----+----+
only showing top 15 rows



In [50]:
# Сохранение DataFrame в Parquet
top10_df.write.mode("overwrite").parquet("top.parquet")

In [51]:
from google.colab import files
# Архивация результата
!zip -r top.zip top.parquet
files.download("top.zip")

  adding: top.parquet/ (stored 0%)
  adding: top.parquet/._SUCCESS.crc (stored 0%)
  adding: top.parquet/part-00000-4925d76d-45bd-451e-b5b8-25c927b9f5bc-c000.snappy.parquet (deflated 36%)
  adding: top.parquet/_SUCCESS (stored 0%)
  adding: top.parquet/.part-00000-4925d76d-45bd-451e-b5b8-25c927b9f5bc-c000.snappy.parquet.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
result_df = spark.read.parquet("top.parquet")
result_df.show(truncate=False, n=500)

+----+-----------+-----+----+
|Year|Tag        |Count|Rank|
+----+-----------+-----+----+
|2008|java       |5    |1   |
|2008|ruby       |4    |2   |
|2008|c          |2    |3   |
|2008|javascript |2    |4   |
|2008|x++        |1    |5   |
|2008|python     |1    |6   |
|2008|io         |1    |7   |
|2008|groovy     |1    |8   |
|2008|php        |1    |9   |
|2009|java       |28   |1   |
|2009|python     |23   |2   |
|2009|php        |22   |3   |
|2009|javascript |12   |4   |
|2009|ruby       |8    |5   |
|2009|delphi     |7    |6   |
|2009|c          |6    |7   |
|2009|objective-c|6    |8   |
|2009|haskell    |4    |9   |
|2009|bash       |3    |10  |
|2010|java       |52   |1   |
|2010|php        |46   |2   |
|2010|javascript |44   |3   |
|2010|python     |26   |4   |
|2010|objective-c|23   |5   |
|2010|c          |20   |6   |
|2010|ruby       |12   |7   |
|2010|delphi     |8    |8   |
|2010|applescript|3    |9   |
|2010|r          |3    |10  |
|2011|php        |102  |1   |
|2011|java