In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session named "Task2" that the cells below run on
spark = (
    SparkSession.builder
    .appName("Task2")
    .getOrCreate()
)

In [2]:
# Read the headlines CSV from HDFS, treating the first line of the file as the header
df = spark.read.csv(
    'hdfs://localhost:9000/input/india-news-headlines.csv',
    header=True,
)
# De-duplicate the entire DataFrame (rows identical in every column collapse to one)
df_distinct = df.dropDuplicates()
# Expose the de-duplicated rows as an RDD for the RDD-based steps
news_rdd = df_distinct.rdd

# 1. Count the top 10 years with the most news releases

In [3]:
# Map every record to a (key, 1) pair, where the key is the first four characters
# of column 0 (presumably the year prefix of the publish date — verify against the CSV)
news_year_rdd = news_rdd.map(lambda record: (record[0][0:4], 1))
# Wrap the pairs in a two-column DataFrame so they can be previewed as a table
news_year_df = spark.createDataFrame(news_year_rdd, schema=["Year", "Count"])
# Print the first 10 rows
news_year_df.show(n=10)

+----+-----+
|Year|Count|
+----+-----+
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
|2001|    1|
+----+-----+
only showing top 10 rows



In [4]:
# Sum the per-record 1s to get the number of headlines per year
news_year_count_rdd = news_year_rdd.reduceByKey(lambda a, b: a + b)
# Fetch only the 10 (year, count) pairs with the largest counts.
# takeOrdered avoids sorting the whole RDD just to read its first 10 elements;
# negating the count turns its ascending order into "largest first".
# The result is a plain Python list, so it gets its own name and
# `news_year_count_rdd` stays a real RDD that can be reused.
news_year_top_10_rows = news_year_count_rdd.takeOrdered(10, key=lambda pair: -pair[1])
# Build a DataFrame from the local list and specify the column names
news_year_top_10 = spark.createDataFrame(news_year_top_10_rows, ["Year", "Count"])
# Show DataFrame
news_year_top_10.show(10)

+----+------+
|Year| Count|
+----+------+
|2016|254168|
|2018|253294|
|2014|253240|
|2013|253098|
|2015|252966|
|2012|252915|
|2017|251375|
|2011|240784|
|2009|203990|
|2020|182087|
+----+------+



# 2. Top 10 most frequent words in a specified news category

In [7]:
# Stop words: common English words to leave out of the word-frequency count
stopwords = {"the", "is", "in", "at", "which", "on", "and", "a", "to", "of"}

In [11]:
 #Filtering specific categories of data
filter_category = "history"
# Keep only the records whose category (column 1) equals the chosen category
filter_category_rdd = news_rdd.filter(lambda row: row[1] == filter_category)
# Split each headline (column 2) on whitespace into (category, word) pairs and
# drop stop words in the same pass (the comparison lower-cases the word first).
# A missing headline is treated as empty text, so a null field yields no words
# instead of failing the job with an AttributeError on None.split().
flatMap_rdd = filter_category_rdd.flatMap(
    lambda row: [
        (row[1], word)
        for word in (row[2] or "").split()
        if word.lower() not in stopwords
    ]
)
# Converts an RDD to a DataFrame and specifies the column names
flatMap_df = spark.createDataFrame(flatMap_rdd, ["category", "word"])
# Show DataFrame
flatMap_df.show(10)

+--------+------------+
|category|        word|
+--------+------------+
| history|        1930|
| history|    Hamilton|
| history|     British|
| history|      Empire|
| history|       Games|
| history|        1958|
| history|     Cardiff|
| history|     British|
| history|      Empire|
| history|Commonwealth|
+--------+------------+
only showing top 10 rows



In [12]:
# Turn every (category, word) pair into (word, 1), then add up the 1s per word
word_ones_rdd = flatMap_rdd.map(lambda pair: (pair[1], 1))
word_count_rdd = word_ones_rdd.reduceByKey(lambda left, right: left + right)
# Order by count, largest first, and bring the first 10 (word, count) tuples to the driver
words_by_count_rdd = word_count_rdd.sortBy(lambda item: item[1], ascending=False)
top_10_words = words_by_count_rdd.take(10)
# Build a DataFrame with named columns from that local list
top_10_words_df = spark.createDataFrame(top_10_words, schema=["word", "count"])
# Print up to 10 rows
top_10_words_df.show(n=10)

+------------+-----+
|        word|count|
+------------+-----+
|       Games|   19|
|Commonwealth|   15|
|     British|   10|
|      Empire|    8|
|     Thunder|    4|
|        Down|    4|
|       Under|    4|
|   Wimbledon|    4|
|        clay|    3|
|       Open:|    3|
+------------+-----+

