In [23]:
from pyspark.sql import *
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StringType
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("DataAnalysis") \
    .getOrCreate()

In [24]:
# Define schema for the CSV file
schema = StructType().add("publish_date", StringType()) \
                     .add("headline_category", StringType()) \
                     .add("headline_text", StringType())

In [25]:
# Read CSV file with explicit schema
df = spark.read.csv("hdfs://localhost:9000/data/india-news-headlines_copy_1.csv", 
                    header=True, schema=schema)

In [26]:
# Register DataFrame as a temporary view
df.createOrReplaceTempView("india_news_headlines")

In [27]:
import time
# Measure execution time for DataFrame operations
start_time_df = time.time()

In [29]:
# Perform SQL operations using DataFrame API
result_df = spark.sql("""
    WITH temp_table AS (
        SELECT publish_date, headline_category, headline_text
        FROM india_news_headlines
        WHERE publish_date BETWEEN '20010102' AND '20010110'
    )
    SELECT headline_category, COUNT(*) AS num_headlines
    FROM temp_table
    GROUP BY headline_category
    ORDER BY num_headlines DESC
    LIMIT 10
""")

In [30]:
end_time_df = time.time()
execution_time_df = end_time_df - start_time_df

In [31]:
# Display DataFrame result
result_df.show()

+--------------------+-------------+
|   headline_category|num_headlines|
+--------------------+-------------+
|             unknown|          630|
|          city.patna|            5|
|               india|            2|
|entertainment.eng...|            2|
|entertainment.hin...|            2|
|      city.bengaluru|            2|
|          city.delhi|            1|
|           edit-page|            1|
|business.india-bu...|            1|
+--------------------+-------------+



In [45]:
import time
from pyspark import SparkContext

# Create SparkContext
sc = SparkContext.getOrCreate()

In [46]:
# Load data into RDD
rdd = sc.textFile("hdfs://localhost:9000/data/india-news-headlines_copy_1.csv") \
        .map(lambda line: line.split(","))

In [47]:
# Filter data based on date range and map to key-value pairs
filtered_rdd = rdd.filter(lambda row: '20010102' <= row[0] <= '20010110') \
                  .map(lambda row: (row[1], 1))

In [48]:
# Perform RDD operations
start_time_rdd = time.time()

In [49]:
# Reduce by key to aggregate counts
aggregated_rdd = filtered_rdd.reduceByKey(lambda a, b: a + b)

In [51]:
# Swap key-value pairs
swapped_rdd = aggregated_rdd.map(lambda x: (x[1], x[0]))

In [56]:
try:
    # Sort by count in descending order
    sorted_rdd = swapped_rdd.sortByKey(ascending=False)

    # Take top 10 records
    result_rdd = sorted_rdd.take(10)

    # Display RDD result
    print("Top 10 records after sorting:")
    for item in result_rdd:
        print(item)

except Exception as e:
    print("Error occurred while sorting RDD:", e)

Error occurred while sorting RDD: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 43.0 failed 1 times, most recent failure: Lost task 0.0 in stage 43.0 (TID 74) (DESKTOP-9KE5AV4 executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:181)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.

In [61]:
# Measure execution time for DataSet operations
start_time_ds = time.time()


In [62]:
# Perform SQL operations using DataSet API directly from DataFrame
result_ds = df.filter(func.col("publish_date").between('20010102', '20010110')) \
              .groupby("headline_category") \
              .count() \
              .orderBy("count", ascending=False) \
              .limit(10)

In [63]:
end_time_ds = time.time()

In [64]:
execution_time_ds = end_time_ds - start_time_ds

In [65]:
# Display DataSet result
result_ds.show()

+--------------------+-----+
|   headline_category|count|
+--------------------+-----+
|             unknown|  630|
|          city.patna|    5|
|               india|    2|
|entertainment.eng...|    2|
|entertainment.hin...|    2|
|      city.bengaluru|    2|
|          city.delhi|    1|
|           edit-page|    1|
|business.india-bu...|    1|
+--------------------+-----+



In [66]:
# Print execution times
print("Execution time using DataFrame: {} seconds".format(execution_time_df))
print("Execution time using DataSet: {} seconds".format(execution_time_ds))

Execution time using DataFrame: 10.28837776184082 seconds
Execution time using DataSet: 24.74338126182556 seconds


# Task 2

In [1]:
# Import necessary libraries
from pyspark import SparkContext

# Create SparkContext
sc = SparkContext()

In [2]:
# Load data from HDFS into RDD
rdd = sc.textFile("hdfs://localhost:9000/data/india-news-headlines_copy_1.csv")

In [3]:
# Define your processing functions
def process_map(line):
    # Perform map operation
    return line.split(',')

def process_flatMap(line):
    # Perform flatMap operation
    return line.split(',')

def process_reduceByKey(line):
    # Perform reduceByKey operation
    return (line[1], 1)

In [4]:
# Apply RDD transformations
rdd_map = rdd.map(process_map)
rdd_flatMap = rdd.flatMap(process_flatMap)
rdd_reduceByKey = rdd_map.map(process_reduceByKey).reduceByKey(lambda x, y: x + y)

In [6]:
#results
print("RDD with map transformation:")
try:
    print(rdd_map.take(5))
except Exception as e:
    print("An error occurred while executing take(5) on RDD with map transformation:", str(e))

print("\nRDD with flatMap transformation:")
try:
    print(rdd_flatMap.take(10))
except Exception as e:
    print("An error occurred while executing take(10) on RDD with flatMap transformation:", str(e))

print("\nRDD with reduceByKey transformation:")
try:
    print(rdd_reduceByKey.take(5))
except Exception as e:
    print("An error occurred while executing take(5) on RDD with reduceByKey transformation:", str(e))


RDD with map transformation:
An error occurred while executing take(5) on RDD with map transformation: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (DESKTOP-9KE5AV4 executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:181)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.com