# Yahoo Topic Classification

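Here we load the original train and test CSV files into PySpark DataFrames, reduce them to a quarter of their size with stratified sampling, and save the reduced data. (An earlier approach reduced the data in Hive first; see "Using Original tables" below for why that was dropped.)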

## Import Libraries

In [1]:
# Basic
import pandas as pd
import numpy as np
import findspark
import pyspark
from pyspark import SparkFiles
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.window import Window

In [2]:
# Locate the local Spark installation and make pyspark importable from it,
# then display the Spark home directory that was found.
findspark.init()
spark_home = findspark.find()
spark_home

'/mnt/c/Users/joluw/hadoop/spark-3.2.2'

In [3]:
# Start the Spark session used to read and write the data below.
# getOrCreate() returns the already-running session if there is one, so
# re-running this cell does not start a second session.
# The app name is what identifies this notebook's job in the Spark UI.
spark = (
    SparkSession.builder
    .appName("YahooTopicClassification")
    .getOrCreate()
)

## Using Original tables

We found that reducing the data with PySpark is more convenient than doing it in Hive. When we used Hive, the output table had delimiter issues, which led to parsing errors.

In [4]:
# Import the raw train/test splits into Spark DataFrames.
# The Google Drive URL wasn't working for the large train file, so the
# datasets were uploaded to the data folder manually.


def load_split(path):
    """Read one headerless CSV split and give its columns meaningful names.

    The file is read without a header, so Spark names the columns
    `_c0`, `_c1`, ...; the first four are selected and renamed.

    Args:
        path: Location of the CSV file, as accepted by `spark.read.csv`.

    Returns:
        A DataFrame with the columns `topic` (cast to int), `q_title`,
        `q_content` and `answer`.
    """
    raw = spark.read.csv(path)
    return raw.select(
        f.col("_c0").cast("int").alias("topic"),
        f.col("_c1").alias("q_title"),
        f.col("_c2").alias("q_content"),
        f.col("_c3").alias("answer"),
    )


train = load_split("../data/train.csv")
test = load_split("../data/test.csv")

# Sanity check on the smaller split only; counting forces a full read of the file.
print("test shape:", "({}, {})".format(test.count(), len(test.columns)))


[Stage 0:>                                                          (0 + 1) / 1]                                                                                

test shapea: (60000, 4)


In [5]:
test.show(5)

+-----+--------------------+--------------------+--------------------+
|topic|             q_title|           q_content|              answer|
+-----+--------------------+--------------------+--------------------+
|    9|What makes friend...|How does the spar...|good communicatio...|
|    2|Why does Zebras h...|What is the purpo...|this provides cam...|
|    4|What did the itsy...|                null|          waterspout|
|    4|What is the diffe...|                null|One difference be...|
|    3|Why do women get ...|                null|Premenstrual synd...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [6]:
# Before reducing both datasets to quarter size with stratified sampling,
# check the class balance of the full training set: each topic's share of
# all rows. The empty partitionBy() makes the window span the whole frame,
# so the windowed sum is the grand total of the per-topic counts.
by_topic = (
    train.groupby("topic")
    .count()
    .withColumn(
        "percent",
        f.col("count") / f.sum("count").over(Window.partitionBy()),
    )
)
by_topic.orderBy('topic').select('topic', 'percent').show()

22/11/25 18:40:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/25 18:40:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/25 18:40:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/25 18:40:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+-------+
|topic|percent|
+-----+-------+
|    1|    0.1|
|    2|    0.1|
|    3|    0.1|
|    4|    0.1|
|    5|    0.1|
|    6|    0.1|
|    7|    0.1|
|    8|    0.1|
|    9|    0.1|
|   10|    0.1|
+-----+-------+



                                                                                

In [7]:
# Reduce both datasets to roughly a quarter of their size with stratified
# sampling: every topic (1-10) is sampled at the same rate, so the class
# balance is approximately preserved. sampleBy draws each row independently,
# so the resulting sizes are close to, not exactly, 25%.
SAMPLE_FRACTION = 0.25
# Fixed seed so the reduced datasets are reproducible across runs
# (given the same input files and partitioning).
SAMPLE_SEED = 42
fractions = {topic: SAMPLE_FRACTION for topic in range(1, 11)}

train_df = train.sampleBy(col='topic', fractions=fractions, seed=SAMPLE_SEED)
test_df = test.sampleBy(col='topic', fractions=fractions, seed=SAMPLE_SEED)

print("train shape:", "({}, {})".format(train_df.count(), len(train_df.columns)))
print("test shape:", "({}, {})".format(test_df.count(), len(test_df.columns)))

# Confirm the sample is still balanced: each topic's share of the sampled rows.
# The empty partitionBy() makes the window span the whole frame, so the
# windowed sum is the total number of sampled rows.
by_topic = train_df.groupby("topic").count()
by_topic = by_topic.withColumn(
    "percent",
    f.col("count") / f.sum("count").over(Window.partitionBy()),
)
by_topic.orderBy('topic').select('topic', f.round('percent', 2)).show()

                                                                                

train shape: (349667, 4)
test shape: (14877, 4)


22/11/25 18:40:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/25 18:40:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 18:>                                                         (0 + 8) / 8]

+-----+-----------------+
|topic|round(percent, 2)|
+-----+-----------------+
|    1|              0.1|
|    2|              0.1|
|    3|              0.1|
|    4|              0.1|
|    5|              0.1|
|    6|              0.1|
|    7|              0.1|
|    8|              0.1|
|    9|              0.1|
|   10|              0.1|
+-----+-----------------+



22/11/25 18:40:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [9]:
# Save the reduced data as CSV.
# Overwrite any earlier output so the notebook can be re-run from a fresh
# kernel; with Spark's default save mode the write raises an error when the
# target path already exists.
train_df.write.csv("../data/train_reduced", mode="overwrite")
test_df.write.csv("../data/test_reduced", mode="overwrite")

                                                                                