# Yahoo Topic Classification

Here we import the reduced data from Hive into pyspark dataframe

## Import Libraries

In [1]:
# Basic
import os
import pandas as pd
import numpy as np
import findspark
import pyspark
from pyspark import SparkFiles
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Here are some variables that need to be checked before running following commands
diarmuid_path = 'C:\\Users\\Diarmuid\\Documents\\dcu\\final-year\\ca4022_data_at_speed_and_scale\\Assignments\\ca4022_yahoo_topic_classification\\notebooks'
if os.getcwd() == diarmuid_path:
    #os.environ['JAVA_HOME'] = 'C:\Users\Diarmuid\anaconda3\envs\yahoo\Library\lib\jvm'
    #os.environ['HADOOP_HOME'] = 'C:\spark-3.3.1-bin-hadoop3\'
    #os.environ['SPARK_HOME'] = 'C:\spark-3.3.1-bin-hadoop3\'
    print(os.environ['JAVA_HOME'])
    print(os.environ['HADOOP_HOME'])
    print(os.environ['SPARK_HOME'])


C:\Users\Diarmuid\anaconda3\envs\yahoo\Library\lib\jvm
C:\spark-3.3.1-bin-hadoop3\
C:\spark-3.3.1-bin-hadoop3\


In [3]:
findspark.init()
findspark.find()

'C:\\spark-3.3.1-bin-hadoop3\\'

In [4]:
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

## Using Original tables

Found that reducing the data using PySpark is more convenient than in Hive. When we used hive, the output table had delimeter issues which lead to parsing errors

In [5]:
# import data into spark dataframe
import pyspark.sql.functions as f
from pyspark.sql.window import Window

# Google Drive URL wasn't working for the large train file, so uploaded datasets to data folder manually
train = spark.read.csv("../data/train.csv")
test = spark.read.csv("../data/test.csv")

# rename columns
train = train.select(f.col("_c0").alias("topic").astype('int'), f.col("_c1").alias("q_title"), f.col("_c2").alias("q_content"), f.col("_c3").alias("answer"))
test = test.select(f.col("_c0").alias("topic").astype('int'), f.col("_c1").alias("q_title"), f.col("_c2").alias("q_content"), f.col("_c3").alias("answer"))

print("test shapea:", "({}, {})".format(test.count(), len(test.columns)))


test shapea: (60000, 4)


In [6]:
test.show(5)

+-----+--------------------+--------------------+--------------------+
|topic|             q_title|           q_content|              answer|
+-----+--------------------+--------------------+--------------------+
|    9|What makes friend...|How does the spar...|good communicatio...|
|    2|Why does Zebras h...|What is the purpo...|this provides cam...|
|    4|What did the itsy...|                null|          waterspout|
|    4|What is the diffe...|                null|One difference be...|
|    3|Why do women get ...|                null|Premenstrual synd...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [7]:
# reduce both datasets to quarter size using stratified sampling

by_topic = train.groupby("topic").count()
by_topic = by_topic.withColumn("percent", f.col("count")/f.sum("count").over(Window.partitionBy()))
by_topic.orderBy('topic').select('topic', 'percent').show()

+-----+-------+
|topic|percent|
+-----+-------+
|    1|    0.1|
|    2|    0.1|
|    3|    0.1|
|    4|    0.1|
|    5|    0.1|
|    6|    0.1|
|    7|    0.1|
|    8|    0.1|
|    9|    0.1|
|   10|    0.1|
+-----+-------+



In [8]:
train_df = train.sampleBy(col='topic', fractions={i:0.25 for i in range(1, 11, 1)})
test_df = test.sampleBy(col='topic', fractions={i:0.25 for i in range(1, 11, 1)})

print("train shape:", "({}, {})".format(train_df.count(), len(train_df.columns)))
print("test shape:", "({}, {})".format(test_df.count(), len(test_df.columns)))


by_topic = train_df.groupby("topic").count()
by_topic = by_topic.withColumn("percent", f.col("count")/f.sum("count").over(Window.partitionBy()).astype('float'))
#by_topic = by_topic.select("topic", f.col("percent").astype('float'))
by_topic.orderBy('topic').select('topic', f.round('percent', 2)).show()

train shape: (349605, 4)
test shape: (15010, 4)
+-----+-----------------+
|topic|round(percent, 2)|
+-----+-----------------+
|    1|              0.1|
|    2|              0.1|
|    3|              0.1|
|    4|              0.1|
|    5|              0.1|
|    6|              0.1|
|    7|              0.1|
|    8|              0.1|
|    9|              0.1|
|   10|              0.1|
+-----+-----------------+



In [9]:
train_df = train_df.withColumn('Set', f.lit('Train'))
test_df = test_df.withColumn('Set', f.lit('Test'))

In [10]:
df = train_df.union(test_df)

In [11]:
for old, new in [('topic', 'Label'), ('q_title', 'Title'), ('q_content', 'Content')]:
    df = df.withColumnRenamed(old, new)

In [12]:
df.show(5)

+-----+--------------------+--------------------+--------------------+-----+
|Label|               Title|             Content|              answer|  Set|
+-----+--------------------+--------------------+--------------------+-----+
|    5|why doesn't an op...|or even on some s...|Optical mice use ...|Train|
|    3|What is Trans Fat...|I heard that tras...|"Trans fats occur...|Train|
|    7|How many planes F...|I heard that it i...|according to the ...|Train|
|    8|"Is Lin Qingxia (...|This is according...|"Well.  Everyone ...|Train|
|    4|Who said the stat...|"Can someone help...|"That is kind of ...|Train|
+-----+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [13]:
# save the reduced data
train_df.write.csv("../data/train_reduced")
test_df.write.csv("../data/test_reduced")

In [14]:
df.write.csv('../data/reduced')