# Yahoo Topic Classification

## Setting up Libraries and Environments

In [1]:
# Basic
import pandas as pd
import numpy as np
import findspark
import pyspark
from pyspark import SparkFiles

# Data Manipulation
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from pyspark.sql.functions import col, udf, concat, lit, length, size, row_number
from pyspark.sql.window import Window

# ML
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram, VectorAssembler, CountVectorizer, IDF
from pyspark.ml.linalg import Vector
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
findspark.init()

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

## Data Import

In [3]:
schema_list = [
    StructField('Label', IntegerType(), False), 
    StructField('Title', StringType(), True), 
    StructField('Content', StringType(), True),
    StructField('Answer', StringType(), True),
    StructField('Set', StringType(), True)
]
schema_df = StructType(fields=schema_list)

In [4]:
# Import Data from reduced folders
df = spark.read.csv("../data/reduced", schema=schema_df)

## Data Preview
This includes some basic view of the data before processing

In [5]:
df.show(5)

+-----+--------------------+--------+--------------------+-----+
|Label|               Title| Content|              Answer|  Set|
+-----+--------------------+--------+--------------------+-----+
|    1|what is the frenc...|    null|Are you talking a...|Train|
|    2|Do we realy need ...|    null|When surfaces are...|Train|
|    5|How can I save my...|    null|"Its 9 way to hac...|Train|
|    7|I'm trying to fin...|help me.|        tagworld.com|Train|
|    7|define Stability ...|    null|A Linear Time Inv...|Train|
+-----+--------------------+--------+--------------------+-----+
only showing top 5 rows



In [6]:
df.describe().show()

+-------+-----------------+--------------------+-----------------------------------+----------------------+------+
|summary|            Label|               Title|                            Content|                Answer|   Set|
+-------+-----------------+--------------------+-----------------------------------+----------------------+------+
|  count|           364615|              364615|                             200607|                357904|364615|
|   mean|5.494765711778177|                null|                           Infinity|              Infinity|  null|
| stddev|2.872483870744461|                null|                                NaN|                   NaN|  null|
|    min|                1|    ! only 4 girls!?|                                  !|  ! - 3 - 7 . Even ...|  Test|
|    max|               10|Describe in your...|黙れこのくそやろう\nむかつくんだ...|혼돈\nhttp://dictio...| Train|
+-------+-----------------+--------------------+-----------------------------------+-------------

## Data Processing

### Steps to Take
- Merge Test and Train
- Rename columns
- Change data types
- Tokenization
- Stop Word Removal
- NGram

#### Column Concatenation

In [7]:
df = df.withColumn('Document', concat(df['Title'], df['Content'], df['Answer'])).select(['Document', 'Set', 'Label'])
df.show(5)

+--------------------+-----+-----+
|            Document|  Set|Label|
+--------------------+-----+-----+
|                null|Train|    1|
|                null|Train|    2|
|                null|Train|    5|
|I'm trying to fin...|Train|    7|
|                null|Train|    7|
+--------------------+-----+-----+
only showing top 5 rows



In [8]:
before = df.count()
before_train = df.filter('Set == "Train"').count()
before_test = df.filter('Set == "Test"').count()
df = df.filter('Document is NOT NULL').withColumn('Length', length(df['Document']))
after = df.count()
after_train = df.filter('Set == "Train"').count()
after_test = df.filter('Set == "Test"').count()

In [9]:
print(f'''There were {before_train} training samples and {before_test} test samples''')
print(f'{before - after} rows were dropped, {before_train - after_train} training samples and {before_test - after_test} test samples.')
print(f'''There are now {df.filter('Set == "Train"').count()} training samples and {df.filter('Set == "Test"').count()} test samples''')

There were 349605 training samples and 15010 test samples
168244 rows were dropped, 161332 training samples and 6912 test samples.
There are now 188273 training samples and 8098 test samples


#### Tokenization (Regex)

In [10]:
regex_tokenizer = RegexTokenizer(inputCol='Document', outputCol='Tokens', pattern='\\W')

df_regex_token = regex_tokenizer.transform(df)

df_regex_token.select('Tokens').show(5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                              Tokens|
+----------------------------------------------------------------------------------------------------+
|[i, m, trying, to, find, the, url, for, this, website, similar, to, myspace, called, tag, how, do...|
|[in, baseball, game, or, just, baseball, wat, does, mrp, and, lrp, mean, its, on, my, mvp, 2004, ...|
|[how, can, i, make, my, incoming, mails, auto, forwards, to, another, email, address, i, use, yah...|
|[can, someone, tell, me, if, i, could, apply, for, permanent, residence, i, came, to, usa, 8, 1, ...|
|[do, you, believe, in, astrology, if, so, cick, on, my, name, and, you, will, see, my, questions,...|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



#### Stop Word Removal

In [11]:
remover = StopWordsRemover(inputCol="Tokens", outputCol="StopTokens")
df_removed = remover.transform(df_regex_token)
df_removed.select(['Document', 'StopTokens']).show(5,truncate=55)

+-------------------------------------------------------+-------------------------------------------------------+
|                                               Document|                                             StopTokens|
+-------------------------------------------------------+-------------------------------------------------------+
|I'm trying to find the Url for this website similar ...|[m, trying, find, url, website, similar, myspace, ca...|
|In baseball game or just baseball,, wat does MRP and...|[baseball, game, baseball, wat, mrp, lrp, mean, mvp,...|
|how can i make my incoming mails auto forwards to an...|[make, incoming, mails, auto, forwards, another, ema...|
|Can someone tell me if I could apply for permanent r...|[someone, tell, apply, permanent, residence, came, u...|
|Do you believe in Astrology?If so, Cick on my name a...|[believe, astrology, cick, name, see, questions, ans...|
+-------------------------------------------------------+-------------------------------

#### NGrams

In [12]:
ngram = NGram(n=2, inputCol="Tokens", outputCol="NGrams")

df_ngrams = ngram.transform(df_removed)

df_ngrams.select(['Document', 'NGrams']).show(5,truncate=55)

+-------------------------------------------------------+-------------------------------------------------------+
|                                               Document|                                                 NGrams|
+-------------------------------------------------------+-------------------------------------------------------+
|I'm trying to find the Url for this website similar ...|[i m, m trying, trying to, to find, find the, the ur...|
|In baseball game or just baseball,, wat does MRP and...|[in baseball, baseball game, game or, or just, just ...|
|how can i make my incoming mails auto forwards to an...|[how can, can i, i make, make my, my incoming, incom...|
|Can someone tell me if I could apply for permanent r...|[can someone, someone tell, tell me, me if, if i, i ...|
|Do you believe in Astrology?If so, Cick on my name a...|[do you, you believe, believe in, in astrology, astr...|
+-------------------------------------------------------+-------------------------------

In [13]:
remover = StopWordsRemover(inputCol="NGrams", outputCol="StopNGrams")
df_removed = remover.transform(df_ngrams)
df_removed.select(['Document', 'StopNGrams']).show(5,truncate=55)

+-------------------------------------------------------+-------------------------------------------------------+
|                                               Document|                                             StopNGrams|
+-------------------------------------------------------+-------------------------------------------------------+
|I'm trying to find the Url for this website similar ...|[i m, m trying, trying to, to find, find the, the ur...|
|In baseball game or just baseball,, wat does MRP and...|[in baseball, baseball game, game or, or just, just ...|
|how can i make my incoming mails auto forwards to an...|[how can, can i, i make, make my, my incoming, incom...|
|Can someone tell me if I could apply for permanent r...|[can someone, someone tell, tell me, me if, if i, i ...|
|Do you believe in Astrology?If so, Cick on my name a...|[do you, you believe, believe in, in astrology, astr...|
+-------------------------------------------------------+-------------------------------

### Full Processing Pipeline

In [14]:
reg_tokenizer = RegexTokenizer(inputCol='Document', outputCol='Tokens', pattern='\\W')
stop_word_remover = StopWordsRemover(inputCol='Tokens', outputCol='StopTokens')
count_vec = CountVectorizer(inputCol='StopTokens', outputCol='CountVec')
idf = IDF(inputCol='CountVec', outputCol='TF-IDF')
feature = VectorAssembler(inputCols=['TF-IDF', 'Length'], outputCol='Features')

In [15]:
pipeline = Pipeline(stages=[
    reg_tokenizer,
    stop_word_remover,
    count_vec,
    idf,
    feature
])

In [16]:
pipeline = pipeline.fit(df)

In [17]:
df_processed = pipeline.transform(df)

In [26]:
feature_size = udf(lambda x: size(x), IntegerType())

In [25]:
df_processed['Features']

Column<'Features'>

In [18]:
df_processed.show(5)

+--------------------+-----+-----+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            Document|  Set|Label|Length|              Tokens|          StopTokens|            CountVec|              TF-IDF|            Features|
+--------------------+-----+-----+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|I'm trying to fin...|Train|    7|   112|[i, m, trying, to...|[m, trying, find,...|(262144,[9,14,19,...|(262144,[9,14,19,...|(262145,[9,14,19,...|
|In baseball game ...|Train|    6|   672|[in, baseball, ga...|[baseball, game, ...|(262144,[0,3,8,18...|(262144,[0,3,8,18...|(262145,[0,3,8,18...|
|how can i make my...|Train|    5|   197|[how, can, i, mak...|[make, incoming, ...|(262144,[11,16,36...|(262144,[11,16,36...|(262145,[11,16,36...|
|Can someone tell ...|Train|   10|  1256|[can, someone, te...|[someone, tell, a...|(262144,[0,1,4,8,...|(262144,[0,1,4

In [19]:
w = Window().orderBy(lit('A'))
df_processed = df_processed.withColumn("RowNum", row_number().over(w))

In [20]:
df_train = df_processed.where(col("RowNum").between(1, 100)).select('Features', 'Label')
df_test = df_processed.where(col("RowNum").between(190001, 190010)).select('Features', 'Label')

In [21]:
print(f'train: {df_train.count()} and test: {df_test.count()}')

train: 100 and test: 10


In [22]:
df_train.show(5)

+--------------------+-----+
|            Features|Label|
+--------------------+-----+
|(262145,[9,14,19,...|    7|
|(262145,[0,3,8,18...|    6|
|(262145,[11,16,36...|    5|
|(262145,[0,1,4,8,...|   10|
|(262145,[1,2,6,15...|    8|
+--------------------+-----+
only showing top 5 rows



### Classification - SVM

In [23]:
svm = LinearSVC()
ovr = OneVsRest(classifier=svm, featuresCol='Features', labelCol='Label', parallelism=10)

In [1]:
#ovrModel = ovr.fit(df_train)
#predictions = ovrModel.transform(df_test)

In [None]:
evaluator = MulticlassClassificationEvaluator()