# Yahoo Topic Classification

## Setting up Libraries and Environments

In [1]:
# Basic
import pandas as pd
import numpy as np
import findspark
import pyspark
from pyspark import SparkFiles

# Data Manipulation
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# ML
from pyspark.ml.feature import *
from pyspark.ml.linalg import Vector
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
findspark.init()

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

## Data Import

In [3]:
schema_list = [
    StructField('Label', StringType(), False), 
    StructField('Title', StringType(), True), 
    StructField('Content', StringType(), True),
    StructField('Answer', StringType(), True),
    StructField('Set', StringType(), True)
]
schema_df = StructType(fields=schema_list)

In [4]:
# Import Data from reduced folders
df = spark.read.csv("../data/reduced", schema=schema_df)

## Data Preview
This includes some basic view of the data before processing

In [5]:
df.show(5)

+-----+--------------------+--------+--------------------+-----+
|Label|               Title| Content|              Answer|  Set|
+-----+--------------------+--------+--------------------+-----+
|    1|what is the frenc...|    null|Are you talking a...|Train|
|    2|Do we realy need ...|    null|When surfaces are...|Train|
|    5|How can I save my...|    null|"Its 9 way to hac...|Train|
|    7|I'm trying to fin...|help me.|        tagworld.com|Train|
|    7|define Stability ...|    null|A Linear Time Inv...|Train|
+-----+--------------------+--------+--------------------+-----+
only showing top 5 rows



In [6]:
df.describe().show()

+-------+-----------------+--------------------+-----------------------------------+----------------------+------+
|summary|            Label|               Title|                            Content|                Answer|   Set|
+-------+-----------------+--------------------+-----------------------------------+----------------------+------+
|  count|           364615|              364615|                             200607|                357904|364615|
|   mean|5.494765711778177|                null|                           Infinity|              Infinity|  null|
| stddev|2.872483870744461|                null|                                NaN|                   NaN|  null|
|    min|                1|    ! only 4 girls!?|                                  !|  ! - 3 - 7 . Even ...|  Test|
|    max|                9|Describe in your...|黙れこのくそやろう\nむかつくんだ...|혼돈\nhttp://dictio...| Train|
+-------+-----------------+--------------------+-----------------------------------+-------------

## Data Processing

### Steps to Take
- Merge Test and Train
- Rename columns
- Change data types
- Tokenization
- Stop Word Removal
- NGram

#### Column Concatenation

In [7]:
df = df.fillna('')

In [8]:
df = df.withColumn('Document', concat(df['Title'], df['Content'], df['Answer'])).select(['Document', 'Set', 'Label'])
df.show(5)

+--------------------+-----+-----+
|            Document|  Set|Label|
+--------------------+-----+-----+
|what is the frenc...|Train|    1|
|Do we realy need ...|Train|    2|
|How can I save my...|Train|    5|
|I'm trying to fin...|Train|    7|
|define Stability ...|Train|    7|
+--------------------+-----+-----+
only showing top 5 rows



In [9]:
before = df.count()
before_train = df.filter('Set == "Train"').count()
before_test = df.filter('Set == "Test"').count()
df = df.filter('Document is NOT NULL').withColumn('Length', length(df['Document']))
after = df.count()
after_train = df.filter('Set == "Train"').count()
after_test = df.filter('Set == "Test"').count()

In [10]:
print(f'''There were {before_train} training samples and {before_test} test samples''')
print(f'{before - after} rows were dropped, {before_train - after_train} training samples and {before_test - after_test} test samples.')
print(f'''There are now {df.filter('Set == "Train"').count()} training samples and {df.filter('Set == "Test"').count()} test samples''')

There were 349605 training samples and 15010 test samples
0 rows were dropped, 0 training samples and 0 test samples.
There are now 349605 training samples and 15010 test samples


#### Tokenization (Regex)

In [11]:
regex_tokenizer = RegexTokenizer(inputCol='Document', outputCol='Tokens', pattern='\\W')

df_regex_token = regex_tokenizer.transform(df)

df_regex_token.select('Tokens').show(5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                              Tokens|
+----------------------------------------------------------------------------------------------------+
|               [what, is, the, french, name, for, arm, cover, are, you, talking, about, a, gauntlet]|
|[do, we, realy, need, oil, to, run, machines, when, surfaces, are, rubbed, against, each, other, ...|
|[how, can, i, save, my, yahoo, id, to, the, hackers, tell, me, please, how, the, hacker, attack, ...|
|[i, m, trying, to, find, the, url, for, this, website, similar, to, myspace, called, tag, how, do...|
|[define, stability, of, linear, time, invariant, systems, a, linear, time, invariant, system, lti...|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



#### Stop Word Removal

In [12]:
remover = StopWordsRemover(inputCol="Tokens", outputCol="StopTokens")
df_removed = remover.transform(df_regex_token)
df_removed.select(['Document', 'StopTokens']).show(5,truncate=55)

+-------------------------------------------------------+-------------------------------------------------------+
|                                               Document|                                             StopTokens|
+-------------------------------------------------------+-------------------------------------------------------+
|what is the french name for arm cover?Are you talkin...|          [french, name, arm, cover, talking, gauntlet]|
|Do we realy need Oil to run machines?When surfaces a...|[realy, need, oil, run, machines, surfaces, rubbed, ...|
|How can I save my yahoo id to the hackers?Tell me pl...|[save, yahoo, id, hackers, tell, please, hacker, att...|
|I'm trying to find the Url for this website similar ...|[m, trying, find, url, website, similar, myspace, ca...|
|define Stability of Linear Time-Invariant Systems?A ...|[define, stability, linear, time, invariant, systems...|
+-------------------------------------------------------+-------------------------------

#### NGrams

In [13]:
ngram = NGram(n=2, inputCol="Tokens", outputCol="NGrams")

df_ngrams = ngram.transform(df_removed)

df_ngrams.select(['Document', 'NGrams']).show(5,truncate=55)

+-------------------------------------------------------+-------------------------------------------------------+
|                                               Document|                                                 NGrams|
+-------------------------------------------------------+-------------------------------------------------------+
|what is the french name for arm cover?Are you talkin...|[what is, is the, the french, french name, name for,...|
|Do we realy need Oil to run machines?When surfaces a...|[do we, we realy, realy need, need oil, oil to, to r...|
|How can I save my yahoo id to the hackers?Tell me pl...|[how can, can i, i save, save my, my yahoo, yahoo id...|
|I'm trying to find the Url for this website similar ...|[i m, m trying, trying to, to find, find the, the ur...|
|define Stability of Linear Time-Invariant Systems?A ...|[define stability, stability of, of linear, linear t...|
+-------------------------------------------------------+-------------------------------

In [14]:
remover = StopWordsRemover(inputCol="NGrams", outputCol="StopNGrams")
df_removed = remover.transform(df_ngrams)
df_removed.select(['Document', 'StopNGrams']).show(5,truncate=55)

+-------------------------------------------------------+-------------------------------------------------------+
|                                               Document|                                             StopNGrams|
+-------------------------------------------------------+-------------------------------------------------------+
|what is the french name for arm cover?Are you talkin...|[what is, is the, the french, french name, name for,...|
|Do we realy need Oil to run machines?When surfaces a...|[do we, we realy, realy need, need oil, oil to, to r...|
|How can I save my yahoo id to the hackers?Tell me pl...|[how can, can i, i save, save my, my yahoo, yahoo id...|
|I'm trying to find the Url for this website similar ...|[i m, m trying, trying to, to find, find the, the ur...|
|define Stability of Linear Time-Invariant Systems?A ...|[define stability, stability of, of linear, linear t...|
+-------------------------------------------------------+-------------------------------

### Full Processing Pipeline

In [15]:
reg_tokenizer = RegexTokenizer(inputCol='Document', outputCol='Tokens', pattern='\\W')
stop_word_remover = StopWordsRemover(inputCol='Tokens', outputCol='StopTokens')
count_vec = CountVectorizer(inputCol='StopTokens', outputCol='CountVec')
idf = IDF(inputCol='CountVec', outputCol='TF-IDF', minDocFreq=10)
string_indexer = StringIndexer(inputCol='Label', outputCol='LabelString')
#feature = VectorAssembler(inputCols=['TF-IDF', 'Length'], outputCol='Features')

In [17]:
pipeline = Pipeline(stages=[
    reg_tokenizer,
    stop_word_remover,
    count_vec,
    idf,
    string_indexer#,
    #feature
])

In [18]:
pipeline = pipeline.fit(df)

In [19]:
df_processed = pipeline.transform(df)

In [20]:
df_processed.show(5)

+--------------------+-----+-----+------+--------------------+--------------------+--------------------+--------------------+-----------+
|            Document|  Set|Label|Length|              Tokens|          StopTokens|            CountVec|              TF-IDF|LabelString|
+--------------------+-----+-----+------+--------------------+--------------------+--------------------+--------------------+-----------+
|what is the frenc...|Train|    1|    71|[what, is, the, f...|[french, name, ar...|(262144,[74,321,1...|(262144,[74,321,1...|        0.0|
|Do we realy need ...|Train|    2|  1124|[do, we, realy, n...|[realy, need, oil...|(262144,[0,12,16,...|(262144,[0,12,16,...|        4.0|
|How can I save my...|Train|    5|   438|[how, can, i, sav...|[save, yahoo, id,...|(262144,[0,1,9,22...|(262144,[0,1,9,22...|        3.0|
|I'm trying to fin...|Train|    7|   112|[i, m, trying, to...|[m, trying, find,...|(262144,[13,14,15...|(262144,[13,14,15...|        8.0|
|define Stability ...|Train|    7|

In [21]:
w = Window().orderBy(lit('A'))
df_processed = df_processed.withColumn("RowNum", row_number().over(w))

In [22]:
df_train = df_processed.where(col("RowNum").between(1, 100)).select('TF-IDF', 'LabelString')
df_test = df_processed.where(col("RowNum").between(190001, 190010)).select('TF-IDF', 'LabelString')

In [23]:
print(f'train: {df_train.count()} and test: {df_test.count()}')

train: 100 and test: 10


In [26]:
df_train.show(5, truncate=100)

+----------------------------------------------------------------------------------------------------+-----------+
|                                                                                              TF-IDF|LabelString|
+----------------------------------------------------------------------------------------------------+-----------+
|(262144,[74,321,1021,1157,1715,40575],[3.152856704547272,4.0007752110684045,5.375892941426043,5.2...|        0.0|
|(262144,[0,12,16,26,27,35,57,80,182,185,221,282,317,399,424,487,507,533,593,603,664,747,762,791,8...|        4.0|
|(262144,[0,1,9,22,28,32,43,49,50,60,85,101,155,203,219,238,325,344,365,386,392,495,517,529,648,88...|        3.0|
|(262144,[13,14,15,20,121,168,174,659,859,2554,3225,34583],[2.215078340262761,2.3194174994370207,2...|        8.0|
|(262144,[10,151,610,978,1211,1501,1688,1956,2174,2953,4092,5793,6034,34070],[4.311569836225594,3....|        8.0|
+-------------------------------------------------------------------------------

### Classification - SVM

In [29]:
lr = LogisticRegression()

In [27]:
svm = LinearSVC()
ovr = OneVsRest(classifier=svm, featuresCol='TF-IDF', labelCol='LabelString')

In [28]:
ovrModel = ovr.fit(df_train)
predictions = ovrModel.transform(df_test)

In [29]:
predictions.show()

+--------------------+-----------+--------------------+----------+
|              TF-IDF|LabelString|       rawPrediction|prediction|
+--------------------+-----------+--------------------+----------+
|(262144,[2,7,8,9,...|        9.0|[-1.2539359408700...|       9.0|
|(262144,[15,24,61...|        6.0|[-1.2619786496484...|       6.0|
|(262144,[38,436,7...|        6.0|[-0.9123263291335...|       8.0|
|(262144,[38,53,56...|        0.0|[-0.6404599342468...|       6.0|
|(262144,[12,26,33...|        5.0|[-0.4117734576774...|       6.0|
|(262144,[0,15,24,...|        3.0|[-1.3991276548908...|       3.0|
|(262144,[0,6,14,2...|        7.0|[-0.5473978473741...|       5.0|
|(262144,[61,67,81...|        1.0|[-1.2725461662587...|       1.0|
|(262144,[0,7,9,13...|        3.0|[-0.7537007991925...|       3.0|
|(262144,[7,16,50,...|        0.0|[-0.9184679012203...|       9.0|
+--------------------+-----------+--------------------+----------+



In [32]:
evaluator = MulticlassClassificationEvaluator(labelCol='LabelString')

In [33]:
print(f'Accuracy {evaluator.evaluate(predictions)}')

Accuracy 0.44666666666666666


In [31]:
ovrModel = ovr.fit(df_train)
predictions = ovrModel.transform(df_test)

In [32]:
predictions.show()

+--------------------+-----+--------------------+----------+
|              TF-IDF|Label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|(262144,[2,7,8,9,...|    9|[-Infinity,-19.59...|       9.0|
|(262144,[15,24,61...|    4|[-Infinity,-19.36...|       4.0|
|(262144,[38,436,7...|    4|[-Infinity,-14.85...|       4.0|
|(262144,[38,53,56...|    1|[-Infinity,-14.03...|       4.0|
|(262144,[12,26,33...|    3|[-Infinity,-5.015...|       4.0|
|(262144,[0,15,24,...|    5|[-Infinity,-18.57...|       5.0|
|(262144,[0,6,14,2...|   10|[-Infinity,-12.51...|       3.0|
|(262144,[61,67,81...|    6|[-Infinity,-18.32...|       6.0|
|(262144,[0,7,9,13...|    5|[-Infinity,-15.06...|       5.0|
|(262144,[7,16,50,...|    1|[-Infinity,-17.91...|       4.0|
+--------------------+-----+--------------------+----------+



In [39]:
evaluator = MulticlassClassificationEvaluator(labelCol='Label')

In [40]:
print(f'Accuracy {evaluator.evaluate(predictions)}')

Accuracy 0.5142857142857143
