In [10]:
import os
import re
import csv

import numpy as np
import pandas as pd

In [11]:
#import dataset
#give the proper csv file path
dataset = pd.read_csv("BCC_Sports.csv", encoding="latin-1")
dataset

Unnamed: 0,text,label
0,Henman overcomes rival RusedskiTim Henman save...,tennis
1,Safin slumps to shock Dubai lossMarat Safin su...,tennis
2,Ferrero eyes return to top formFormer world nu...,tennis
3,Roddick into San Jose finalAndy Roddick will p...,tennis
4,Federer claims Dubai crownWorld number one Rog...,tennis
5,Young debut cut short by GinepriFifteen-year-o...,tennis
6,Melzer shocks AgassiSecond seed Andre Agassi s...,tennis
7,Federer forced to dig deepTop seed Roger Feder...,tennis
8,Nadal marches on in MexicoRafael Nadal continu...,tennis
9,Hantuchova in Dubai last eightDaniela Hantucho...,tennis


In [12]:
# instance per label
dataset.groupby('label').label.count()

label
athletics    101
cricket      124
football     265
rugby        147
tennis       100
Name: label, dtype: int64

In [13]:
# fix random seed for reproducibility
np.random.seed(7)

from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col 
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.mllib.classification import *

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

In [14]:
# Pre-Processng - cleaning and converting the data them into the right format

# Remove Non-Alphanumeric Characters
for row in range(len(dataset)):
    line = dataset.iloc[row,0]
    dataset.iloc[row,0] = re.sub("[^a-zA-Z0-9]", " ", line)
    
# Convert to Lowercase & SQL Dataframe
dataset_lower = dataset.apply(lambda x: x.astype(str).str.lower())

# Convert panda dataframe to sql dataframe
sql_dataset = sqlCtx.createDataFrame(dataset_lower)

# Tokenize Text into Words
Tokenizer = RegexTokenizer(inputCol="text",outputCol="words", pattern="\\W")

# countTokens = udf(lambda words: len(words), IntegerType())
regexTokenized = Tokenizer.transform(sql_dataset).select("words","label")

# Remove Stopwords
remove = StopWordsRemover(inputCol="words", outputCol="cleaned")
removed = remove.transform(regexTokenized).select("cleaned","label")

#  Stemming of words
# Conver sql dataframe back to pandas dataframe 
from nltk.stem import SnowballStemmer

stemming_removed = removed.toPandas()
stemmer = SnowballStemmer('english')
stemming_removed["words"] = stemming_removed["cleaned"].apply(lambda x: [stemmer.stem(y) for y in x])

# Convert back to sql dataframe
stemmed_dataset = sqlCtx.createDataFrame(stemming_removed).select(["label","words"])

# map the data to numeric values
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

indexer = StringIndexer(inputCol = "label", outputCol = "bin_label")
indexed = indexer.fit(stemmed_dataset).transform(stemmed_dataset)

# Split Data to Training and Test Set
# Split data into training(50%) and test(50%)
train, test = indexed.randomSplit([0.5, 0.5], seed=50)
train.cache()
test.cache()

DataFrame[label: string, words: array<string>, bin_label: double]

In [15]:
# stemmed dataset
stemmed_dataset.show(5)
indexed.groupBy("bin_label","label").count().orderBy('bin_label').show()

# datasets details
print("Number of Training Samples: "+ str(train.count()))
print("Number of Test Samples: "+ str(test.count()))



+------+--------------------+
| label|               words|
+------+--------------------+
|tennis|[henman, overcom,...|
|tennis|[safin, slump, sh...|
|tennis|[ferrero, eye, re...|
|tennis|[roddick, san, jo...|
|tennis|[feder, claim, du...|
+------+--------------------+
only showing top 5 rows

+---------+---------+-----+
|bin_label|    label|count|
+---------+---------+-----+
|      0.0| football|  265|
|      1.0|    rugby|  147|
|      2.0|  cricket|  124|
|      3.0|athletics|  101|
|      4.0|   tennis|  100|
+---------+---------+-----+

Number of Training Samples: 359
Number of Test Samples: 378


In [7]:
# apply Term Frequency, Inverse Document Frequecy Method to determine importance of a word 

tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures = 10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)

train_tf = tf.transform(train)
test_tf = tf.transform(test)

idfModel = idf.fit(train_tf)
tfidf_train = idfModel.transform(train_tf).select("bin_label", "features")

In [8]:
from pyspark.ml.linalg import DenseVector

rescaled_train = tfidf_train.rdd.map(lambda lp: (lp.bin_label, DenseVector(lp.features.toArray()))) \
                                     .toDF(['label','features'])
rescaled_test = test_tf.rdd.map(lambda lp: (lp.bin_label, DenseVector(lp.rawFeatures.toArray()))) \
                                .toDF(['label','features'])

In [16]:
from pyspark.ml.classification import NaiveBayes
from pyspark.mllib.evaluation import MulticlassMetrics

# naive bayes classifier is used for the text classifiction
nb = NaiveBayes(smoothing=0.01, modelType="multinomial")
model = nb.fit(rescaled_train)

test_prob = model.transform(rescaled_test)
test_pred = test_prob.select('prediction','label').rdd.map(lambda lp: (lp.prediction,lp.label))
test_metrics = MulticlassMetrics(test_pred)

train_prob = model.transform(rescaled_train)
train_pred = train_prob.select('prediction','label').rdd.map(lambda lp: (lp.prediction,lp.label))
train_metrics = MulticlassMetrics(train_pred)
print("Training Accuracy: " + str(train_metrics.accuracy*100)+ " %" )
print("Training Precision: " + str(train_metrics.precision()))
print("Training Recall: " + str(train_metrics.recall()))


print("Testing Accuracy: " + str(test_metrics.accuracy*100)+ " %" )
print("Testing Precision: " + str(test_metrics.precision()))
print("Testing Precision: " + str(test_metrics.recall()))


Training Accuracy: 100.0 %
Training Precision: 1.0
Training Recall: 1.0
Testing Accuracy: 96.56084656084656 %
Testing Precision: 0.9656084656084656
Testing Precision: 0.9656084656084656
