# Spark NLP - News Headline Categorization

In [0]:
# Install Pyspark & Setup our Java Environment - Takes around 15 seconds

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Note please go to https://spark.apache.org/downloads.html
# And check to see if the version number has been updated, if so update the link below with the new version number
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
import findspark
findspark.init()

In [0]:
from pyspark.ml import Pipeline 
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import col, udf,regexp_replace,isnull
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# SparkSession is not imported by the notebook's import cell, so bring it into
# scope here; without it this cell raises NameError on a fresh kernel.
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark NLP News Data")
    .getOrCreate()
)

In [0]:
# Load the news CSV into a DataFrame, treating the first row as the header and
# letting Spark infer the column types.
# NOTE(review): `s3_bucket_path` is not defined in any cell visible here —
# set it to the dataset location before running this cell.
news_data = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv(s3_bucket_path)
)

In [0]:
# Preview the first 20 rows of the raw data.
news_data.show(n=20)

In [0]:
# Total number of rows loaded from the CSV.
news_data.count()

In [0]:
# Keep only the two columns the classifier needs: the headline and its label.
title_category = news_data.select(["TITLE", "CATEGORY"])

In [0]:
# Preview the first 20 (TITLE, CATEGORY) rows.
title_category.show(n=20)


Let's check null values in TITLE and CATEGORY columns

In [0]:
def null_value_count(df):
  """Count the null entries in each column of ``df``.

  Args:
    df: A DataFrame-like object exposing ``columns``, ``df[name]``,
      ``where`` and ``count`` (a PySpark DataFrame in this notebook).

  Returns:
    A list of ``(column_name, null_count)`` tuples in column order,
    containing only the columns that have at least one null value.
    The list is empty when no column contains nulls.
  """
  null_columns_counts = []
  for k in df.columns:
    # One filtered count per column; the total row count is never needed,
    # so no extra full-data count job is run.
    nullRows = df.where(df[k].isNull()).count()
    if nullRows > 0:
      null_columns_counts.append((k, nullRows))
  return null_columns_counts

In [0]:
# Collect (column, null count) pairs for the columns that contain nulls.
null_columns_count_list = null_value_count(df=title_category)

In [0]:
# Show the (column, null count) pairs as a small table.
# An explicit schema is supplied because Spark cannot infer column types from
# an empty list, which is what null_value_count returns when no column has
# nulls (for example if this cell is re-run after the nulls are dropped).
spark.createDataFrame(
    null_columns_count_list,
    schema="Column_With_Null_Value string, Null_Values_Count long",
).show()


There are 389 null titles and 516 null categories.

Let's drop/delete the null values

In [0]:
# Discard every row in which TITLE or CATEGORY is null.
title_category = title_category.na.drop()

In [0]:
# Number of rows remaining after the null rows were dropped.
title_category.count()

In [0]:
# Preview the cleaned rows without truncating long titles.
title_category.show(n=20, truncate=False)

In [0]:
# Number of distinct categories. The column is named "CATEGORY" everywhere
# else in the notebook; using the exact name avoids relying on Spark's
# case-insensitive column resolution.
title_category.select("CATEGORY").distinct().count()


Top 20 news categories

In [0]:
# Headline count per category, most frequent first (show() prints 20 rows).
# Uses the exact column name "CATEGORY" so the cell does not depend on
# case-insensitive column resolution.
(
    title_category
    .groupBy("CATEGORY")
    .count()
    .orderBy(col("count").desc())
    .show(truncate=False)
)

Top 20 news titles

In [0]:
# Occurrences of each headline, most repeated first (show() prints 20 rows).
(
    title_category
    .groupBy("TITLE")
    .count()
    .sort(col("count").desc())
    .show(n=20, truncate=False)
)

## Data Cleaning
1. Removing numbers from titles



In [0]:
# Strip every run of digits from the headline into a new "only_str" column.
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
title_category = title_category.withColumn(
    "only_str",
    regexp_replace(col("TITLE"), r"\d+", ""),
)

In [0]:
# Compare the original headline with its digit-free version.
title_category.select(["TITLE", "only_str"]).show(n=20, truncate=False)

Split text into words

In [0]:
# Split the cleaned headline into tokens on every non-word character.
regex_tokenizer = RegexTokenizer(
    inputCol="only_str",
    outputCol="words",
    pattern="\\W",
)
raw_words = regex_tokenizer.transform(title_category)

In [0]:
# Preview the tokenised headlines.
raw_words.show(n=20)


Remove stop words

In [0]:
# Filter stop words out of the token lists, writing the result to "filtered".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
words_df = remover.transform(raw_words)

In [0]:
# Show the token lists before and after stop-word removal.
words_df.select(["words", "filtered"]).show(n=20, truncate=False)


Now let's encode the CATEGORY column as a column of numeric category indices.

In [0]:
# Map each CATEGORY string to a numeric label in "categoryIndex".
# NOTE(review): no visible cell built the indexed DataFrame, so it is
# reconstructed here from the otherwise unused StringIndexer import and the
# "categoryIndex" label column that the later cells expect. The result is
# named `words_label_index` because that is the name the CountVectorizer
# cell reads.
indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")
words_label_index = indexer.fit(words_df).transform(words_df)
words_label_index.select("CATEGORY", "categoryIndex").show()


Convert text into vectors of token counts

In [0]:
# Learn a vocabulary from the filtered tokens and turn each headline into a
# vector of token counts stored in "features".
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
)
model = cv.fit(words_label_index)
# The variable name (including its spelling) is kept because the train/test
# split cell reads it.
countVectorizer_feateures = model.transform(words_label_index)


Create our Training & Test datasets

In [0]:
# 80/20 random split; the fixed seed makes the split repeatable.
trainingData, testData = countVectorizer_feateures.randomSplit(
    [0.8, 0.2],
    seed=11,
)

## Train our Model and Evaluate its performance

In [0]:
# Fit a multinomial Naive Bayes classifier on the training split, then score
# the held-out test split.
nb = NaiveBayes(
    modelType="multinomial",
    labelCol="categoryIndex",
    featuresCol="features",
)
nbModel = nb.fit(trainingData)
nb_predictions = nbModel.transform(testData)

In [0]:
# Inspect a few predictions next to their true labels and feature vectors.
nb_predictions.select(["prediction", "categoryIndex", "features"]).show(n=5)

In [0]:
# Measure accuracy on the test split and report it with the matching error rate.
evaluator = MulticlassClassificationEvaluator(
    labelCol="categoryIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g" % nb_accuracy)
print("Test Error of NaiveBayes = %g " % (1.0 - nb_accuracy))