In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, udf
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer
import string
import re

# Notebook-wide Spark session; getOrCreate() hands back an already-running
# session when one exists, otherwise it starts a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
def getDataFrame(filename):
    """Load an articles XML file into a Spark DataFrame.

    Every <article> element becomes one row. The columns are fixed by an
    explicit schema: `_id`, `_published-at`, `_title` and `content`.
    NOTE(review): the leading underscore presumably is the spark-xml
    prefix for XML attributes -- confirm against the input files.

    Args:
        filename: path of the XML file handed to the spark-xml reader.

    Returns:
        The DataFrame produced by the reader (loading is lazy).
    """
    article_fields = [
        StructField("_id", IntegerType(), False),
        StructField("_published-at", DateType(), True),
        StructField("_title", StringType(), True),
        StructField("content", StringType(), False),
    ]

    xml_reader = spark.read.format('com.databricks.spark.xml').options(rowTag='article')
    return xml_reader.load(filename, schema=StructType(article_fields))

In [15]:
def mergeTitleContent(dataframe):
    """Join title and body of each article into a single `text` column.

    Args:
        dataframe: DataFrame with `_id`, `_title` and `content` columns.

    Returns:
        DataFrame with the columns `_id` and `text`, where `text` is
        "<_title> <content>". Rows holding a null in either output column
        are removed by the final dropna().
    """
    # Title and content are separated by a single space.
    merged_text = concat(dataframe['_title'], lit(' '), dataframe['content']).alias('text')
    id_and_text = dataframe.select("_id", merged_text)
    return id_and_text.dropna()

In [50]:
# Matches every run of characters that are not ASCII letters or digits;
# used to break a raw token into alphanumeric fragments.
regex = re.compile('[^0-9a-zA-Z]+')
def cleanWords(x):
    """Normalise the token list of one article row.

    Tokens that are empty or contain no alphabetic character are dropped.
    The abbreviations 'u.s.' and 'f.b.i' are kept verbatim. Every other
    token is split on non-alphanumeric characters and only fragments
    longer than two characters are kept.

    Args:
        x: row-like object exposing `_id` and `words` (an iterable of
           string tokens).

    Returns:
        Tuple `(_id, words)` where `words` is the cleaned list of tokens,
        in their original order.
    """
    _id = x._id
    ret_words = []
    for word in x.words:
        # Skip empty tokens and tokens without a single letter (e.g. "3.5%").
        if word == '' or not any(ch.isalpha() for ch in word):
            continue
        # NOTE(review): 'f.b.i' has no trailing dot while 'u.s.' does --
        # confirm which spelling the tokenizer actually produces.
        if word == 'u.s.' or word == 'f.b.i': #frequent and maybe relevant in news
            ret_words.append(word)
            # Move on to the next token; a `break` here would discard
            # every word that follows the abbreviation.
            continue
        ret_words.extend(w for w in regex.split(word) if len(w) > 2)
    return (_id, ret_words)

In [51]:
def textTokenize(dataframe):
    """Tokenize the `text` column and clean the resulting words.

    The `text` column is run through a Tokenizer into a `words` column,
    each row is then passed through cleanWords, and the results are
    rebuilt into a DataFrame.

    Args:
        dataframe: DataFrame with `_id` and `text` columns.

    Returns:
        DataFrame with a non-nullable integer `_id` column and a
        non-nullable `words` column (array of strings).
    """
    words_schema = StructType([
        StructField("_id", IntegerType(), False),
        StructField("words", ArrayType(StringType(), True), False),
    ])

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    # Drop to the RDD API so each row can go through the Python cleaner.
    cleaned_rows = tokenizer.transform(dataframe).rdd.map(cleanWords)
    return spark.createDataFrame(cleaned_rows, words_schema)

In [62]:
# Pipeline: load the by-article training set, merge title and content,
# tokenize/clean the text, then persist the word lists as parquet.
articles = getDataFrame("../dataset/wellformed/articles-training-byarticle.xml")
articlesTexts = mergeTitleContent(articles)
articlesWords = textTokenize(articlesTexts)
# mode "error" makes the write fail when the output path already exists.
(articlesWords.write
    .format('parquet')
    .mode("error")
    .save('../dataset/tokened/articles-training-byarticle'))