## Preparación de datos

In [2]:
%python
from collections import Counter, defaultdict

import pyspark.sql.functions as f
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import regexp_replace, concat_ws
# Load every CSV matching articles*.csv and keep only the columns used below.
file_path = "dbfs:///FileStore/tables/articles*.csv"
df = spark.read.csv(file_path, header="true", inferSchema="true").select("id", "title", "content")
# Drop rows that lack either a title or a content value.
df = df.filter(df.content.isNotNull() & df.title.isNotNull())
# Replace every run of non-alphanumeric characters with a single space.
df = df.withColumn('content', regexp_replace('content', '[^0-9a-zA-Z]+', ' '))
# Remove single-character words. Word boundaries are used instead of matching
# the surrounding spaces: a pattern that consumes the spaces cannot match two
# adjacent single-character words (e.g. "U S A" would keep the "S").
# The raw string avoids the invalid "\w" escape in a normal string literal.
df = df.withColumn('content', regexp_replace('content', r'\b\w\b', ' '))
# Tokenize on non-word characters, then drop stop words.
tokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")
tokenized = tokenizer.transform(df).select('id', 'title', 'words')
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cleanedDataFrame = remover.transform(tokenized).select('id', 'title', 'filtered')
# Join the remaining tokens back into one space-separated string per article.
cleanedDataFrame = cleanedDataFrame.withColumn('filtered', concat_ws(' ', cleanedDataFrame.filtered))
# NOTE: toPandas() brings the whole cleaned data set to the driver; the cells
# below iterate over this pandas frame in plain Python.
frame = cleanedDataFrame.toPandas()

## Índice invertido

In [4]:
# Inverted index: word -> list of [count, "id,title"] postings, one posting per
# article in which the word occurs, in the order the articles appear in `frame`.
dictionary = defaultdict(list)
for article_id, article_title, content in zip(frame['id'], frame['title'], frame['filtered']):
    article = str(article_id) + "," + str(article_title)
    for word in str(content).split():
        postings = dictionary[word]
        # All occurrences of a word within one article are processed
        # back-to-back, so only the most recent posting can belong to it.
        if postings and postings[-1][1] == article:
            postings[-1][0] += 1
        else:
            postings.append([1, article])

## Búsqueda

In [6]:
# Look up one word in the inverted index: print the (up to) ten articles where
# it occurs most often, followed by its total number of occurrences.
word = "trump"
# Presumably the indexed tokens are lower-cased by the tokenizer, so the query
# is normalised the same way -- confirm against the preparation cell.
word = word.lower()
# Use .get() rather than dictionary[word]: indexing a defaultdict would insert
# an empty posting list for every word that is searched but not indexed.
postings = dictionary.get(word, [])
totalCount = sum(count for count, _ in postings)
sortedList = sorted(postings, key=lambda k: k[0], reverse=True)
if len(sortedList) >= 10:
    for posting in sortedList[:10]:
        print(posting)
else:
    print(sortedList)
print(word + " is " + str(totalCount) + " times in all the news.")

## Agrupamiento

In [8]:
# Per-article summary: article id (as a string) -> list of its ten most
# frequent words as (word, count) tuples, most frequent first. Words with equal
# counts keep the order in which they first appear in the article.
# NOTE: the container stays a defaultdict(dict) so code that already relies on
# this object keeps working, although the stored values are lists.
grouping = defaultdict(dict)
for article_id, content in zip(frame['id'], frame['filtered']):
    word_counts = Counter(str(content).split())
    grouping[str(article_id)] = word_counts.most_common(10)

## Similitud

In [10]:
# Rank the other articles by similarity to the article `id_` and print the ids
# of the ten best matches. The score of a candidate is, for every top word it
# shares with the query article, the sum of both articles' counts of that word.
id_ = '17295'

# Fetch only the title of the query article; first() returns None when no row
# matches instead of failing on an empty collect() result.
title_row = cleanedDataFrame.filter(cleanedDataFrame.id == id_).select('title').first()
wordsN = grouping.get(id_)
if title_row is None or wordsN is None:
    raise ValueError("No article found with id {}".format(id_))
title = title_row['title']

dictOfWords = dict(wordsN)
news = {}
for other_id, top_words in grouping.items():
    # An article trivially shares every top word with itself; skip it so it
    # does not take a slot in its own list of similar articles.
    if other_id == id_:
        continue
    news[other_id] = sum(
        count + dictOfWords[top_word]
        for top_word, count in top_words
        if top_word in dictOfWords
    )
news = sorted(news.items(), key=lambda kv: kv[1], reverse=True)[:10]
ids_ = [other_id for other_id, _ in news]
print(id_, title, ids_)