In [1]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import json
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from bs4.element import Tag
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
def remove_hidden(l):
    """Return the entries of ``l`` that are not hidden (dot-prefixed) names.

    Args:
        l: Iterable of file or directory names (strings).

    Returns:
        A new list, in the original order, without the names that start
        with ".". Empty strings are kept because they do not start with a dot.
    """
    # startswith() is safe on empty strings, whereas indexing el[0] raises IndexError.
    return [el for el in l if not el.startswith(".")]

def get_relative_path_to_dirs(start_path):
    """List the non-hidden immediate subdirectories of ``start_path``.

    Args:
        start_path: Directory to inspect.

    Returns:
        A list of strings of the form ``start_path + "/" + name``, one per
        direct subdirectory whose name does not start with ".".

    Raises:
        IndexError: If ``os.walk(start_path)`` yields nothing (for example the
            path does not exist). The exception type is the one the previous
            list-indexing implementation raised, kept for compatibility.
    """
    # Only the first os.walk() entry (start_path itself) is needed, so take it
    # lazily instead of walking the whole tree and discarding the rest.
    first_entry = next(os.walk(start_path), None)
    if first_entry is None:
        raise IndexError(
            "cannot list subdirectories of {0!r}".format(start_path)
        )
    subdirs = remove_hidden(first_entry[1])
    return [start_path + "/" + subdir for subdir in subdirs]

def get_relative_path_to_files(start_path):
    """Collect the visible regular files located directly in ``start_path``.

    Args:
        start_path: Directory to list.

    Returns:
        A list of strings of the form ``start_path + "/" + name`` for every
        directory entry that is a file and is kept by ``remove_hidden``.
    """
    regular_names = []
    for name in listdir(start_path):
        if isfile(join(start_path, name)):
            regular_names.append(name)
    visible_names = remove_hidden(regular_names)
    return [start_path + "/" + name for name in visible_names]

In [3]:
# Root folder (relative to the notebook) that is scanned for article sub-folders.
home_articles_directory = "../articles"

# Paths of the non-hidden folders found directly under the root.
subdirs = get_relative_path_to_dirs(home_articles_directory)
# Bare expression so the notebook shows the list as the cell output.
subdirs

['../articles/medium',
 '../articles/splinters',
 '../articles/thehistoryblog',
 '../articles/tutorialspoint',
 '../articles/chemistry-blog',
 '../articles/wikihow',
 '../articles/kdnuggets',
 '../articles/smartdatacollective']

## Read dataset

In [4]:
# Load every JSON file found two directory levels below the articles root
# (subdirs -> their sub-directories -> files) into one flat list.
dataset = []

for subdir in subdirs:
    subsubdirs = get_relative_path_to_dirs(subdir)
    for subsubdir in subsubdirs:
        onlyfiles = get_relative_path_to_files(subsubdir)
        for file in onlyfiles:
            # JSON text is UTF-8 by specification; being explicit avoids
            # decoding with a platform-dependent default encoding.
            with open(file, 'r', encoding='utf-8') as infile:
                d = json.load(infile)
            dataset.append(d)

In [5]:
#print(dataset[-121]["content"])

## Filter articles

In [6]:
from alphabet_detector import AlphabetDetector
# Single detector instance shared by the ok_title / ok_content filters.
ad = AlphabetDetector()

def ok_title(title):
    """Decide whether an article title is acceptable.

    A title passes when it has between 2 and 20 whitespace-separated words
    (inclusive) and ``ad.only_alphabet_chars(title, "LATIN")`` is true.

    Args:
        title: Title text.

    Returns:
        False when the word count is out of range, otherwise the result of
        the alphabet check.
    """
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so repeated spaces, tabs or newlines no longer distort the count
    # (split(" ") counted every extra space as an additional word).
    num_words = len(title.split())
    if not 2 <= num_words <= 20:
        # Cheap check failed: no need to run the alphabet check at all.
        return False
    return ad.only_alphabet_chars(title, "LATIN")

def ok_content(content):
    """Decide whether an article body is acceptable.

    A body passes when it has at least 100 whitespace-separated words and
    ``ad.only_alphabet_chars(content, "LATIN")`` is true.

    Args:
        content: Article text.

    Returns:
        False when there are fewer than 100 words, otherwise the result of
        the alphabet check.
    """
    # split() with no argument ignores empty pieces, so a text made mostly of
    # spaces can no longer reach the threshold, and words separated by
    # newlines or tabs are counted individually.
    num_words = len(content.split())
    if num_words < 100:
        # Cheap check failed: skip the alphabet check over the whole text.
        return False
    return ad.only_alphabet_chars(content, "LATIN")

In [7]:
# Keep only the samples whose title and content both pass the filters, and
# count how many samples fail each individual check.
dataset_copy = []
not_ok_title = 0
not_ok_content = 0
for sample in dataset:
    # Evaluate each check once per sample instead of twice.
    title_is_ok = ok_title(sample["title"])
    content_is_ok = ok_content(sample["content"])
    if title_is_ok and content_is_ok:
        dataset_copy.append(sample)
    # A sample failing both checks is counted in both counters, so the two
    # counters may add up to more than the total number of dropped samples.
    if not title_is_ok:
        not_ok_title += 1
    if not content_is_ok:
        not_ok_content += 1

print("Prev length: {0}".format(len(dataset)))
print("New length: {0}".format(len(dataset_copy)))
print("Dropped total: {0}".format(len(dataset) - len(dataset_copy)))
print("\tDropped title: {0}".format(not_ok_title))
print("\tDropped content: {0}".format(not_ok_content))

# From here on `dataset` refers to the filtered list.
dataset = dataset_copy

Prev length: 1386
New length: 1089
Dropped total: 297
	Dropped title: 6
	Dropped content: 291


## Clean contents

In [8]:
import re

def clean_html(raw_html):
    """Strip HTML comments and tags from ``raw_html``.

    Args:
        raw_html: Text that may contain HTML markup.

    Returns:
        The text with every ``<!-- ... -->`` comment (including multi-line
        ones) and every single-line ``<...>`` span removed.
    """
    # Comments go first: a comment whose text contains ">" would otherwise be
    # cut at that ">" by the tag pattern, leaving its tail and "-->" behind.
    without_comments = re.sub("(<!--.*?-->)", "", raw_html, flags=re.DOTALL)
    # The tag pattern has no DOTALL flag, so it only matches "<...>" spans
    # that open and close on the same line.
    return re.sub('<.*?>', '', without_comments)

def remove_newlines(content):
    """Return ``content`` with every "\\n" character turned into one space."""
    pieces = content.split("\n")
    return " ".join(pieces)

def remove_extra_white_spaces(content):
    """Collapse runs of spaces into one space and trim the ends.

    Only the space character is collapsed; the final ``strip()`` removes any
    leading or trailing whitespace.
    """
    return re.sub(' +', ' ', content).strip()

def remove_urls(content):
    """Remove http:// and https:// URLs from ``content``.

    Args:
        content: Text that may contain URLs.

    Returns:
        The text with each URL removed. A URL is taken to run from its scheme
        up to the next whitespace character, so the text after it is kept.
    """
    # \S+ stops at the first whitespace. The previous ".*" consumed the rest of
    # the line, which deleted all text following a URL (the whole remainder of
    # an article once its newlines had been replaced by spaces).
    return re.sub(r'https?://\S+', '', content)

def remove_code(content):
    """Delete spans that look like calls, e.g. ``a.b.c(d)``.

    A span is a word, optionally followed by dotted words, directly followed
    by a parenthesised group that contains no ")".
    """
    call_like = re.compile(r'(\w+(\.\w+)*\([^\)]*\))', flags=re.MULTILINE)
    return call_like.sub('', content)

def remove_alt_html(content):
    """Keep only the text that precedes the first "&lt" in ``content``.

    When "&lt" does not occur, the text is returned unchanged.
    """
    head, _separator, _tail = content.partition("&lt")
    return head

def clean_content(content):
    """Run the full cleaning pipeline on one article text.

    Steps, in order: strip HTML, remove URLs, replace newlines with spaces,
    remove call-like code spans, cut the text at the first "&lt", then
    normalise spaces.

    Args:
        content: Raw article text.

    Returns:
        The cleaned text.
    """
    content = clean_html(content)
    # URLs are removed while line breaks are still present, so a removal that
    # is bounded by the end of a line cannot reach into the following lines.
    content = remove_urls(content)
    content = remove_newlines(content)
    content = remove_code(content)
    content = remove_alt_html(content)
    # Space normalisation runs last because the removals above leave double
    # spaces and leading/trailing spaces behind.
    content = remove_extra_white_spaces(content)
    return content

In [9]:
# Replace each sample's content in place with its cleaned version.
for sample in dataset:
    sample["content"] = clean_content(sample["content"])

## TF-IDF

In [10]:
# Load the IDF table and index it by its 'term' column so that per-article
# term frequencies can later be joined against it by term.
idf = pd.read_csv("../resources/wiki-30k-10-IDF.csv")
idf = idf.set_index('term')

In [12]:
# Report the number of rows of the IDF table (which is indexed by term).
print("Number of words considered in wikipedia: {0}".format(idf.shape[0]))

Number of words considered in wikipedia: 87709


In [13]:
import os
import sys

import nltk
import nltk.data
from nltk.tag.perceptron import PerceptronTagger
from nltk.probability import FreqDist

# Set tokenizers, tagger and stemmer
# NOTE(review): in the visible cells only `stemmer` is used afterwards (by
# cleanTokens); `tokenizer`, `sentTokenizer` and `tagger` look unused — confirm
# before removing them.
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
sentTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = nltk.stem.snowball.EnglishStemmer()
tagger = PerceptronTagger()

import pandas as pd
import string
import re

In [20]:
"""
def to_token_list(content):
    textList = nltk.word_tokenize(content)
    tokenList = []
    for token in textList:
        try:
            thisToken = token
            uselessUnicode = [u'\u2013', u'\u2014', u'\u201d', u'\u201c'] ### don't include these when they are alone
            if thisToken not in uselessUnicode:
                thisToken = thisToken.replace(u'\u201d','') # delete this (unicode quote)
                thisToken = thisToken.replace(u'\u201c','') # delete this (unicode quote)
                tokenList.append(thisToken)
        except:
            tokenList.append('**CODEC_ERROR**')
            # #######################prints word on CODEC ERROR
            print('**CODEC_ERROR**')
            print(token) 
            print('****')
    return tokenList
"""

import string
# Punctuation characters as a set, used by cleanTokens for membership tests.
punctuation = set(string.punctuation)
import re

def add_to_stem_dictionary(stemmed_word, word, stem_dictionary):
    """Count one occurrence of ``word`` under its stem.

    ``stem_dictionary`` maps each stem to a dict of {original word: count} and
    is updated in place; nothing is returned.
    """
    counts_for_stem = stem_dictionary.setdefault(stemmed_word, {})
    counts_for_stem[word] = counts_for_stem.get(word, 0) + 1

def cleanTokens(tokenList, stem_dictionary):
    """Normalise and stem a list of tokens.

    Each token is lower-cased; tokens that are a single punctuation character
    are dropped; remaining punctuation characters are stripped; every run of
    digits becomes "NUM"; the result is stemmed with the module-level
    ``stemmer``.

    Args:
        tokenList: Tokens of one text.
        stem_dictionary: Mapping updated in place through
            ``add_to_stem_dictionary`` with every (stem, normalised word)
            pair, including pairs whose stem is filtered out below.

    Returns:
        The stems that are longer than two characters, in token order.
    """
    normalised_words = []
    for token in tokenList:
        word = token.lower()
        # Drop tokens that consist of exactly one punctuation character.
        if word in punctuation:
            continue
        word = "".join(c for c in word if c not in punctuation)
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        word = re.sub(r"\d+", "NUM", word)
        normalised_words.append(word)

    stems = [stemmer.stem(word) for word in normalised_words]
    for stem, word in zip(stems, normalised_words):
        add_to_stem_dictionary(stem, word, stem_dictionary)

    # len > 2 also discards the empty and single-space strings.
    return [stem for stem in stems if len(stem) > 2]

In [22]:
def from_sample_to_tfidf(sample, stem_dictionary):
    """Build a per-term TF-IDF table for one text.

    Args:
        sample: The text to analyse (the caller passes an article's content).
        stem_dictionary: Passed to ``cleanTokens``, which updates it in place.

    Returns:
        A DataFrame indexed by stemmed term with the columns ``freq``,
        ``idf``, ``logidf``, ``tfidf`` and ``logtfidf``, sorted by
        ``logtfidf`` in descending order. It is empty when the text yields
        no tokens.
    """
    # nltk.word_tokenize splits "I am Fabio" into ["I", "am", "Fabio"].
    tl = nltk.word_tokenize(sample)
    tokens = cleanTokens(tl, stem_dictionary)

    # Term counts as a one-column frame. Passing `columns` here (instead of
    # assigning it afterwards) also works when there are no tokens at all.
    freq = FreqDist(tokens)
    freqDF = pd.DataFrame.from_dict(freq, orient='index', columns=['freq'])

    # Attach the idf weights of the terms present in the idf table.
    freqit = freqDF.join(idf[['idf', 'logidf']])

    # Terms missing from the idf table get the largest weight found among this
    # text's known terms. If no term of the text is known, fall back to the
    # largest weight of the whole table instead of failing on an empty max().
    for column in ('idf', 'logidf'):
        fill_value = freqit[column].max()
        if pd.isnull(fill_value):
            fill_value = idf[column].max()
        freqit[column] = freqit[column].fillna(fill_value)

    # Raw term count multiplied by each weight.
    freqit['tfidf'] = freqit['freq'] * freqit['idf']
    freqit['logtfidf'] = freqit['freq'] * freqit['logidf']

    return freqit.sort_values(by='logtfidf', ascending=False)

In [23]:
# Filled in place by from_sample_to_tfidf (through cleanTokens) for all samples.
stem_dictionary = {}

for i,sample in enumerate(dataset):
    # Store the TF-IDF table as a plain dict on the sample.
    sample["tfidf"] = from_sample_to_tfidf(sample["content"], stem_dictionary).to_dict()
    # Progress indicator: print the index every 50 samples.
    if i % 50 == 0:
        print(i)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050


## Assign read time

In [24]:
# Reading speed in words per second (presumably 200 words per minute).
wps = 200 / 60 # = 3.33

for el in dataset:
    # split() without an argument ignores repeated spaces, so empty pieces in
    # the text are not counted as words.
    num_of_words = len(el["content"].split())
    # words / (words per second) -> estimated reading time in seconds
    read_time = num_of_words / wps
    el["read_time"] = read_time

## Save preprocessed data

In [25]:
# Write each sample to ../preprocessed/<index in dataset>.json.
# NOTE(review): the target directory is not created here — presumably it must
# already exist; files from an earlier, longer run are not removed either.
for i,sample in enumerate(dataset):
    with open("../preprocessed/" + str(i) + '.json', 'w') as outfile:
        json.dump(sample, outfile)

#### Save stem_dictionary

In [42]:
# Persist the stem -> {original word: count} mapping built during the TF-IDF step.
with open('../stemmer/stem_dictionary.json', 'w') as outfile:
    json.dump(stem_dictionary, outfile)