# Preprocessing
* For each Wikipedia text and Wikidata description collected:

    tokenise the text, lowercase the tokens, remove function words

* Store the results in a pandas dataframe containing 5 columns:

    person, Wikipedia page text, Wikipedia page text after preprocessing, Wikidata description, Wikidata description after preprocessing

Note. To improve clustering and classification results, feel free to
add further pre-processing steps (eg Named entity recognition, postagging and extraction of e.g., nouns and verbs).

In [7]:
import json
import nltk
import string
import pandas as pd

from nltk.corpus import stopwords 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/elisa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
def segment_and_tokenize(string):
    # Sentence splitting
    sentences = nltk.sent_tokenize(string)
    
    # tokenizing 
    tokenised_s = list(map(nltk.word_tokenize, sentences))

    return tokenised_s

In [9]:
def tokenize(sent):
    
    # tokenizing 
    tokenised_s = nltk.word_tokenize(sent)

    #print('Tokenised:', tokenised_s)

    return tokenised_s

In [10]:
# Define a translation table that maps each punctuation sign to the empty string
punct_removed = str.maketrans('','',string.punctuation)

# Define a function which segments, tokenizes and removes punctuation signs
def tokenize_no_punct(s):
    
    s = s.translate(punct_removed)

    # tokenizing 
    tokenised_s = segment_and_tokenize(s)

    #print('Tokenised without punctuation:', tokenised_s[:10])

    return tokenised_s

In [11]:
# lower case tokens and remove punctuation

def lower_case(c):
    return tokenize_no_punct(c.lower())

In [12]:
# remove function words

def preprocess(c):
    stop_words = set(stopwords.words('english'))
    
    lower_s = lower_case(c)

    no_stopw_c = [t for s in lower_s for t in s if t not in stop_words]

    #print('Preprocessed: ', no_stopw_c[:10])

    return no_stopw_c

In [26]:
with open('data.json', 'r') as f:
    data = json.load(f)

for cat,v in data.items():
    print('▶ cat:', cat)
    for keyw,articles in v.items():
        print('▶ keyw:', keyw)
        for art in articles:
            art['p_content'] = []
            if art['description'] is not None: # TODO BLOCK NONEs IN CORPUS EXTRACTION 
                art['p_description'] = preprocess(art['description'])            
            art['p_content'].append(preprocess(art['content']))

▶ cat: A
▶ keyw: singer
▶ keyw: writer
▶ keyw: painter
▶ cat: Z
▶ keyw: architect
▶ keyw: politician
▶ keyw: mathematician


In [39]:
data['A']['singer'][0]['p_description']

['american', 'singersongwriter']

In [30]:
# store in a dataframe
#df = pd.DataFrame(columns=['person','description','token_description','content','token_content'])
person = []
description = []
p_description = []
content = []
p_content = []
category = []

for cat,v in data.items():
    for keyw,articles in v.items():
        for art in articles:
            person.append(art['title'])
            description.append(art['description'])
            p_description.append(art['p_description'])
            content.append(art['content'])
            p_content.append(art['p_content'])
            category.append(cat)

p_data = dict(person=person,
description=description,
p_description=p_description,
content=content,
p_content=p_content,
category=category)

p_data['category']

KeyError: 'p_description'

In [None]:
df = pd.DataFrame(p_data)
df.head()