# Preprocessing
* For each Wikipedia text and Wikidata description collected:

    tokenise the text, lowercase the tokens, remove function words

* Store the results in a pandas dataframe containing 5 columns:

    person, Wikipedia page text, Wikipedia page text after preprocessing, Wikidata description, Wikidata description after preprocessing

Note: to improve clustering and classification results, feel free to
add further preprocessing steps (e.g. named entity recognition, POS tagging, or extraction of specific word classes such as nouns and verbs).

In [1]:
import json
import nltk
import string
import pandas as pd

from nltk.corpus import stopwords 
# Fetch the NLTK resources used below: the stop-word list plus the Punkt
# sentence-tokenizer models that sent_tokenize/word_tokenize load at call time.
# Newer NLTK releases look for 'punkt_tab' instead of 'punkt', so request both;
# packages that are already installed are reported as up to date.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /home/elisa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def segment_and_tokenize(string):
    """Split a text into sentences and tokenize each sentence.

    Args:
        string: the raw text to process.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the list of tokens ``nltk.word_tokenize`` produced
        for that sentence.
    """
    return [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(string)]

In [3]:
def tokenize(sent):
    """Tokenize a single piece of text with ``nltk.word_tokenize``.

    Args:
        sent: the text to tokenize.

    Returns:
        The list of tokens returned by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(sent)

In [4]:
# Define a translation table that maps each punctuation sign to the empty string
punct_removed = str.maketrans('','',string.punctuation)

# Define a function which segments, tokenizes and removes punctuation signs
def tokenize_no_punct(s):
    """Segment ``s`` into sentences, strip ASCII punctuation and tokenize.

    Sentence splitting runs on the untouched text first: the sentence
    tokenizer needs the '.', '!' and '?' marks to locate boundaries, so
    deleting punctuation beforehand would collapse the whole text into a
    single "sentence". Punctuation is then removed from each sentence
    before word tokenization, so the resulting token forms are the same
    as when punctuation is stripped up front (e.g. "pop-folk" -> "popfolk").

    Args:
        s: the text to process.

    Returns:
        A list of sentences, each a list of tokens. Sentences that contain
        nothing but punctuation produce no tokens and are dropped.
    """
    tokenised_s = []
    for sentence in nltk.sent_tokenize(s):
        tokens = nltk.word_tokenize(sentence.translate(punct_removed))
        if tokens:
            tokenised_s.append(tokens)

    return tokenised_s

In [5]:
# lower case tokens and remove punctuation

def lower_case(c):
    """Lowercase ``c`` and pass it through ``tokenize_no_punct``.

    Args:
        c: the text to process.

    Returns:
        Whatever ``tokenize_no_punct`` returns for the lowercased text
        (a list of token lists).
    """
    lowered = c.lower()
    return tokenize_no_punct(lowered)

In [6]:
# remove function words

def preprocess(c):
    """Lowercase and tokenize ``c``, then drop English stop words.

    Args:
        c: the text to preprocess.

    Returns:
        A single flat list of the remaining tokens; the per-sentence
        grouping produced by ``lower_case`` is not kept.
    """
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for sentence in lower_case(c):
        for token in sentence:
            if token not in english_stopwords:
                kept_tokens.append(token)

    return kept_tokens

In [7]:
# Load the collected corpus. The articles contain non-ASCII text (Cyrillic,
# IPA, ...), so read the file as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open('data/data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# data is nested as {category: {keyword: [article, ...]}}; every article dict
# gets two new keys holding the preprocessed description and content.
for cat,v in data.items():
    print('▶ cat:', cat)
    for keyw,articles in v.items():
        print('▶ keyw:', keyw)
        for art in articles:
            # Some articles have no description (None). Check for that
            # explicitly rather than catching AttributeError, which would
            # also hide genuine errors raised inside preprocess().
            raw_description = art['description']
            if isinstance(raw_description, str):
                art['p_description'] = preprocess(raw_description)
            else:
                art['p_description'] = None
            # p_content stays a one-element list wrapping the token list,
            # matching the shape this cell has always produced.
            art['p_content'] = [preprocess(art['content'])]

▶ cat: A
▶ keyw: singer
▶ keyw: writer
▶ keyw: painter
▶ cat: Z
▶ keyw: architect
▶ keyw: politician
▶ keyw: mathematician


In [8]:
# Spot-check: preprocessed description of the first article stored under
# category 'A', keyword 'singer'.
data['A']['singer'][0]['p_description']

['bulgarian', 'popfolk', 'musician']

In [9]:
# store in a dataframe
# One list per future dataframe column; all of them are filled in lockstep,
# so index i in every list refers to the same article.
person, description, p_description, content, p_content, label_2, label_6 = (
    [] for _ in range(7)
)

for cat, by_keyword in data.items():
    for keyw, articles in by_keyword.items():
        for art in articles:
            # Raw and preprocessed text of the article
            person.append(art['title'])
            description.append(art['description'])
            p_description.append(art['p_description'])
            content.append(art['content'])
            p_content.append(art['p_content'])

            # Labels: the top-level category and the keyword inside it
            label_2.append(cat)
            label_6.append(keyw)

p_data = {
    'person': person,
    'description': description,
    'p_description': p_description,
    'content': content,
    'p_content': p_content,
    'label_2': label_2,
    'label_6': label_6,
}

# Display the number of collected articles
len(p_data['p_description'])

180

In [10]:
# Build the dataframe from the column dict (one column per key of p_data)
# and preview its first rows.
df = pd.DataFrame(p_data)
df.head()

Unnamed: 0,person,description,p_description,content,p_content,label_2,label_6
0,Kichka_Bodurova,Bulgarian pop-folk musician,"[bulgarian, popfolk, musician]","Big Brother: All-Stars, also known as Big Brot...","[[big, brother, allstars, also, known, big, br...",A,singer
1,Taras_Topolya,Ukrainian singer,"[ukrainian, singer]",EdCamp Ukraine (Ukrainian: ЕдКемп Україна) is ...,"[[edcamp, ukraine, ukrainian, едкемп, україна,...",A,singer
2,Louis_Graveure,"English actor and baritone singer known as ""Th...","[english, actor, baritone, singer, known, myst...",The Accusing Song (German: Ein Lied klagt an) ...,"[[accusing, song, german, ein, lied, klagt, 19...",A,singer
3,Philippe_Robrecht,Belgian guitarist and singer,"[belgian, guitarist, singer]",Jacques Romain Georges Brel (French: [ʒɑk ʁɔmɛ...,"[[jacques, romain, georges, brel, french, ʒɑk,...",A,singer
4,Moira_Lambert,British singer,"[british, singer]",Mary Rose Byrne (born 24 July 1979) is an Aust...,"[[mary, rose, byrne, born, 24, july, 1979, aus...",A,singer


In [None]:
categories = ['A','Z']
keywords = ['architect', 'mathematician', 'painter', 'politician', 'singer', 'writer']

# Encode each label as its position in the corresponding list above
category_codes = {name: code for code, name in enumerate(categories)}
keyword_codes = {name: code for code, name in enumerate(keywords)}

df['label_2'] = df['label_2'].replace(category_codes)
df['label_6'] = df['label_6'].replace(keyword_codes)


In [11]:
# Persist the dataframe for later steps.
# NOTE(review): p_description and p_content hold Python lists, which CSV can
# only store as text - presumably readers of this file have to parse those
# columns back into lists; confirm against the code that loads data.csv.
df.to_csv('data/data.csv')