# Preprocessing
* For each Wikipedia text and Wikidata description collected:

    tokenise the text, lowercase the tokens, remove function words

* Store the results in a pandas dataframe containing 5 columns:

    person, Wikipedia page text, Wikipedia page text after preprocessing, Wikidata description, Wikidata description after preprocessing

Note: To improve clustering and classification results, feel free to
add further preprocessing steps (e.g. named entity recognition, or POS tagging and extraction of, e.g., nouns and verbs).

In [1]:
import json
import nltk
import string
import pandas as pd

from nltk.corpus import stopwords 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/elisa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def segment_and_tokenize(string):
    """Split text into sentences and tokenize each sentence.

    Returns a list holding one list of word tokens for every sentence
    found by ``nltk.sent_tokenize``.
    """
    # NOTE: the parameter shadows the imported ``string`` module inside this
    # function; the name is kept so keyword callers keep working.
    return [
        nltk.word_tokenize(sentence)
        for sentence in nltk.sent_tokenize(string)
    ]

In [3]:
def tokenize(sent):
    """Return the word tokens of ``sent`` as given by ``nltk.word_tokenize``."""
    return nltk.word_tokenize(sent)

In [4]:
# Translation table that deletes every character listed in string.punctuation
# (ASCII punctuation only; characters such as the en dash are left in place).
punct_removed = str.maketrans('', '', string.punctuation)


# Segment, strip punctuation signs and tokenize
def tokenize_no_punct(s):
    """Segment ``s`` into sentences and tokenize each one without punctuation.

    Sentences are detected on the raw text, *before* punctuation is removed.
    Stripping punctuation first deletes the '.', '?' and '!' characters the
    sentence splitter looks for, which leaves the segmentation step with
    nothing to split on.

    Returns a list with one list of word tokens per sentence. Sentences that
    contain nothing but punctuation yield no tokens and are left out.
    """
    tokenised_s = []
    for sentence in nltk.sent_tokenize(s):
        # Punctuation is removed per sentence, after the boundaries are known.
        tokens = nltk.word_tokenize(sentence.translate(punct_removed))
        if tokens:
            tokenised_s.append(tokens)

    return tokenised_s

In [5]:
# Lower-case the text, then tokenize it with punctuation removed

def lower_case(c):
    """Return the result of ``tokenize_no_punct`` applied to lower-cased ``c``."""
    lowered = c.lower()
    return tokenize_no_punct(lowered)

In [6]:
# remove function words

# English stop-word set, loaded on the first call to preprocess() and reused
# afterwards so the word list is not rebuilt for every sentence.
_STOP_WORDS = None


def preprocess(c):
    """Lower-case and tokenize ``c``, then drop English stop words.

    The text goes through ``lower_case`` (lower-casing, punctuation removal,
    tokenization); the per-sentence token lists it returns are flattened
    into a single list, keeping only tokens that are not in NLTK's English
    stop-word list.

    Returns a flat list of token strings.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))

    return [
        token
        for sentence in lower_case(c)
        for token in sentence
        if token not in _STOP_WORDS
    ]

In [23]:
# Read the collected articles. The encoding is given explicitly because the
# texts contain non-ASCII characters and the platform default may not be UTF-8.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Add the preprocessed sentences and description to every article, in place.
for cat, v in data.items():
    for keyw, articles in v.items():
        for art in articles:
            art['p_sentences'] = [preprocess(sent) for sent in art['sentences']]
            art['p_description'] = preprocess(art['description'])

# Display the first preprocessed sentence of the first 'singer' article in 'A'
data['A']['singer'][0]['p_sentences'][0]

d', 'chebysheff', 'chebychov', 'chebyshov'], ['tchebychev', 'tchebycheff', 'french', 'transcriptions'], ['tschebyschev', 'tschebyschef', 'tschebyscheff', 'german', 'transcriptions'], ['rarely', 'čebyčev'], ['chebychev', 'mixture', 'english', 'french', 'transliterations', 'sometimes', 'erroneously', 'used']]
[['varāhamihira', 'c', '505', '–', 'c', '587', 'also', 'called', 'varāha', 'mihira', 'hindu', 'astrologer', 'astronomer', 'polymath', 'lived', 'ujjain', 'madhya', 'pradesh', 'india'], ['born', 'avanti', 'region', 'roughly', 'corresponding', 'modernday', 'malwa', 'part', 'madhya', 'pradesh', 'india', 'adityadasa'], ['according', 'one', 'works', 'educated', 'kapitthaka'], ['indian', 'tradition', 'believes', 'one', 'nine', 'jewels', 'navaratnas', 'court', 'ruler', 'yashodharman', 'vikramaditya', 'malwa'], ['however', 'claim', 'appears', 'first', 'time', 'much', 'later', 'text', 'scholars', 'consider', 'claim', 'doubtful', 'neither', 'varahamihira', 'vikramaditya', 'lived', 'century', '

['nicole',
 'rachel',
 'nikki',
 'yanofsky',
 'born',
 'february',
 '8',
 '1994',
 'jazzpop',
 'singer',
 'montreal',
 'quebec']

In [24]:
# Gather the columns of the dataframe

# Flatten the nested structure into one (category, article) pair per article
records = []
for cat, v in data.items():
    for keyw, articles in v.items():
        for art in articles:
            records.append((cat, art))

# One list per column, all in the same article order
person = [article['title'] for _, article in records]
description = [article['description'] for _, article in records]
p_description = [article['p_description'] for _, article in records]
sentences = [article['sentences'] for _, article in records]
p_sentences = [article['p_sentences'] for _, article in records]
category = [label for label, _ in records]

p_data = {
    'person': person,
    'description': description,
    'p_description': p_description,
    'sentences': sentences,
    'p_sentences': p_sentences,
    'category': category,
}

p_data['category']

['A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'Z',
 'Z',
 'Z',
 'Z',
 'Z',
 'Z',
 'Z',
 'Z',
 'Z']

In [25]:
# Build the dataframe from the column lists (one column per key of p_data)
df = pd.DataFrame(data=p_data)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,person,description,p_description,sentences,p_sentences,category
0,Nikki Yanofsky,"Nicole Rachel ""Nikki"" Yanofsky (born February ...","[nicole, rachel, nikki, yanofsky, born, februa...","[Nicole Rachel ""Nikki"" Yanofsky (born February...","[[nicole, rachel, nikki, yanofsky, born, febru...",A
1,Karna Das,Karna Das (Nepali: कर्ण दास) (born 24 November...,"[karna, das, nepali, कर्ण, दास, born, 24, nove...","[Karna Das (Nepali:, कर्ण दास) (born 24 Novemb...","[[karna, das, nepali], [कर्ण, दास, born, 24, n...",A
2,Joseph Schmidt,"Joseph Schmidt (March 4, 1904 – November 16, 1...","[joseph, schmidt, march, 4, 1904, –, november,...","[Joseph Schmidt (March 4, 1904 – November 16, ...","[[joseph, schmidt, march, 4, 1904, –, november...",A
3,Frances Brooke,Frances Brooke (née Moore; 12 January 1724 – 2...,"[frances, brooke, née, moore, 12, january, 172...","[Frances Brooke (née Moore;, 12 January 1724 –...","[[frances, brooke, née, moore], [12, january, ...",A
4,Henri Michaux,Henri Michaux (French: [miʃo]; 24 May 1899 – 1...,"[henri, michaux, french, miʃo, 24, may, 1899, ...","[Henri Michaux (French: [, miʃo];, 24 May 1899...","[[henri, michaux, french], [miʃo], [24, may, 1...",A
