In [1]:
import pandas as pd
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
import re

In [2]:
# Directory holding one cleaned text file per language. The file name is used
# verbatim as the language label, so rename the files accordingly
# (e.g. en_clean.txt -> english).
clean_dir = './training/clean/'

# Keep regular, non-hidden files only: os.listdir also returns sub-directories
# and dotfiles (e.g. .ipynb_checkpoints), which open()/read() cannot handle.
file_names = [
    name for name in os.listdir(clean_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(clean_dir, name))
]

# Map language (= file name) -> full text of that file.
file_name_and_text = {}
for file in file_names:
    # Explicit encoding: the texts contain non-ASCII letters and the platform
    # default encoding is not guaranteed to decode them correctly.
    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
    with open(os.path.join(clean_dir, file), "r", encoding="utf-8") as target_file:
        file_name_and_text[file] = target_file.read()

# One row per file: 'language' holds the file name, 'wikitext' the raw text.
file_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'language', 0: 'wikitext'}))
file_data.head()

Unnamed: 0,language,wikitext
0,german,Paris Paris ist die Hauptstadt der Französisc...
1,romanian,Avrig Avrig în dialectul săsesc Frek Fraek în...
2,english,History of the Jews in Romania The history of...
3,french,La Roche-sur-Yon La Roche-sur-Yon est une com...


In [3]:
# Tokenization helper
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Lower-case each text before tokenizing: Python string comparison is
# case-sensitive, so 'Paris' and 'paris' would otherwise be different tokens.
file_data['wikitext_tokenized'] = file_data['wikitext'].map(
    lambda raw_text: tokenize(raw_text.lower())
)

file_data.head()

Unnamed: 0,language,wikitext,wikitext_tokenized
0,german,Paris Paris ist die Hauptstadt der Französisc...,"[paris, paris, ist, die, hauptstadt, der, fran..."
1,romanian,Avrig Avrig în dialectul săsesc Frek Fraek în...,"[avrig, avrig, în, dialectul, săsesc, frek, fr..."
2,english,History of the Jews in Romania The history of...,"[history, of, the, jews, in, romania, the, his..."
3,french,La Roche-sur-Yon La Roche-sur-Yon est une com...,"[la, roche-sur-yon, la, roche-sur-yon, est, un..."


In [4]:
# NLTK stopword lists for the four languages, each as a set.
sw1, sw2, sw3, sw4 = (
    set(stopwords.words(language))
    for language in ('romanian', 'english', 'german', 'french')
)

# Single combined set so stopwords of any of the four languages are matched.
stop_words = set().union(sw1, sw2, sw3, sw4)
print(stop_words)


{'j', 'can', 'which', 'elle', 'meu', 'nişte', 'desi', 'sois', 'aia', 'isi', 'unseren', 'serions', 'welchen', 'oricare', 'man', 'pai', 'au', 'own', 'so', "shouldn't", 'wouldn', 'même', 'other', 'wollte', 'lângă', 'ihre', 'keinem', 'iar', 'mustn', 'zu', 'oriunde', 'avut', 'who', 'cam', 'putini', 'că', 'is', 'einen', 'könnte', 'derselben', 'mâine', 'her', 'sie', 'does', 'vostru', 'primul', 'acestei', 'altii', 'le', 'ţie', "weren't", 'damit', 'nur', 'şi', 'său', 'catre', 'ta', 'nichts', "wasn't", "you've", 'dintr-', 'v', 'such', 'étantes', 'eut', 'nous', 'cît', 'she', "should've", 'should', 'don', 'had', 'va', 'having', 'avec', 'iti', 'acei', 'sa', 'than', 'totusi', 'ain', 'war', 'sunt', 'pour', "hadn't", 'indem', 'eure', 'acestea', 'este', "she's", 'über', 'jede', 'fiecare', 'ni', 'tine', 'cea', "couldn't", 'by', 'zwischen', 'einem', 'acestui', 'ai', 'im', 'mai', 'îi', 'eu', 'acelea', 'pot', 'him', 'voştri', 'eures', 'soit', 'mon', 'manche', 'oder', 'hatten', 'an', 'now', 'étais', 'seriez

In [5]:
# Function to remove stopwords
def remove_stopwords(tokenized_list, stopword_set=None):
    """Return the tokens of ``tokenized_list`` that are not stopwords.

    Parameters
    ----------
    tokenized_list : iterable of str
        Tokens to filter; their order is preserved in the result.
    stopword_set : collection of str, optional
        Words to drop. When omitted (``None``), the notebook-level
        ``stop_words`` set is used, which is the original behaviour.

    Returns
    -------
    list of str
        The tokens that are not contained in the stopword collection.
    """
    # `is None` rather than truthiness, so an explicitly passed empty
    # collection means "remove nothing" instead of falling back to the global.
    active_stopwords = stop_words if stopword_set is None else stopword_set
    return [word for word in tokenized_list if word not in active_stopwords]

# Filter every tokenized text through remove_stopwords (one token list per row).
file_data['wikitext_nostop'] = file_data['wikitext_tokenized'].apply(remove_stopwords)
file_data.head()

Unnamed: 0,language,wikitext,wikitext_tokenized,wikitext_nostop
0,german,Paris Paris ist die Hauptstadt der Französisc...,"[paris, paris, ist, die, hauptstadt, der, fran...","[paris, paris, hauptstadt, französischen, repu..."
1,romanian,Avrig Avrig în dialectul săsesc Frek Fraek în...,"[avrig, avrig, în, dialectul, săsesc, frek, fr...","[avrig, avrig, dialectul, săsesc, frek, fraek,..."
2,english,History of the Jews in Romania The history of...,"[history, of, the, jews, in, romania, the, his...","[history, jews, romania, history, jews, romani..."
3,french,La Roche-sur-Yon La Roche-sur-Yon est une com...,"[la, roche-sur-yon, la, roche-sur-yon, est, un...","[roche-sur-yon, roche-sur-yon, commune, centre..."
