In [2]:
# Path to the training corpus. Later cells open it as UTF-8 text and split
# each line on a tab into (text, language label).
dataset = 'data/DSL-TRAIN.txt'

In [47]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import pandas as pd

In [68]:
# Count the lines in the corpus to see how many examples are available.
linecount = 0  # stays 0 if the file is empty
with open(dataset, encoding='utf-8') as f:
    # enumerate from 1 so that, after the last iteration, linecount is the total
    for linecount, line in enumerate(f, start=1):
        pass
print(linecount)

252000


In [82]:
# Compare two tokenizers (Treebank vs. a simple regex tokenizer) on one sample:
# grab the first line of the corpus and split it into sentence and label.
with open(dataset, encoding='utf-8') as f:
    first_line = f.readline()
# Note: lang still ends with the line's newline here; only text is used below.
text, lang = first_line.split('\t')

In [83]:
# Tokenize the sample sentence with NLTK's Treebank tokenizer
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokenizer.tokenize(text)

['-',
 '57,5',
 'miliona',
 'maloljetnih',
 'djevojčica',
 'prisilno',
 'stupi',
 'u',
 'brak',
 'širom',
 'svijeta',
 ',',
 'dok',
 'čak',
 '40',
 'odsto',
 'od',
 'tog',
 'broja',
 'čine',
 'maloljetne',
 'Indijke',
 '.']

In [38]:
# Regex tokenizer with three alternatives, tried in order at each position:
#   \w+         a run of word characters
#   \$[0-9.]+   a dollar amount such as "$3.50"
#   \S+         any other run of non-whitespace (punctuation etc.)
# The "$" has to be escaped. Unescaped it is the end-of-string anchor, so the
# currency alternative could never match and amounts fell through to \S+,
# which also swallows trailing punctuation ("$3.50," stayed one token).
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')
tokenizer.tokenize(text)

['-',
 '57',
 ',5',
 'miliona',
 'maloljetnih',
 'djevojčica',
 'prisilno',
 'stupi',
 'u',
 'brak',
 'širom',
 'svijeta',
 ',',
 'dok',
 'čak',
 '40',
 'odsto',
 'od',
 'tog',
 'broja',
 'čine',
 'maloljetne',
 'Indijke',
 '.']

In [54]:
# Set up an empty frame with one column for the token list of each example
# and one for its language label.
columns = ['Text', 'Language']
df_dataset = pd.DataFrame(data=None, index=None, columns=columns)

In [69]:
# Now load the tokenized examples into the DataFrame.
# Rows are collected in a plain list and the frame is built in one go;
# appending with df.loc[i] = ... enlarges the frame on every insert and gets
# very slow for tens of thousands of rows.
N_EXAMPLES = 50000  # only the first N_EXAMPLES lines of the corpus are loaded
treebank_tokenizer = TreebankWordTokenizer()  # built once, not once per line
rows = []
with open(dataset, encoding='utf-8') as f:
    # zip stops at whichever input runs out first, so a corpus shorter than
    # N_EXAMPLES simply ends the loop instead of failing on an empty read.
    for i, line in zip(range(N_EXAMPLES), f):
        # rstrip('\n') removes only the line terminator; slicing with [:-1]
        # would cut a real character off a final line without a newline.
        text, lang = line.rstrip('\n').split('\t')
        tokens = treebank_tokenizer.tokenize(text)
        rows.append((tokens, lang))
df_dataset = pd.DataFrame(rows, columns=['Text', 'Language'])

In [81]:
# Distinct language labels among the loaded examples
df_dataset['Language'].unique()

array(['bs', 'es-AR', 'es-ES'], dtype=object)

In [46]:
# Word trigrams of the first loaded example, joined into readable strings.
# The example is read from df_dataset explicitly: after the loading loop the
# loose `tokens` variable holds the last example read, so relying on it only
# reproduces the output below when cells are run out of order.
example_tokens = df_dataset['Text'].iloc[0]
ng = ngrams(example_tokens, 3)
[" ".join(x) for x in ng]

['- 57,5 miliona',
 '57,5 miliona maloljetnih',
 'miliona maloljetnih djevojčica',
 'maloljetnih djevojčica prisilno',
 'djevojčica prisilno stupi',
 'prisilno stupi u',
 'stupi u brak',
 'u brak širom',
 'brak širom svijeta',
 'širom svijeta ,',
 'svijeta , dok',
 ', dok čak',
 'dok čak 40',
 'čak 40 odsto',
 '40 odsto od',
 'odsto od tog',
 'od tog broja',
 'tog broja čine',
 'broja čine maloljetne',
 'čine maloljetne Indijke',
 'maloljetne Indijke .']