In [29]:
dataset = 'data/DSL-TRAIN.txt'

In [30]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import pandas as pd

In [31]:
# First let's see how many examples we have to work with
linecount = 0
with open(dataset, encoding='utf-8') as f:
    for line in f:
        linecount += 1
print(linecount)

252000


In [32]:
# Let's briefly examine two tokenizing methods, the treebank tokenizer and a simple RegEx based tokenizer
with open(dataset, encoding='utf-8') as f:
    text, lang = f.readline().split('\t')

In [33]:
TreebankWordTokenizer().tokenize(text)

['-',
 '57,5',
 'miliona',
 'maloljetnih',
 'djevojčica',
 'prisilno',
 'stupi',
 'u',
 'brak',
 'širom',
 'svijeta',
 ',',
 'dok',
 'čak',
 '40',
 'odsto',
 'od',
 'tog',
 'broja',
 'čine',
 'maloljetne',
 'Indijke',
 '.']

In [34]:
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(text)

['-',
 '57',
 ',5',
 'miliona',
 'maloljetnih',
 'djevojčica',
 'prisilno',
 'stupi',
 'u',
 'brak',
 'širom',
 'svijeta',
 ',',
 'dok',
 'čak',
 '40',
 'odsto',
 'od',
 'tog',
 'broja',
 'čine',
 'maloljetne',
 'Indijke',
 '.']

In [35]:
# Create DataFrame
columns = ['Text', 'Language']
df_dataset = pd.read_csv(dataset, header=None, index_col=False, sep=r'\t', encoding="utf-8")
df_dataset.columns = columns

  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
df_dataset.head()

Unnamed: 0,Text,Language
0,"- 57,5 miliona maloljetnih djevojčica prisilno...",bs
1,"- Nakon ovih kalkulacija, ubrzo je postalo jas...",bs
2,U okviru programa Modul Memorije Internacional...,bs
3,"Sagrađen je po istom principu kao i slični ""gr...",bs
4,"Kontroverzni biznismen Naser Kelmendi, koji se...",bs


In [37]:
df_dataset.loc[0][0]

'- 57,5 miliona maloljetnih djevojčica prisilno stupi u brak širom svijeta, dok čak 40 odsto od tog broja čine maloljetne Indijke.'

In [38]:
list(df_dataset.Language.unique())

['bs',
 'es-AR',
 'es-ES',
 'es-PE',
 'fa-AF',
 'fa-IR',
 'fr-CA',
 'fr-FR',
 'hr',
 'id',
 'my',
 'pt-BR',
 'pt-PT',
 'sr']

In [39]:
for lg in list(df_dataset.Language.unique()):
    print(lg, df_dataset[(df_dataset['Language']==lg)].count())

bs Text        18000
Language    18000
dtype: int64
es-AR Text        18000
Language    18000
dtype: int64
es-ES Text        18000
Language    18000
dtype: int64
es-PE Text        18000
Language    18000
dtype: int64
fa-AF Text        18000
Language    18000
dtype: int64
fa-IR Text        18000
Language    18000
dtype: int64
fr-CA Text        18000
Language    18000
dtype: int64
fr-FR Text        18000
Language    18000
dtype: int64
hr Text        18000
Language    18000
dtype: int64
id Text        18000
Language    18000
dtype: int64
my Text        18000
Language    18000
dtype: int64
pt-BR Text        18000
Language    18000
dtype: int64
pt-PT Text        18000
Language    18000
dtype: int64
sr Text        18000
Language    18000
dtype: int64


In [40]:
lgDict = {'bs': 'Balkan-Bosnian',
 'es-AR': 'Spanish-Argentine',
 'es-ES': 'Spanish-Spanish',
 'es-PE': 'Spanish-Peruvian',
 'fa-AF': 'Farsi-Dari',
 'fa-IR': 'Farsi-Persian',
 'fr-CA': 'French-Canadian',
 'fr-FR': 'French-French',
 'hr': 'Balkan-Croatian',
 'id': 'Bahasa-Indonesian',
 'my': 'Bahasa-Malaysian',
 'pt-BR': 'Portuguese-Brazilian',
 'pt-PT': 'Portuguese-Portuguese',
 'sr': 'Balkan-Serbian'}

In [41]:
with open('data/languages.txt', 'w') as lgfile:
    lgfile.write(str(list(df_dataset.Language.unique())))

In [13]:
ng = ngrams(tokens, 3)
[" ".join(x) for x in ng]

NameError: name 'tokens' is not defined

In [42]:
lgDict['id'].split('-')

['Bahasa', 'Indonesian']

In [43]:
lgs = list(df_dataset.Language.unique())
for lg in lgs:
    df2 = df_dataset[(df_dataset['Language']==lg)]
    df2['Code'] = df2['Language']
    if '-' not in lg:
        df2['Language'] = [lgDict[lg] for i in range(df2.shape[0])]
        df2['Dialect'] = [lgDict[lg] for i in range(df2.shape[0])]
    else:
        df2['Language'] = [lgDict[lg].split('-')[0] for i in range(df2.shape[0])]
        df2['Dialect'] = [lgDict[lg].split('-')[1] for i in range(df2.shape[0])]
    filename = "data/dataset-{}.csv".format(lg)
    df2.to_csv(filename, encoding="utf-16", index=False, sep="\t")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

In [25]:
ng = ngrams(df_read.loc[0]['Text'], 3)
[" ".join(x) for x in ng]

['ن ی ر',
 'ی ر و',
 'ر و ه',
 'و ه ا',
 'ه ا ی',
 'ا ی  ',
 'ی   ن',
 '  ن ظ',
 'ن ظ ا',
 'ظ ا م',
 'ا م ی',
 'م ی  ',
 'ی   س',
 '  س و',
 'س و م',
 'و م ا',
 'م ا ل',
 'ا ل ی',
 'ل ی  ',
 'ی   و',
 '  و  ',
 'و   ا',
 '  ا ت',
 'ا ت ح',
 'ت ح ا',
 'ح ا د',
 'ا د ی',
 'د ی ه',
 'ی ه  ',
 'ه   آ',
 '  آ ف',
 'آ ف ر',
 'ف ر ی',
 'ر ی ق',
 'ی ق ا',
 'ق ا  ',
 'ا   ب',
 '  ب ا',
 'ب ا  ',
 'ا   ب',
 '  ب ی',
 'ب ی ر',
 'ی ر و',
 'ر و ن',
 'و ن  ',
 'ن   ر',
 '  ر ا',
 'ر ا ن',
 'ا ن د',
 'ن د ن',
 'د ن \xa0',
 'ن \xa0  ',
 '\xa0   ا',
 '  ا س',
 'ا س ل',
 'س ل ا',
 'ل ا م',
 'ا م گ',
 'م گ ر',
 'گ ر ا',
 'ر ا ی',
 'ا ی ا',
 'ی ا ن',
 'ا ن  ',
 'ن   ا',
 '  ا ل',
 'ا ل ش',
 'ل ش ب',
 'ش ب ا',
 'ب ا ب',
 'ا ب ،',
 'ب ،  ',
 '،   ک',
 '  ک ن',
 'ک ن ت',
 'ن ت ر',
 'ت ر ل',
 'ر ل  ',
 'ل   ش',
 '  ش م',
 'ش م ا',
 'م ا ر',
 'ا ر ی',
 'ر ی  ',
 'ی   ا',
 '  ا ز',
 'ا ز  ',
 'ز   ش',
 '  ش ه',
 'ش ه ر',
 'ه ر ه',
 'ر ه ا',
 'ه ا ی',
 'ا ی  ',
 'ی   ج',
 '  ج ن',
 'ج ن و',
 'ن و ب',
 'و ب ی',
 