In [1]:
dataset = 'data/DSL-TRAIN.txt'

In [2]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import pandas as pd

In [68]:
# First let's see how many examples we have to work with
linecount = 0
with open(dataset, encoding='utf-8') as f:
    for line in f:
        linecount += 1
print(linecount)

252000


In [82]:
# Let's briefly examine two tokenizing methods, the treebank tokenizer and a simple RegEx based tokenizer
with open(dataset, encoding='utf-8') as f:
    text, lang = f.readline().split('\t')

In [83]:
TreebankWordTokenizer().tokenize(text)

['-',
 '57,5',
 'miliona',
 'maloljetnih',
 'djevojčica',
 'prisilno',
 'stupi',
 'u',
 'brak',
 'širom',
 'svijeta',
 ',',
 'dok',
 'čak',
 '40',
 'odsto',
 'od',
 'tog',
 'broja',
 'čine',
 'maloljetne',
 'Indijke',
 '.']

In [38]:
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(text)

['-',
 '57',
 ',5',
 'miliona',
 'maloljetnih',
 'djevojčica',
 'prisilno',
 'stupi',
 'u',
 'brak',
 'širom',
 'svijeta',
 ',',
 'dok',
 'čak',
 '40',
 'odsto',
 'od',
 'tog',
 'broja',
 'čine',
 'maloljetne',
 'Indijke',
 '.']

In [3]:
# Create DataFrame
columns = ['Text', 'Language']
df_dataset = pd.DataFrame(columns=columns)

In [4]:
# Now load the tokenized examples into the DataFrame
with open(dataset, encoding='utf-8') as f:
    for i in range(252000):
        text, lang = f.readline().split('\t')
        df_dataset.loc[i] = [text, lang[:-1]]
        if i % 10000 == 0:
            print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000


In [51]:
df_dataset.loc[251999]['Language']='sr'

In [52]:
list(df_dataset.Language.unique())

['bs',
 'es-AR',
 'es-ES',
 'es-PE',
 'fa-AF',
 'fa-IR',
 'fr-CA',
 'fr-FR',
 'hr',
 'id',
 'my',
 'pt-BR',
 'pt-PT',
 'sr']

In [34]:
lgDict = {'bs': 'Bosnian',
 'es-AR': 'Spanish-Argentine',
 'es-ES': 'Spanish-Spanish',
 'es-PE': 'Spanish-Peruvian',
 'fa-AF': 'Farsi-Dari',
 'fa-IR': 'Farsi-Persian',
 'fr-CA': 'French-Canadian',
 'fr-FR': 'French-French',
 'hr': 'Croatian',
 'id': 'Bahasa-Indonesian',
 'my': 'Bahasa-Malaysian',
 'pt-BR': 'Portuguese-Brazilian',
 'pt-PT': 'Portuguese-Portuguese',
 'sr': 'Serbian'}

In [30]:
with open('data/languages.txt', 'w') as lgfile:
    lgfile.write(str(list(df_dataset.Language.unique())))

In [46]:
ng = ngrams(tokens, 3)
[" ".join(x) for x in ng]

['- 57,5 miliona',
 '57,5 miliona maloljetnih',
 'miliona maloljetnih djevojčica',
 'maloljetnih djevojčica prisilno',
 'djevojčica prisilno stupi',
 'prisilno stupi u',
 'stupi u brak',
 'u brak širom',
 'brak širom svijeta',
 'širom svijeta ,',
 'svijeta , dok',
 ', dok čak',
 'dok čak 40',
 'čak 40 odsto',
 '40 odsto od',
 'odsto od tog',
 'od tog broja',
 'tog broja čine',
 'broja čine maloljetne',
 'čine maloljetne Indijke',
 'maloljetne Indijke .']

In [36]:
lgDict['id'].split('-')

['Bahasa', 'Indonesian']

In [53]:
lgs = list(df_dataset.Language.unique())
for lg in lgs:
    df2 = df_dataset[(df_dataset['Language']==lg)]
    df2['Code'] = df2['Language']
    if '-' not in lg:
        df2['Language'] = [lgDict[lg] for i in range(df2.shape[0])]
        df2['Dialect'] = [lgDict[lg] for i in range(df2.shape[0])]
    else:
        df2['Language'] = [lgDict[lg].split('-')[0] for i in range(df2.shape[0])]
        df2['Dialect'] = [lgDict[lg].split('-')[1] for i in range(df2.shape[0])]
    filename = "data/dataset-{}.csv".format(lg)
    df2.to_csv(filename, encoding="utf-16", index=False, sep="\t")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

In [71]:
df3 = df_dataset[(df_dataset['Language']=='fa-IR')]
df3.shape

(18000, 2)

In [38]:
df3['Code'] = df3['Language']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
df3['Language'] = [lgDict['hr'] for i in range(df3.shape[0])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
df3.head()

Unnamed: 0,Text,Language,Code
144000,"Predstavljene su sjemenske sorte PIONEER, a pr...",Croatian,hr
144001,Još sam 2003. ili 2004. imao pravo na saborsku...,Croatian,hr
144002,U utorak su pretraženi stanovi i druge prostor...,Croatian,hr
144003,Više od 5200 osoba pobjeglo je iz okolice vulk...,Croatian,hr
144004,Samsung je redizajnirao TV zaslon tako što je ...,Croatian,hr


In [25]:
df3.to_csv("test.csv", encoding="utf-16", index=False, sep="\t")

In [66]:
df_read= pd.read_csv('data/dataset-fa-IR.csv', encoding="utf-16", sep="\t")

In [67]:
df_read.head()

Unnamed: 0,Text,Language,Code,Dialect
0,نیروهای نظامی سومالی و اتحادیه آفریقا با بیرون...,Farsi,fa-IR,Persian
1,روز دوشنبه، احمد داوود اوغلو وزیر امورخارجه تر...,Farsi,fa-IR,Persian
2,فرمامدهی مرکزی آمریکا، که بر عملیات نظامی آمری...,Farsi,fa-IR,Persian
3,بسیاری از افرادی‌که گفته‌اند دنبال کار نمی‌گرد...,Farsi,fa-IR,Persian
4,کاخ سفيد نيز در اطلاعيه ای به مردم ايران تسليت...,Farsi,fa-IR,Persian


In [70]:
ng = ngrams(df_read.loc[0]['Text'], 3)
[" ".join(x) for x in ng]

['ن ی ر',
 'ی ر و',
 'ر و ه',
 'و ه ا',
 'ه ا ی',
 'ا ی  ',
 'ی   ن',
 '  ن ظ',
 'ن ظ ا',
 'ظ ا م',
 'ا م ی',
 'م ی  ',
 'ی   س',
 '  س و',
 'س و م',
 'و م ا',
 'م ا ل',
 'ا ل ی',
 'ل ی  ',
 'ی   و',
 '  و  ',
 'و   ا',
 '  ا ت',
 'ا ت ح',
 'ت ح ا',
 'ح ا د',
 'ا د ی',
 'د ی ه',
 'ی ه  ',
 'ه   آ',
 '  آ ف',
 'آ ف ر',
 'ف ر ی',
 'ر ی ق',
 'ی ق ا',
 'ق ا  ',
 'ا   ب',
 '  ب ا',
 'ب ا  ',
 'ا   ب',
 '  ب ی',
 'ب ی ر',
 'ی ر و',
 'ر و ن',
 'و ن  ',
 'ن   ر',
 '  ر ا',
 'ر ا ن',
 'ا ن د',
 'ن د ن',
 'د ن \xa0',
 'ن \xa0  ',
 '\xa0   ا',
 '  ا س',
 'ا س ل',
 'س ل ا',
 'ل ا م',
 'ا م گ',
 'م گ ر',
 'گ ر ا',
 'ر ا ی',
 'ا ی ا',
 'ی ا ن',
 'ا ن  ',
 'ن   ا',
 '  ا ل',
 'ا ل ش',
 'ل ش ب',
 'ش ب ا',
 'ب ا ب',
 'ا ب ،',
 'ب ،  ',
 '،   ک',
 '  ک ن',
 'ک ن ت',
 'ن ت ر',
 'ت ر ل',
 'ر ل  ',
 'ل   ش',
 '  ش م',
 'ش م ا',
 'م ا ر',
 'ا ر ی',
 'ر ی  ',
 'ی   ا',
 '  ا ز',
 'ا ز  ',
 'ز   ش',
 '  ش ه',
 'ش ه ر',
 'ه ر ه',
 'ر ه ا',
 'ه ا ی',
 'ا ی  ',
 'ی   ج',
 '  ج ن',
 'ج ن و',
 'ن و ب',
 'و ب ی',
 

In [69]:
df_read.loc[0]['Text']

'نیروهای نظامی سومالی و اتحادیه آفریقا با بیرون راندن\xa0 اسلامگرایان الشباب، کنترل شماری از شهرهای جنوبی و مرزی سومالی را در دست گرفتند.'

In [73]:
df_read.loc[0]['Text']

'نیروهای نظامی سومالی و اتحادیه آفریقا با بیرون راندن\xa0 اسلامگرایان الشباب، کنترل شماری از شهرهای جنوبی و مرزی سومالی را در دست گرفتند.'