In [1]:
dataset = 'data/DSL-TRAIN.txt'

In [2]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import pandas as pd
from collections import Counter
from collections import OrderedDict
import copy

In [3]:
charset = set()
with open(dataset, encoding='utf-8') as f:
    for line in f:
        charset.update(line)

In [4]:
len(charset)

368

In [5]:
skip_chars = set(('!',  '"',  '#',  '$',  '%',  '&',  "'",  '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
                  ':',  ';',  '<',  '=',  '>',  '?',  '@',  '[',  ']',  '^',  '_',  '`',  '{',  '|',  '}', 
                  '~',  '\x7f',  '\xa0',  '¡',  '¢',  '£',  '¤',  '¥',  '¦',  '§',  '¨',  'ª',  '«',  '¬',
                  '®',  '¯',  '°',  '²',  '³',  '´',  '¶',  '·',  'º',  '»',  '¼',  '½',  '¾',  '¿',  '˚',
                  '˜', '˝', '̀', '́', '̂', '̃', '̧', '̷', '\u2009', '\u200b', '\u200c', '\u200d', '\u200e', '\u200f',
                  '‐', '‑', '–', '—', '―', '‘', '’', '‚', '“', '”', '„', '•', '…', '\u202a', '\u202b', '\u202c',
                  '′', '″', '›', '₂', '€', '™', '→', '−', '∙', '■', '▪', '▶', '●', '♦', '\ufeff', '�', '\t', '\n'
                 ))

In [10]:
charset = sorted(list([char for char in charset if char not in skip_chars]))

In [11]:
len(charset)

261

In [12]:
zero_vec = OrderedDict((token, 0) for token in charset)

In [13]:
zero_vec

OrderedDict([(' ', 0),
             ('0', 0),
             ('1', 0),
             ('2', 0),
             ('3', 0),
             ('4', 0),
             ('5', 0),
             ('6', 0),
             ('7', 0),
             ('8', 0),
             ('9', 0),
             ('A', 0),
             ('B', 0),
             ('C', 0),
             ('D', 0),
             ('E', 0),
             ('F', 0),
             ('G', 0),
             ('H', 0),
             ('I', 0),
             ('J', 0),
             ('K', 0),
             ('L', 0),
             ('M', 0),
             ('N', 0),
             ('O', 0),
             ('P', 0),
             ('Q', 0),
             ('R', 0),
             ('S', 0),
             ('T', 0),
             ('U', 0),
             ('V', 0),
             ('W', 0),
             ('X', 0),
             ('Y', 0),
             ('Z', 0),
             ('a', 0),
             ('b', 0),
             ('c', 0),
             ('d', 0),
             ('e', 0),
             ('f', 0),
           

In [14]:
columns = ['Text', 'Language']
df_dataset = pd.read_csv(dataset, header=None, index_col=False, sep=r'\t', encoding="utf-8")
df_dataset.columns = columns

  


In [37]:
import time
t = time.time()
id_vectors = []
for lg in ['id']:
    lg_idf = copy.copy(zero_vec)
    lg_cfidf_vectors = []
    temp_df = df_dataset[df_dataset['Language']==lg]['Text']
    for text in temp_df:
        text = set([char for char in text if char in charset])
        for char in text:
            lg_idf[char] += 1
    for k, v in lg_idf.items():
        lg_idf[k] = v/len(temp_df)
    for text in temp_df:
        vec = copy.copy(zero_vec)
        tokens = [char for char in text if char in charset]
        text_counter = Counter(tokens)
        for k, v in text_counter.items():
            lines_containing_key = lg_idf[k]
            cf = v / len(text)
            if lines_containing_key:
                idf = len(temp_df) / lines_containing_key
            else:
                idf = 0
            vec[k] = cf * idf
        lg_cfidf_vectors.append(vec)
    id_vectors.append((lg, lg_cfidf_vectors))
print(time.time()-t)

10.108915567398071


In [16]:
print("k {}, v {}, len(temp_df) {}, lg_idf[k] {}, lines_containing_key {}, cf {}, idf {}".format(k, v, len(temp_df), lg_idf[k], lines_containing_key, cf, idf))

k i, v 22, len(temp_df) 18000, lg_idf[k] 0.9983333333333333, lines_containing_key 0.9983333333333333, cf 0.061452513966480445, idf 18030.050083472455


In [17]:
text_counter.most_common()

[('a', 62),
 (' ', 47),
 ('i', 22),
 ('e', 21),
 ('n', 20),
 ('r', 14),
 ('m', 13),
 ('l', 12),
 ('t', 11),
 ('k', 10),
 ('u', 10),
 ('h', 9),
 ('b', 9),
 ('d', 8),
 ('g', 8),
 ('p', 7),
 ('s', 6),
 ('y', 6),
 ('K', 6),
 ('o', 5),
 ('j', 5),
 ('1', 5),
 ('H', 4),
 ('A', 3),
 ('S', 3),
 ('7', 2),
 ('J', 1),
 ('M', 1),
 ('4', 1),
 ('O', 1),
 ('f', 1),
 ('Y', 1),
 ('P', 1),
 ('w', 1),
 ('c', 1)]

In [18]:
sorted(vec.items(), key=lambda x: x[1], reverse=True)

[('a', 3119.398034443819),
 (' ', 2363.2597838303245),
 ('7', 1271.9999057777848),
 ('i', 1107.991904570933),
 ('H', 1063.800097515009),
 ('e', 1059.338196988951),
 ('1', 1019.8646979500719),
 ('n', 1007.6578889505025),
 ('r', 707.8430760587996),
 ('K', 669.8122113932851),
 ('m', 661.2726578513726),
 ('Y', 622.868501693664),
 ('l', 615.1073400277937),
 ('t', 556.659990078832),
 ('k', 507.5871749640459),
 ('u', 506.3093331249755),
 ('4', 504.1938345186038),
 ('h', 477.3074360766508),
 ('b', 474.14001959648675),
 ('O', 463.6413590988186),
 ('g', 409.63074759191807),
 ('d', 407.6243364309847),
 ('p', 362.21815498720736),
 ('A', 342.94351381617804),
 ('j', 332.6819338924032),
 ('y', 331.6335408431271),
 ('s', 304.7061106428013),
 ('o', 280.5070459214275),
 ('S', 262.3776380829804),
 ('J', 208.4837440591785),
 ('f', 160.55134521215078),
 ('M', 114.1702955671621),
 ('w', 105.00382097237429),
 ('P', 94.67809739103399),
 ('c', 91.3984985821949),
 ('0', 0),
 ('2', 0),
 ('3', 0),
 ('5', 0),
 ('6

In [26]:
columns = charset + ['Language']
this_df = pd.DataFrame(columns = columns)
this_df.loc[0] = list(vec.values()) + [lg]

In [27]:
this_df

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,...,۱,۲,۳,۴,۵,۶,۷,۸,۹,Language
0,2363.259784,0,1019.864698,0,0,504.193835,0,0,1271.999906,0,...,0,0,0,0,0,0,0,0,0,id


In [29]:
this_df.to_csv('id.csv', encoding='utf-16', sep="\t")

In [33]:
test = pd.read_csv('id.csv', encoding='utf-16', sep="\t", index_col=0)

In [34]:
test

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,...,۱,۲,۳,۴,۵,۶,۷,۸,۹,Language
0,2363.259784,0,1019.864698,0,0,504.193835,0,0,1271.999906,0,...,0,0,0,0,0,0,0,0,0,id


In [45]:
columns = charset + ['Language']
this_df = pd.DataFrame(columns = columns)
all_lg_vectors = []
for lg in list(df_dataset.Language.unique()):
    t = time.time()
    print(lg, t)
    lg_idf = copy.copy(zero_vec)
    lg_cfidf_vectors = []
    temp_df = df_dataset[df_dataset['Language']==lg]['Text']
    for text in temp_df:
        text = set([char for char in text if char in charset])
        for char in text:
            lg_idf[char] += 1
    print("done")
    for k, v in lg_idf.items():
        lg_idf[k] = v/len(temp_df)
    print("done")
    for i, text in enumerate(temp_df):
        vec = copy.copy(zero_vec)
        tokens = [char for char in text if char in charset]
        text_counter = Counter(tokens)
        for k, v in text_counter.items():
            lines_containing_key = lg_idf[k]
            cf = v / len(text)
            if lines_containing_key:
                idf = len(temp_df) / lines_containing_key
            else:
                idf = 0
            vec[k] = cf * idf
        lg_cfidf_vectors.append(vec)
    all_lg_vectors.append(lg_cfidf_vectors)
    print("done, ", time.time()-t)
    print("----------")

bs 1565881510.9515011
done
done
done,  9.002484798431396
----------
es-AR 1565881519.9550033
done
done
done,  9.99352478981018
----------
es-ES 1565881529.9535255
done
done
done,  11.116546154022217
----------
es-PE 1565881541.0700717
done
done
done,  7.1871254444122314
----------
fa-AF 1565881548.2571971
done
done
done,  15.4144766330719
----------
fa-IR 1565881563.6716738
done
done
done,  21.20604395866394
----------
fr-CA 1565881584.8777177
done
done
done,  8.068223476409912
----------
fr-FR 1565881592.9459412
done
done
done,  9.489352941513062
----------
hr 1565881602.4352942
done
done
done,  12.430476665496826
----------
id 1565881614.8667705
done
done
done,  10.70905089378357
----------
my 1565881625.5758214
done
done
done,  7.871630907058716
----------
pt-BR 1565881633.4474523
done
done
done,  9.960795879364014
----------
pt-PT 1565881643.4082482
done
done
done,  12.328917503356934
----------
sr 1565881655.7371657
done
done
done,  9.372217416763306
----------


In [46]:
len(all_lg_vectors)

14

In [47]:
len(all_lg_vectors[0])

18000

In [48]:
len(all_lg_vectors[0][0])

261