In [1]:
import json
import pickle
import re
import time
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display
from random import shuffle, randint, choice
from collections import Counter
from utils import HOMEPATH
from googletrans import Translator

In [2]:
# Shared googletrans client; translating() sends every request through it.
translator = Translator()

In [3]:
def translating(langsents, lang):
    '''Translate English sentences into `lang`, 30 sentences per request.

    Each batch is joined with newlines, sent through the shared `translator`
    and split on newlines again. A progress bar is shown and the function
    sleeps 9-30 seconds after every request.

    langsents: list of English sentences.
    lang: target language code passed to the translator as `dest`.
    Returns the list of translated lines.
    '''
    batch_size = 30
    total = len(langsents)
    n_batches = total // batch_size + 1
    progress = IntProgress(description=f'{lang}:', min=1, max=n_batches + 1, style= {'description_width': 'initial'})
    display(progress)

    translated = []
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        stop = start + batch_size
        # Slicing past the end is harmless, so one slice serves both the
        # full batches and the final partial batch.
        text = '\n'.join(langsents[start:stop])
        if stop >= total and not text:
            # Nothing left to send in the trailing batch.
            break
        reply = translator.translate(text, src='en', dest=lang).text
        translated.extend(reply.split('\n'))
        time.sleep(randint(9, 30))
        progress.value += 1
    return translated



In [4]:
def writesents(sents, lang, category):
    '''Save `sents` as UTF-8 JSON to `<HOMEPATH><category>_<lang>_sents.json`.'''
    target = f'{HOMEPATH}{category}_{lang}_sents.json'
    with open(target, mode='w', encoding='utf8') as out:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(sents, out, ensure_ascii=False)

In [5]:
def loadsents(lang, category):
    '''Load the JSON content of `<HOMEPATH><category>_<lang>_sents.json` (UTF-8).'''
    source = f'{HOMEPATH}{category}_{lang}_sents.json'
    with open(source, encoding='utf8') as src:
        loaded = json.load(src)
    return loaded


In [6]:
def sentmerger(cleansents):
    '''Join tokenised sentences back into plain strings before translation.

    cleansents: iterable of sentences, each a sequence of tokens whose first
        field is the surface text.
    Returns one string per sentence. A space is inserted before a token that
    starts with a letter, digit, hyphen, bracket or angle bracket; other
    tokens (e.g. punctuation) are glued to the preceding text.
    '''
    needs_space = re.compile(r'(?i)[\[\]<>a-z0-9-]+|<\$%\|')
    merged = []
    for tokens in cleansents:
        pieces = []
        started = False  # becomes True once any non-empty text was emitted
        for token in tokens:
            word = token[0]
            if started and needs_space.match(word):
                pieces.append(' ')
            pieces.append(word)
            started = started or bool(word)
        # A single pass collapsing double spaces, exactly one replace call.
        merged.append(''.join(pieces).replace('  ', ' '))
    return merged

In [7]:
def pack_translate(source, category):
    '''Translate `source` into every target language and store the results.

    For each language the translation is produced with translating(), written
    with writesents() and collected. The collected lists are then zipped into
    a table that is saved under HOMEPATH as CSV and as a pickle.

    source: list of English sentences.
    category: prefix used in every output file name.
    Returns the pandas DataFrame with columns en, ru, pl, be, uk, bg, sl, cs.
    '''
    languages = ['ru', 'pl', 'be', 'uk', 'bg', 'sl', 'cs']
    dataset = [source]
    for language in languages:
        resultsents = None
        # Retry until translating() returns. Testing `is None` instead of
        # truthiness keeps an empty result from retrying forever.
        while resultsents is None:
            try:
                resultsents = translating(source, language)
            except Exception as err:
                # `except Exception` (not a bare except) lets KeyboardInterrupt
                # break out of the retry loop.
                print(f'Connection time out. Retry... ({err!r})')
                time.sleep(5)
        writesents(resultsents, language, category)
        dataset.append(resultsents)
    # zip() stops at the shortest list, so a language that came back with
    # fewer lines silently shortens the whole table.
    df = pd.DataFrame(zip(*dataset), columns=['en'] + languages)
    df.to_csv(f'{HOMEPATH}{category}_{len(df)}sents.csv', index=False)
    with open(f'{HOMEPATH}{category}_{len(df)}sents', 'wb') as file:
        pickle.dump(df, file)
    return df

Main dataset loading

In [8]:
'''Download and prepare WikiNERen'''
# Read the raw WikiNER file, one list entry per line.
with open(f'{HOMEPATH}aij-wikiner-en-wp2', encoding='utf8') as file:
    data = file.readlines()

# Replace every line by its whitespace-separated tokens, each token split
# into its '|'-separated fields.
for i, line in enumerate(data):
    data[i] = [t.split('|') for t in line.strip().split()]

Zero version (straight translations)

In [45]:
# Shuffle the sentences in place.
# NOTE(review): no random seed is set, so the sample selected from `data` below differs on every run.
shuffle(data)

In [9]:
'''Version B1 sents'''
# Collect up to 10000 sentences whose tags are limited to PER/LOC/ORG/O and
# append a numeric class marker (|0|, |1| or |2|) to every entity token.
dct = {'I-PER': 0, 'I-LOC': 1, 'I-ORG': 2}
cleansents = []
for sent in data:
    if len(cleansents) >= 10000:
        break
    labels = {t[2] for t in sent}
    if labels.issubset({'I-PER', 'I-LOC', 'I-ORG', 'O'}):
        # Mark copies of the tokens so that `data` itself stays untouched:
        # re-running this cell, or running another selection cell on the same
        # `data`, then does not stack markers on the same token.
        marked = []
        for token in sent:
            token = list(token)
            if token[2] in dct:
                token[0] = f'{token[0]} |{dct[token[2]]}|'
            marked.append(token)
        cleansents.append(marked)

print(len(cleansents))

10000


In [10]:
# Turn the selected token lists into plain sentence strings.
zerosents = sentmerger(cleansents)

In [11]:
# Preview the first five merged sentences.
zerosents[:5]

['',
 'In the end, for anarchist historian Daniel |0| Guerin |0|" Some anarchists are more individualistic than social, some more social than individualistic.',
 'From this climate William |0| Godwin |0| developed what many consider the first expression of modern anarchist thought.',
 'Godwin |0| was, according to Peter |0| Kropotkin |0|," the first to formulate the political and economical conceptions of anarchism, even though he did not give that name to the ideas developed in his work", while Godwin |0| attached his anarchist ideas to an early Edmund |0| Burke |0|.',
 "Proudhon |0|'s followers, the mutualists, opposed Marx |0|'s state socialism, advocating political abstentionism and small property holdings."]

In [13]:
# Store the English sentences as ZERO_en_sents.json under HOMEPATH.
writesents(zerosents, 'en', 'ZERO')

In [None]:
# Translate the sentences into all target languages; output files get the 'ZEROX' prefix.
zerodata = pack_translate(zerosents, 'ZEROX')

In [59]:
# Export the translated table to Excel.
# NOTE(review): the row count 9947 is hard-coded in the file name, whereas pack_translate() derives its file names from len(df) — confirm they agree.
zerodata.to_excel(f'{HOMEPATH}ZEROX_9947sentsT.xlsx')

B2 version

In [11]:
# Select sentences for the PER set and tag person tokens with ' |0|' (in place, on `data`).
cleansents = []
for i in range(len(data)):
    labels = Counter((t[2] for t in data[i]))
    # Keep sentences with exactly two distinct tags and at most two I-PER tokens.
    # A Counter returns 0 for a missing key, so sentences without I-PER also pass
    # here; they are discarded below because `app` stays False.
    if len(labels) == 2 and labels['I-PER'] <= 2:
        app = False
        for j in range(len(data[i])):
            try:
                # Person tokens ending in "'s" are left unmarked.
                if data[i][j][2] == 'I-PER' and data[i][j][0].endswith("'s"):
                    continue
                # Person token directly followed by a token whose second field is 'VBD':
                # mark it and accept the sentence.
                if data[i][j][2] == 'I-PER' and data[i][j + 1][1] == 'VBD':
                    data[i][j][0] += ' |0|'
                    if j - 1 >= 0:
                        # Also mark a preceding person token; it may already have been
                        # marked by the elif branch on the previous iteration.
                        if data[i][j - 1][2] == 'I-PER':
                            data[i][j - 1][0] += ' |0|'
                    app = True
                elif data[i][j][2] == 'I-PER':
                    data[i][j][0] += ' |0|'
                
            except IndexError:
                # Raised e.g. by data[i][j + 1] when j is the last index; that token is skipped.
                continue
        if app:
            cleansents.append(data[i])

print(len(cleansents))

10936


In [12]:
# Merge the selected PER token lists into sentence strings.
persres = sentmerger(cleansents)

In [14]:
# Collapse a doubled person marker into a single one (one replace pass per sentence).
for i, sentence in enumerate(persres):
    persres[i] = sentence.replace('|0| |0|', '|0|')

In [13]:
# Replace the '[???]' placeholder by '[man]' or '[woman]'.
# Default: every ninth sentence (index divisible by 9) gets '[woman]', the rest '[man]'.
# A gendered pronoun in the sentence overrides the default. Whole-word,
# case-insensitive matching is used so that words such as "this", "history",
# "here" or "hero" no longer count as pronouns.
for i in range(len(persres)):
    c = '[man]' if i % 9 else '[woman]'
    if re.search(r'(?i)\bhis\b', persres[i]):
        c = '[man]'
    elif re.search(r'(?i)\b(?:her|hers|herself)\b', persres[i]):
        c = '[woman]'
    persres[i] = persres[i].replace('[???]', c)

In [15]:
'''PER recording'''
# Save the English PER sentences as UTF-8 JSON (non-ASCII characters are written unescaped).
with open(f'{HOMEPATH}perssents.json', 'w', encoding='utf8') as file:
    json.dump(persres, file, ensure_ascii=False)

In [64]:
'''PER reading'''
# Reload the English PER sentences saved in perssents.json.
with open(f'{HOMEPATH}perssents.json', encoding='utf8') as file:
    persres = json.load(file)

In [None]:
# Translate the PER sentences; output files get the 'ZERORIGHT' prefix.
# NOTE(review): the loading cell further down reads category 'PER' files via loadsents() — confirm where those files come from.
persents = pack_translate(persres, 'ZERORIGHT')

Joker edits

In [3]:
# Load the stored PER translations for every language (same load order as before).
rusents, plsents, uksents, besents, bgsents, slsents, cssents = (
    loadsents(code, 'PER') for code in ('ru', 'pl', 'uk', 'be', 'bg', 'sl', 'cs')
)


In [4]:
# Read the English PER sentences. The file is written as UTF-8 with
# ensure_ascii=False, so the encoding is given explicitly instead of relying
# on the platform default.
with open(f'{HOMEPATH}perssents.json', encoding='utf8') as file:
    ensents = json.load(file)

In [9]:
# Align the translations row by row; tuple order is pl, be, uk, ru, bg, sl, cs, en.
# zip() stops at the shortest list.
persents = list(zip(plsents, besents, uksents, rusents, bgsents, slsents, cssents, ensents))

In [10]:
# Keep only rows in which every checked translation contains at least one
# bracketed placeholder and every placeholder found is one of the expected
# base forms for that language (inflected variants cause the row to be dropped).
# NOTE(review): the Bulgarian sentence (sents[4]) is not checked here.
resclean = []
for sents in persents:
    # Polish (index 0)
    pol = set(re.findall(r'\[(?:człowiek.*?|kobiet.*?|mężczyzn.*?)\]', sents[0].lower()))
    if not pol.issubset({'[człowiek]', '[kobieta]', '[mężczyzna]'}) or not pol:
        continue
    # Belarusian (index 1)
    bel = set(re.findall(r'\[(?:чалавек.*?|жанчын.*?|мужчын.*?)\]', sents[1].lower()))
    if not bel.issubset({'[чалавек]', '[жанчына]', '[мужчына]'}) or not bel:
        continue
    # Ukrainian (index 2)
    uk = set(re.findall(r'\[(?:людин.*?|жінк.*?|чоловік.*?)\]', sents[2].lower()))
    if not uk.issubset({'[людина]', '[жінка]', '[чоловік]'}) or not uk:
        continue
    # Russian (index 3)
    rus = set(re.findall(r'\[(?:человек.*?|женщин.*?|мужчин.*?)\]', sents[3].lower()))
    if not rus.issubset({'[человек]', '[женщина]', '[мужчина]'}) or not rus:
        continue
    # Slovenian (index 5)
    sl = set(re.findall(r'\[(?:človek.*?|žensk.*?|mož.*?|mošk.*?)\]', sents[5].lower()))
    if not sl.issubset({'[človek]', '[ženska]', '[mož]', '[moški]'}) or not sl:
        continue
    # Czech (index 6)
    cs = set(re.findall(r'\[(?:člověk.*?|žen.*?|muž.*?)\]', sents[6].lower()))
    if not cs.issubset({'[člověk]', '[žena]', '[muž]'}) or not cs:
        continue
    resclean.append(sents)

In [11]:
# Replace the translated placeholders by the language-independent tags '[male]' / '[female]'.
# Each language has one substitution for the person/man forms and one for the woman form.
for i in range(len(resclean)):
    # Tuples from zip() are immutable; convert the row to a list first.
    resclean[i] = list(resclean[i])
    resclean[i][0] = re.sub(r'(?i)\[(?:człowiek|mężczyzna)\]', '[male]', resclean[i][0])
    resclean[i][0] = re.sub(r'(?i)\[kobieta\]', '[female]', resclean[i][0])
    resclean[i][1] = re.sub(r'(?i)\[(?:чалавек|мужчына)\]', '[male]', resclean[i][1])
    resclean[i][1] = re.sub(r'(?i)\[жанчына\]', '[female]', resclean[i][1])
    resclean[i][2] = re.sub(r'(?i)\[(?:людина|чоловік)\]', '[male]', resclean[i][2])
    resclean[i][2] = re.sub(r'(?i)\[жінка\]', '[female]', resclean[i][2])
    resclean[i][3] = re.sub(r'(?i)\[(?:человек|мужчина)\]', '[male]', resclean[i][3])
    resclean[i][3] = re.sub(r'(?i)\[женщина\]', '[female]', resclean[i][3])
    # Bulgarian (index 4): unlike the other languages these patterns are
    # case-sensitive and accept any suffix after the stem.
    resclean[i][4] = re.sub(r'\[(?:човек.*?|мъж.*?)\]', '[male]', resclean[i][4])
    resclean[i][4] = re.sub(r'\[жена.*?\]', '[female]', resclean[i][4])
    resclean[i][5] = re.sub(r'(?i)\[(?:človek|mož|moški)\]', '[male]', resclean[i][5])
    resclean[i][5] = re.sub(r'(?i)\[ženska\]', '[female]', resclean[i][5])
    resclean[i][6] = re.sub(r'(?i)\[(?:člověk|muž)\]', '[male]', resclean[i][6])
    resclean[i][6] = re.sub(r'(?i)\[žena\]', '[female]', resclean[i][6])


In [13]:
# English column (index 7): map '[man]' / '[woman]' to '[male]' / '[female]'.
for i, row in enumerate(resclean):
    row[7] = row[7].replace('[man]', '[male]').replace('[woman]', '[female]')

In [14]:
# Build the final PER table (column order matches the tuple order of `persents`) and save it as CSV.
df = pd.DataFrame(resclean, columns=['pl', 'be', 'uk', 'ru', 'bg', 'sl', 'cs', 'en'])
df.to_csv(f'{HOMEPATH}PER_sents_en.csv', index=False)

Locations

In [9]:
'''LOCATIVE'''
# Select sentences with exactly two distinct tags and exactly one I-LOC token
# in which that token directly follows the word 'in'; the location token is
# marked with ' |0|' in place on `data`.
cleansents = []
for i in range(len(data)):
    labels = Counter((t[2] for t in data[i]))
    if len(labels) == 2 and labels['I-LOC'] == 1:
        for j in range(len(data[i])):
            try:
                if data[i][j][0] == 'in' and data[i][j + 1][2] == 'I-LOC':
                    # data[i][j][0] == ''
                    data[i][j + 1][0] += ' |0|'
                    cleansents.append(data[i])
            except IndexError:
                # data[i][j + 1] does not exist when 'in' is the last token.
                continue

print(len(cleansents))

1897


In [10]:
'''NOMINATIVE'''
# Select sentences with exactly one I-LOC token that directly follows or
# directly precedes a token whose second field is VBZ, VBP or VBN.
# NOTE(review): the reset of `cleansents` is commented out, so matches are appended to whatever list already exists.
# NOTE(review): both branches can fire for the same sentence (verb before and after the location), which appends it twice and adds the marker twice.
# cleansents = []
for i in range(len(data)):
    labels = Counter((t[2] for t in data[i]))
    if len(labels) == 2 and labels['I-LOC'] == 1:
        for j in range(len(data[i])):
            try:
                # verb-like token followed by the location
                if (data[i][j][1] in {'VBZ', 'VBP', 'VBN'} and data[i][j + 1][2] == 'I-LOC'):
                    data[i][j + 1][0] += ' |0|'
                    cleansents.append(data[i])
                # location followed by a verb-like token
                elif (data[i][j][2] == 'I-LOC' and data[i][j + 1][1]  in {'VBZ', 'VBP', 'VBN'}):
                    data[i][j][0] += ' |0|'
                    cleansents.append(data[i])
            except IndexError:
                # data[i][j + 1] does not exist for the last token.
                continue

print(len(cleansents))

4350


In [11]:
# Merge the selected location token lists into sentence strings.
locres = sentmerger(cleansents)

In [31]:
'''Only for nominative'''
locsents = []
for sent in locres:
    if 'from the [???]' in sent or 'of the [???]' in sent or 'in [???]' in sent or 'from [???]' in sent or 'of [???]' in sent:
        continue
    locsents.append(sent)

In [32]:
# Number of sentences kept after filtering.
len(locsents)

1850

In [33]:
# Write the filtered sentences to a text file, one per line.
with open(f'{HOMEPATH}locwikisentsNOM.txt', 'w', encoding='utf8') as file:
    for sent in locsents:
        file.write(f'{sent}\n')

In [22]:
# Read the sentences back, stripping trailing whitespace from every line.
with open(f'{HOMEPATH}locwikisentsNOM.txt', encoding='utf8') as file:
    locsents = [line.rstrip() for line in file]

In [23]:
# Load the previously stored table from 1850_nomlocsents.csv.
locdata = pd.read_csv(f'{HOMEPATH}1850_nomlocsents.csv')

In [27]:
# Remove the 'Unnamed: 0' column. errors='ignore' makes the cell safe to
# re-run after the column is already gone.
locdata.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [28]:
# Save the cleaned table without the index column.
locdata.to_csv(f'{HOMEPATH}LOC_sents_nom_en.csv', index=False)

In [8]:
'''Only for locative'''
for i in range(len(locsents)):
    locsents[i] = locsents[i].replace('[loc]', '[in ???]')

In [9]:
# Preview the first two sentences.
locsents[:2]

['The majority of common wet-cured ham available [in ???] supermarkets is of the " city ham " variety, [ citation needed ] in which brine is injected into the meat for a very rapid curing suitable for mass market.',
 'This dubbing has gathered a cult following [in ???] for that precise reason, although many anime fans consider it highly disrespectful to the original work.']

In [None]:
# Translate the merged location sentences; output files get the 'ZEROLOC' prefix.
locdataset = pack_translate(locres, 'ZEROLOC')

In [14]:
# Collapse doubled markers inside every cell. Series.replace only matches
# whole cell values, so the substring replacement has to go through the
# .str accessor (literal match, not a regex — '|' would otherwise mean "or").
locdataset = locdataset.apply(lambda x: x.str.replace('|0| |0|', '|0|', regex=False))

In [16]:
# Pickle the cleaned table; the context manager closes (and flushes) the
# file handle, which the bare open() call left dangling.
with open(f'{HOMEPATH}ZEROLOC_4350sents', 'wb') as file:
    pickle.dump(locdataset, file)

Organisations

In [26]:
# Select sentences with exactly two distinct tags and at most two I-ORG
# tokens in which an organisation token either follows the word 'is' or is
# directly followed by a token whose second field is VBZ, VBP or VBN.
# Each sentence is added at most once.
orgsents = []
for tokens in data:
    labels = Counter(t[2] for t in tokens)
    if len(labels) != 2 or labels['I-ORG'] > 2:
        continue
    for j, token in enumerate(tokens):
        if token[2] != 'I-ORG':
            continue
        if (j >= 1 and tokens[j - 1][0] == 'is') or (
                j + 1 < len(tokens) and tokens[j + 1][1] in {'VBZ', 'VBP', 'VBN'}):
            orgsents.append(tokens)
            break

print(len(orgsents))

1213


In [27]:
# Rebuild every selected sentence as a string: organisation tokens get a
# ' |0| ' marker, the listed punctuation is glued to the previous token and
# everything else is preceded by a space.
for i, tokens in enumerate(orgsents):
    parts = []
    for token in tokens:
        if token[2] == 'I-ORG':
            parts.append(' ' + token[0] + ' |0| ')
        elif token[0] in '.,!?;:%&*)':
            parts.append(token[0])
        else:
            parts.append(f' {token[0]}')
    sent = ''.join(parts)
    orgsents[i] = sent

orgsents[:15]

[' Summerhill |0|  is often cited as an example of anarchism in practice.',
 ' Studies by the Hadley |0|  Centre |0|  have investigated the relative ( generally warming) effect of albedo change and ( cooling) effect of carbon sequestration on planting forests.',
 ' Democrats |0|  are still the majority party in both houses of the legislature.',
 ' Sanmina-SCI |0|  has a large presence in the area.',
 ' Eight days later, the Council |0|  convenes once more.',
 ' The General Council |0|  is also responsible for proposing and passing laws.',
 ' The Supreme |0|  Court |0|  hears civil appeals and may in its discretion hear criminal appeals.',
 " The Anchorage |0|  Opera |0|  is currently the state 's only professional opera company, though there are several volunteer and semi-professional organizations in the state as well.",
 ' While there has been some research on sustainability using GMO crops, at least one hyped and prominent multi-year attempt by Monsanto |0|  Company |0|  has been un

In [14]:
# Keep only sentences in which the placeholder does not follow one of the
# listed prepositions.
blocked = ('of the [???]', 'by [???]', 'of [???]', 'in the [???]', 'in [???]')
ressents = []
for sent in orgsents:
    if not any(marker in sent for marker in blocked):
        ressents.append(sent)
len(ressents)

1095

In [15]:
# Replace the '[???]' placeholder by a generic organisation noun:
# plural wording (' are ', ' their ') -> '[companies]', ' party ' -> '[company]',
# otherwise '[club]' when the index is not divisible by 3 and '[society]' when it is.
for i in range(len(ressents)):
    if ' are ' in ressents[i] or ' their ' in ressents[i]:
        ressents[i] = ressents[i].replace('[???]', '[companies]')
    elif ' party ' in ressents[i]:
        ressents[i] = ressents[i].replace('[???]', '[company]')
    elif i % 3:
        ressents[i] = ressents[i].replace('[???]', '[club]')
    else:
        ressents[i] = ressents[i].replace('[???]', '[society]')

In [16]:
# Preview the first fifteen sentences after placeholder replacement.
ressents[:15]

[' [society] is often cited as an example of anarchism in practice.',
 ' Studies by the [club] have investigated the relative ( generally warming) effect of albedo change and ( cooling) effect of carbon sequestration on planting forests.',
 ' [companies] are still the majority party in both houses of the legislature.',
 ' [society] has a large presence in the area.',
 ' Eight days later, the [club] convenes once more.',
 ' The General [club] is also responsible for proposing and passing laws.',
 ' The [society] hears civil appeals and may in its discretion hear criminal appeals.',
 " The [companies] is currently the state 's only professional opera company, though there are several volunteer and semi-professional organizations in the state as well.",
 ' [companies] accredits standards that are developed by representatives of standards developing organizations, government agencies, consumer groups, companies, and others.',
 ' [society] estimates that more than 80 percent of females and 

In [None]:
# Translate the organisation sentences; output files get the 'ZEROORG' prefix.
# NOTE(review): this rebinds `orgsents` from a list of sentences to the DataFrame returned by pack_translate().
orgsents = pack_translate(orgsents, 'ZEROORG')

In [None]:
# Save the translated table as CSV; the row count is part of the file name.
orgsents.to_csv(f'{HOMEPATH}ORG_newsents_{len(orgsents)}.csv', index=False)

Translation of org sents

In [29]:
# Read the organisation sentences from !ORGS.txt, one stripped line per entry.
with open(f'{HOMEPATH}!ORGS.txt', encoding='utf8') as file:
    orgres = [line.strip() for line in file]

orgres[:3]

['[???] is often cited as an example of anarchism in practice.',
 '[???] has a large presence in the area.',
 'The [???] is also responsible for proposing and passing laws.']

In [None]:
# Translate the organisation sentences into each target language, one call per language.
# NOTE(review): translating() expects a list of strings; confirm `orgsents` is still a list here and not the DataFrame assigned from pack_translate().
plsents = translating(orgsents, 'pl')
rusents = translating(orgsents, 'ru')
besents = translating(orgsents, 'be')
uksents = translating(orgsents, 'uk')
bgsents = translating(orgsents, 'bg')
slsents = translating(orgsents, 'sl')
cssents = translating(orgsents, 'cs')

In [116]:
# Store every translation as ORG_<lang>_sents.json (same write order as before).
for code, translated in (
    ('pl', plsents),
    ('ru', rusents),
    ('be', besents),
    ('uk', uksents),
    ('bg', bgsents),
    ('sl', slsents),
    ('cs', cssents),
):
    writesents(translated, code, 'ORG')

In [117]:
# Inspect the last five Ukrainian translations.
uksents[-5:]

[' [???] є найбільшим у світі розробником стандартів.',
 ' Другим найбільшим постачальником є \u200b\u200b[???].',
 ' Іммігранти розходяться за своїми політичними поглядами; однак [???] вважається набагато сильніше серед іммігрантів загалом.',
 ' Сьогодні [???] є провідною міжнародною організацією у світі в своїй галузі, і її стандарти прийняті як національні стандарти її членами.',
 ' Кожен член [???] обирається терміном на вісім років і може бути переобраний на один або кілька наступних термінів.']

In [34]:
# Read organisation names: one row per line, fields separated by tabs.
with open(f'{HOMEPATH}ORGS_from_wiki.txt', encoding='utf8') as file:
    orgs = [line.rstrip().split('\t') for line in file]

In [29]:
# Load the organisation names from ORG_239.csv.
# NOTE(review): this rebinds `orgs`, which the cell reading ORGS_from_wiki.txt defines as a list of rows, to a DataFrame.
orgs = pd.read_csv(f'{HOMEPATH}ORG_239.csv')

In [30]:
# Load the stored ORG translations for every language (same load order as before).
rusents, plsents, uksents, besents, bgsents, slsents, cssents = (
    loadsents(code, 'ORG') for code in ('ru', 'pl', 'uk', 'be', 'bg', 'sl', 'cs')
)

In [36]:
# Fill the '[???]' placeholder in the pl/be/uk/ru sentences with organisation
# names, cycling through `orgs` and wrapping around at its end.
# NOTE(review): orgs[c][0] treats `orgs` as a list of rows (as built from ORGS_from_wiki.txt); on the DataFrame read from ORG_239.csv, orgs[c] is a column lookup instead — confirm which definition is active.
# NOTE(review): the bg/sl/cs sentences are not modified here.
c = 0
for i in range(len(rusents)):
    if c >= len(orgs):
        c = 0
    plsents[i] = plsents[i].replace('[???]', f'|{orgs[c][0]}|')
    besents[i] = besents[i].replace('[???]', f'|{orgs[c][1]}|')
    uksents[i] = uksents[i].replace('[???]', f'|{orgs[c][2]}|')
    rusents[i] = rusents[i].replace('[???]', f'|{orgs[c][3]}|')
    c += 1

In [39]:
# Number of organisation sentences.
len(orgsents)

348

In [40]:
# Align translations and the English source row by row; tuple order is pl, be, uk, ru, bg, sl, cs, en.
orgdata = list(zip(plsents, besents, uksents, rusents, bgsents, slsents, cssents, orgsents))
print(orgdata[:4])

[('[???] jest często przytaczany jako przykład anarchizmu w praktyce.', '[???] часта прыводзіцца ў якасці прыкладу анархізму на практыцы.', '[???] часто наводять як приклад анархізму на практиці.', '[???] часто приводят как пример анархизма на практике.', '[???] често се цитира като пример за анархизъм на практика.', '[???] se pogosto navaja kot primer anarhizma v praksi.', '[???] je často uváděn jako příklad anarchismu v praxi.', ' [???] is often cited as an example of anarchism in practice.'), (' Generał [???] jest również odpowiedzialny za proponowanie i uchwalanie ustaw.', ' Генерал [???] таксама адказвае за прапановы і прыняцце законаў.', ' Генерал [???] також відповідає за пропозицію та ухвалення законів.', ' Генерал [???] также отвечает за предложение и принятие законов.', ' Генералът [???] също отговаря за предлагането и приемането на закони.', ' General [???] je pristojen tudi za predlaganje in sprejemanje zakonov.', ' Generál [???] je také zodpovědný za navrhování a přijímání

In [48]:
# Replace '[???]' in the seven translated columns of each row with the names
# of one randomly chosen organisation (row `c` of `orgs`, column `j` matching
# the language position). The English column (index 7) is left unchanged.
# NOTE(review): no random seed is set, so the assignment differs between runs.
for i in range(len(orgdata)):
    c = randint(0, len(orgs) - 1)
    orgdata[i] = list(orgdata[i])
    for j in range(7):
        # .iloc[c, j] is explicit positional access; .iloc[c][j] relied on the
        # deprecated integer-as-position fallback of Series.__getitem__.
        orgdata[i][j] = orgdata[i][j].replace('[???]', f'|{orgs.iloc[c, j]}|')

In [49]:
# Show the first four rows after the organisation names were filled in.
print(orgdata[:4])

[['|Amnesty International| jest często przytaczany jako przykład anarchizmu w praktyce.', '|Міжнародная амністыя| часта прыводзіцца ў якасці прыкладу анархізму на практыцы.', '|Amnesty International| часто наводять як приклад анархізму на практиці.', '|Amnesty International| часто приводят как пример анархизма на практике.', '|Амнести Интернешънъл| често се цитира като пример за анархизъм на практика.', '|Amnesty International| se pogosto navaja kot primer anarhizma v praksi.', '|Amnesty International| je často uváděn jako příklad anarchismu v praxi.'], [' Generał |Greenpeace| jest również odpowiedzialny za proponowanie i uchwalanie ustaw.', ' Генерал |Greenpeace| таксама адказвае за прапановы і прыняцце законаў.', ' Генерал |Грінпіс| також відповідає за пропозицію та ухвалення законів.', ' Генерал |Гринпис| также отвечает за предложение и принятие законов.', ' Генералът |Грийнпийс| също отговаря за предлагането и приемането на закони.', ' General |Greenpeace| je pristojen tudi za pred

In [50]:
# Number of aligned organisation rows.
len(orgdata)

348

In [42]:
# Build the final ORG table (column order matches the tuple order of `orgdata`) and save it as CSV.
orgtable = pd.DataFrame(orgdata, columns=['pl', 'be', 'uk', 'ru', 'bg', 'sl', 'cs', 'en'])
orgtable.to_csv(f'{HOMEPATH}ORG_sents_en.csv', index=False)

In [40]:
# Load the table of location names from LOC_2000.csv.
locs = pd.read_csv(f'{HOMEPATH}LOC_2000.csv')

In [41]:
# Preview the first rows.
locs.head()

Unnamed: 0.1,Unnamed: 0,pl,be,uk,ru,bg,sl,cs
0,0,Indie,Індыя,Індія,Индия,Индия,Indija,Indie
1,1,Hiszpania,Іспанія,Іспанія,Испания,Испания,Španija,Španělsko
2,2,Włochy,Італія,Італія,Италия,Италия,Italija,Itálie
3,3,Nur-Sułtan,Акмала,Нур-Султан,Нур-Султан,Нур Султан,Nursultan,Nur-Sultan
4,4,Aleksandria,Аляксандрыя,Александрія,Александрия,Александрия,Aleksandrija,Alexandrie


In [42]:
# Remove the 'Unnamed: 0' column. errors='ignore' makes the cell safe to
# re-run after the column is already gone.
locs.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [43]:
# Strip the 'Горад ' prefix from the Belarusian names. The vectorised literal
# replace changes exactly the values the row loop changed and leaves missing
# values untouched instead of raising on them.
locs['be'] = locs['be'].str.replace('Горад ', '', regex=False)

In [52]:
# Remove parenthesised additions such as ' (...)' from every name in every column.
def drop_parenthetical(name):
    return re.sub(r' \(.+?\)', '', name)

for column in locs.columns:
    locs[column] = locs[column].map(drop_parenthetical)

In [53]:
# Spot-check ten random rows (unseeded, so the sample differs between runs).
locs.sample(10)

Unnamed: 0,pl,be,uk,ru,bg,sl,cs
2001,Opole,Опельн,Ополе,Ополе,Ополе,Opole,Opolí
1889,Bristol,Брысталь,Бристоль,Бристоль,Бристъл,Bristol,Bristol
476,Uetersen,Uetersen,Ютерзен,Итерзен,Ютерзен,Uetersen,Uetersen
1656,Hiszpania,Каралеўства Іспанія,Іспанія,Испания,Испания,Španija,Španělsko
1884,Słowacja,Славацкая рэспубліка,Словаччина,Словакия,Словакия,Slovaška,Slovensko
1535,Wagharszapat,Вагаршапат,Вагаршапат,Вагаршапат,Вагаршапат,Ečmiadzin,Vagharšapat
30,Korea Północna,КНДР,Північна Корея,Корейская Народно-Демократическая Республика,Северна Корея,Severna Koreja,Severní Korea
233,Nukuʻalofa,Нукуалофа,Нукуалофа,Нукуалофа,Нукуалофа,Nukuʻalofa,Nuku'alofa
121,Bissau,Бісау,Бісау,Бисау,Бисау,Bissau,Bissau
1347,Bamako,Бамака,Бамако,Бамако,Бамако,Bamako,Bamako


In [57]:
# Inspect the row with index label 262.
locs.loc[262]

pl           Ho Chi Minh
be                Сайгон
uk               Хошимін
ru               Хошимин
bg               Хошимин
sl              Hošiminh
cs    Ho Či Minovo Město
Name: 262, dtype: object

In [60]:
# Drop rows whose Russian name repeats an earlier row, keeping the first occurrence (in place).
locs.drop_duplicates(subset=['ru'], keep='first', inplace=True)

In [61]:
# Number of locations left after de-duplication.
len(locs)

1263

In [63]:
# Pickle the de-duplicated table; the context manager closes (and flushes)
# the file handle, which the bare open() call left dangling.
with open(f'{HOMEPATH}1263_locs', 'wb') as file:
    pickle.dump(locs, file)

In [62]:
# Save the table without the index, like the other to_csv calls in this
# notebook; writing the index is what produces the 'Unnamed: 0' columns that
# have to be dropped again after reading such a file back.
locs.to_csv(f'{HOMEPATH}LOC_1263.csv', index=False)