## Entity separation

This notebook takes the CSV file from the previous step and separates the data into three 
CSV-s: one for singletons (entities with just one word), one for the first words
(all but the last word of multiword entities) and one for the last words (last
word of multiword entities)

In [17]:
import pandas as pd
ner_counts = pd.read_csv('ner_counts.csv')

In [18]:
ner_counts

Unnamed: 0.1,Unnamed: 0,entity,label,count
0,0,"('Ühendriigid',)",LOC,1
1,1,"('Leedusse',)",LOC,1
2,2,"('Leedu', 'välisministeeriumi')",ORG,1
3,3,"('Interfaxile',)",ORG,1
4,4,"('Bill', 'Clinton')",PER,2
...,...,...,...,...
4126,4126,"('Külli', 'Hansen', 'Sebra')",PER,1
4127,4127,"('Fancy',)",LOC,1
4128,4128,"('Fun', '&', 'Fancy')",ORG,4
4129,4129,"('Hinnad', 'Sebra', 'galeriist')",ORG,1


Separate the entites to three

In [45]:
singletons_map = {}
last_word_map = {}
first_words_map = {}

import ast 

for row in ner_counts.itertuples():
    entity = ast.literal_eval(row.entity)
    if len(entity) == 1:
        curr_map = singletons_map.get(entity,{})
        curr_count = curr_map.get(row.label,0) 
        curr_map[row.label] = row.count + curr_count
        singletons_map[entity] = curr_map
    else:
        last_word = entity[-1]
        curr_map = last_word_map.get(last_word,{})
        curr_count = curr_map.get(row.label,0) 
        curr_map[row.label] = curr_count + row.count
        last_word_map[last_word] = curr_map
        
        first_words = entity[:-1]
        curr_map = first_words_map.get(first_words,{})
        curr_count = curr_map.get(row.label,0) 
        curr_map[row.label] = row.count + curr_count
        first_words_map[first_words] = curr_map
        


In [46]:
singletons_map

{('Ühendriigid',): {'LOC': 1},
 ('Leedusse',): {'LOC': 1},
 ('Interfaxile',): {'ORG': 1},
 ('Leedu',): {'LOC': 49},
 ('Leedus',): {'LOC': 19},
 ('USA',): {'LOC': 53},
 ('Lääne-Euroopa',): {'LOC': 1},
 ('Jaapanil',): {'LOC': 1},
 ('Kanadal',): {'LOC': 1},
 ('Austraalial',): {'LOC': 1},
 ('Uus-Meremaal',): {'LOC': 1},
 ('Ungaril',): {'LOC': 1},
 ('Poolal',): {'LOC': 1},
 ('Tšehhil',): {'LOC': 1},
 ('OPEC',): {'ORG': 2},
 ('Londonis',): {'LOC': 4},
 ('Riyadh',): {'PER': 1},
 ('Viinis',): {'LOC': 1},
 ('Ühispanga',): {'ORG': 9},
 ('Norma',): {'ORG': 8},
 ('Tallinna',): {'LOC': 67},
 ('Mõis',): {'PER': 6},
 ('BNS-ile',): {'ORG': 14},
 ('Mõisa',): {'PER': 6},
 ('Voog',): {'PER': 2},
 ('Eesti',): {'LOC': 189},
 ('Soome',): {'LOC': 38},
 ('Kauppalehti',): {'ORG': 1},
 ('Soomet',): {'LOC': 2},
 ('Venemaa',): {'LOC': 42},
 ('Aiwa',): {'ORG': 1},
 ('Electrolux',): {'LOC': 1},
 ('Peterburgi',): {'LOC': 1},
 ('Moskvasse',): {'LOC': 4},
 ('EL',): {'ORG': 9},
 ('Vene',): {'LOC': 39},
 ('Soomes',): {'

In [47]:
first_words_map

{('Leedu',): {'ORG': 6},
 ('Bill',): {'PER': 7},
 ('Saudi',): {'LOC': 3},
 ('Aivo',): {'PER': 8},
 ('Eesti',): {'ORG': 68, 'LOC': 10},
 ('Jüri',): {'PER': 35},
 ('Rein',): {'PER': 12},
 ('Tallinna',): {'ORG': 55, 'LOC': 8},
 ('Euroopa',): {'ORG': 24, 'LOC': 17},
 ('Läti',): {'ORG': 5},
 ('Ivar',): {'PER': 1},
 ('ASi', 'Lõhmus', 'Haavel', '&'): {'ORG': 1},
 ('Heiki',): {'PER': 9},
 ('Trigon',): {'ORG': 1},
 ('Mihkel',): {'PER': 5},
 ('Q',): {'ORG': 10},
 ('Ameerika',): {'LOC': 6, 'ORG': 1},
 ('Jaana',): {'PER': 1},
 ('Märten',): {'PER': 1},
 ('Andres',): {'PER': 21},
 ('ASis', 'Repo'): {'ORG': 1},
 ('Repo',): {'ORG': 1, 'PER': 1},
 ('Aivar',): {'PER': 8},
 ('Balti',): {'LOC': 6, 'ORG': 1},
 ('Karin',): {'PER': 7},
 ('Kristjan-Erik',): {'PER': 3},
 ('Eesti', 'Väärtpaberite'): {'ORG': 1},
 ('Tartu',): {'ORG': 15, 'LOC': 2},
 ('Vambola',): {'PER': 2},
 ('Üllar',): {'PER': 3},
 ('Kolbakov', 'Üllar'): {'PER': 1},
 ('Lõuna-Eesti',): {'ORG': 3},
 ('Eha',): {'PER': 2},
 ('ASi',): {'ORG': 2},
 (

In [60]:
# making first words mapping keys strings to allow convenient dataframe creation
first_words_map = {str(k):v for k,v in first_words_map.items()}

In [44]:
last_word_map

{'välisministeeriumi': {'ORG': 3},
 'Clinton': {'PER': 3},
 'Araabia': {'LOC': 3},
 'Kanguse': {'PER': 1},
 'Telekomi': {'ORG': 5},
 'Mõisa': {'PER': 3},
 'Voogi': {'PER': 1},
 'Vee': {'ORG': 2},
 'Liidu': {'ORG': 15, 'LOC': 18},
 'statistikaamet': {'ORG': 1},
 'Soojuse': {'ORG': 4},
 'Virkus': {'PER': 1},
 'Viisemann': {'ORG': 1},
 'Kivimaa': {'PER': 2},
 'Kapitali': {'ORG': 3},
 'Pärnoja': {'PER': 2},
 "GSM'il": {'ORG': 1},
 'GSMi': {'ORG': 7},
 'Ühendriikidesse': {'LOC': 1},
 'Adusoni': {'PER': 1},
 'Panga': {'ORG': 11},
 'Ross': {'PER': 1},
 'Saarniit': {'PER': 2},
 'Keskpank': {'ORG': 1},
 'Vabrikud': {'ORG': 1},
 'Vabrikute': {'ORG': 1},
 'Karon': {'PER': 1},
 'riikides': {'LOC': 4},
 'Kruuda': {'PER': 1},
 'Vabrikutes': {'PER': 1},
 'Suurväli': {'PER': 2},
 'börsi': {'ORG': 1},
 'Keskdepositooriumiga': {'ORG': 1},
 'maakohus': {'ORG': 1},
 'Kolbakovile': {'PER': 1},
 'Meoselt': {'PER': 1},
 'Meose': {'PER': 1},
 'Arengupangale': {'ORG': 1},
 'Arengupanga': {'ORG': 2},
 'Mõttusel

creating three dataframes, filling missing values with 0

In [54]:
single_csv = pd.DataFrame.from_dict(singletons_map,orient='index')
single_csv = single_csv.fillna(0)
single_csv

Unnamed: 0,LOC,ORG,PER
Ühendriigid,1.0,0.0,0.0
Leedusse,1.0,0.0,0.0
Leedu,49.0,0.0,0.0
Leedus,19.0,0.0,0.0
USA,53.0,0.0,0.0
...,...,...,...
Elna,0.0,0.0,1.0
Puiskestvikku,0.0,0.0,1.0
Schwarzkopf,0.0,0.0,1.0
Pott,0.0,0.0,1.0


In [53]:
last_df = pd.DataFrame.from_dict(last_word_map,orient='index')
last_df = last_df.fillna(0)
last_df

Unnamed: 0,ORG,PER,LOC
välisministeeriumi,3.0,0.0,0.0
Telekomi,5.0,0.0,0.0
Vee,2.0,0.0,0.0
Liidu,15.0,0.0,18.0
statistikaamet,1.0,0.0,0.0
...,...,...,...
meri,0.0,0.0,4.0
vabariigi,0.0,0.0,1.0
Tammsaare,0.0,0.0,1.0
Liberty,0.0,0.0,1.0


In [62]:
first_df = pd.DataFrame.from_dict(first_words_map,orient='index')
first_df

Unnamed: 0,ORG,PER,LOC
"('Leedu',)",6.0,,
"('Eesti',)",68.0,,10.0
"('Tallinna',)",55.0,,8.0
"('Euroopa',)",24.0,,17.0
"('Läti',)",5.0,,
...,...,...,...
"('Aadria',)",,,2.0
"('Jugoslaavia',)",,,1.0
"('Utah', ""'"")",,,1.0
"('EchoStar', ',')",,,1.0


Saving results to CSV files

In [58]:
last_df.to_csv('data/last_counts.csv')

In [59]:
single_csv.to_csv('data/single_counts.csv')

In [63]:
first_df.to_csv('data/first_counts.csv')