## Entity separation

This notebook takes the CSV file from the previous step and separates the data into three 
CSV-s: one for singletons (entities with just one word), one for the first words
(all but the last word of multiword entities) and one for the last words (last
word of multiword entities)

In [14]:
import pandas as pd
ner_counts = pd.read_csv('ner_counts_5000_lemmas.csv')

In [15]:
ner_counts[:10]

Unnamed: 0.1,Unnamed: 0,entity,label,entity_count
0,0,"('Aive', 'Lauriste')",PER,2
1,1,"('Eminem',)",PER,19
2,2,"('Mathers', 'III')",PER,1
3,3,"('Slim', 'Shady')",PER,1
4,4,"('People',)",ORG,1
5,5,"('Hailie',)",PER,2
6,6,"('Mile',)",LOC,1
7,7,"('Jimmy', 'Smith')",PER,1
8,8,"('Detroit',)",LOC,14
9,9,"('Jimmy',)",PER,7


Separate the entites to three

In [16]:
singletons_map = {}
last_word_map = {}
first_words_map = {}

import ast 

for row in ner_counts.itertuples():
    entity = ast.literal_eval(row.entity)
    if len(entity) == 1:
        curr_map = singletons_map.get(entity,{})
        curr_count = curr_map.get(row.label,0) 
        curr_map[row.label] = row.entity_count + curr_count
        singletons_map[entity] = curr_map
    else:
        last_word = entity[-1]
        curr_map = last_word_map.get(last_word,{})
        curr_count = curr_map.get(row.label,0) 
        curr_map[row.label] = curr_count + row.entity_count
        last_word_map[last_word] = curr_map
        
        first_words = entity[:-1]
        curr_map = first_words_map.get(first_words,{})
        curr_count = curr_map.get(row.label,0) 
        curr_map[row.label] = row.entity_count + curr_count
        first_words_map[first_words] = curr_map
        


In [17]:
singletons_map

{('Eminem',): {'PER': 19},
 ('People',): {'ORG': 1, 'PER': 2},
 ('Hailie',): {'PER': 2},
 ('Mile',): {'LOC': 1},
 ('Detroit',): {'LOC': 14, 'ORG': 4},
 ('Jimmy',): {'PER': 7},
 ('Ameerika',): {'LOC': 294},
 ('Jimmyl',): {'PER': 1},
 ('Grazer',): {'PER': 1},
 ('Hanson',): {'PER': 13, 'ORG': 1},
 ('Hollywood',): {'LOC': 39},
 ('STING',): {'ORG': 1, 'PER': 1},
 ('Sting',): {'PER': 32},
 ('Trudie',): {'PER': 16},
 ('Lock',): {'PER': 1},
 ('Ritchiele',): {'PER': 1},
 ('Coco',): {'PER': 1},
 ('Giacomo',): {'PER': 1},
 ('Mickey',): {'PER': 1},
 ('Jake',): {'PER': 1},
 ('Worcestershire',): {'LOC': 1},
 ('Stingi-Tomelty',): {'LOC': 1},
 ('Styleri-Tomelty',): {'LOC': 1},
 ('Paul',): {'PER': 7},
 ('Stella',): {'PER': 1},
 ('Stingi',): {'PER': 3},
 ('Versace',): {'PER': 1},
 ('Universal',): {'ORG': 2, 'PER': 1},
 ('Evelin',): {'PER': 30},
 ('Alternature',): {'PER': 3},
 ('Pajusaar',): {'PER': 3},
 ('Euroopa',): {'LOC': 920},
 ('EBS',): {'ORG': 4, 'LOC': 1},
 ('Universa',): {'ORG': 1},
 ('Eesti',):

In [18]:
first_words_map

{('Aive',): {'PER': 6},
 ('Mathers',): {'PER': 1},
 ('Slim',): {'PER': 1},
 ('Jimmy',): {'PER': 7},
 ('Kim',): {'PER': 16},
 ('Brittany',): {'PER': 1},
 ('Brian',): {'PER': 9},
 ('Curtis',): {'PER': 1},
 ('LA',): {'ORG': 13},
 ('Wonder',): {'PER': 1},
 ('New',): {'LOC': 25, 'ORG': 91, 'PER': 3},
 ('Trudie',): {'PER': 2},
 ('vihmamets',): {'ORG': 1},
 ('Xingu',): {'PER': 1},
 ('Guy',): {'PER': 4},
 ('Stock', '&', 'Two', 'Smoking'): {'ORG': 1},
 ('Bristoli', 'Old', 'Vic'): {'ORG': 1},
 ('West',): {'PER': 4, 'ORG': 7},
 ('Frances',): {'PER': 4},
 ('Linda',): {'PER': 12, 'ORG': 4},
 ('Sven',): {'PER': 47},
 ('Evelin',): {'PER': 19},
 ('Priit',): {'PER': 169},
 ('Over', 'The', 'Water'): {'ORG': 1},
 ('Glen',): {'PER': 4},
 ('Koit',): {'PER': 85, 'ORG': 2},
 ('Kati',): {'PER': 12},
 ('Kaire-Külli',): {'PER': 1},
 ('AS',): {'ORG': 189},
 ('Jaan',): {'PER': 271, 'ORG': 2},
 ('Rahu',): {'PER': 1},
 ('Aino',): {'PER': 20},
 ('Tartu', 'Maarja'): {'ORG': 2},
 ('Rocca', 'al'): {'LOC': 18},
 ('Tooma

In [19]:
# making first words mapping keys strings to allow convenient dataframe creation
first_words_map = {str(k):v for k,v in first_words_map.items()}

In [20]:
last_word_map

{'Lauriste': {'PER': 2},
 'III': {'PER': 10, 'ORG': 3},
 'Shady': {'PER': 1},
 'Smith': {'PER': 14},
 'Basinger': {'PER': 3},
 'Murphy': {'PER': 1},
 'Grazer': {'PER': 1},
 'Hanson': {'PER': 24},
 'Confidential': {'ORG': 2},
 'Boys': {'PER': 4},
 'York': {'LOC': 21, 'ORG': 84, 'PER': 1},
 'Styler': {'PER': 2},
 'Fond': {'ORG': 11, 'PER': 2, 'LOC': 1},
 'Films': {'PER': 2},
 'Ritchie': {'PER': 1},
 'Barrels': {'ORG': 1},
 'teatrikool': {'ORG': 2},
 'ise': {'PER': 4},
 'Tomelty': {'PER': 1},
 'McCartneyst': {'PER': 1},
 'Haljand': {'PER': 3},
 'Samuel': {'PER': 2},
 'Jeruusalemm': {'PER': 1},
 'Pajusaar': {'PER': 1},
 'Blue': {'ORG': 2, 'PER': 1},
 'Pilvre': {'PER': 5},
 'Toome': {'PER': 18},
 'Murutar': {'PER': 5},
 'Kuldbek': {'PER': 1},
 'Sangar': {'ORG': 1},
 'Kallas': {'PER': 106},
 'kool': {'ORG': 53, 'PER': 1},
 'Mare': {'LOC': 19},
 'Paul': {'PER': 6},
 'Eenmaa': {'PER': 21},
 'maripuu': {'PER': 5},
 'Rahumägi': {'PER': 5},
 'Randpere': {'PER': 1},
 'Gräzin': {'PER': 6},
 'Maripu

creating three dataframes, filling missing values with 0

In [21]:
single_csv = pd.DataFrame.from_dict(singletons_map,orient='index')
single_csv = single_csv.fillna(0)
single_csv

Unnamed: 0,PER,ORG,LOC
Eminem,19.0,0.0,0.0
People,2.0,1.0,0.0
Hailie,2.0,0.0,0.0
Jimmy,7.0,0.0,0.0
Jimmyl,1.0,0.0,0.0
...,...,...,...
RSbeta@xxxxx.ee,0.0,0.0,6.0
MinNike-vs-mimmi-vs-Liisk,0.0,0.0,2.0
LoLL@xxxxx.ee,0.0,0.0,1.0
Syru,0.0,0.0,1.0


In [22]:
last_df = pd.DataFrame.from_dict(last_word_map,orient='index')
last_df = last_df.fillna(0)
last_df

Unnamed: 0,PER,ORG,LOC
Lauriste,2.0,0.0,0.0
III,10.0,3.0,0.0
Shady,1.0,0.0,0.0
Smith,14.0,0.0,0.0
Basinger,3.0,0.0,0.0
...,...,...,...
eiiiii,0.0,0.0,1.0
Syru,0.0,0.0,1.0
nu,0.0,0.0,1.0
HannuLa,0.0,0.0,1.0


In [23]:
first_df = pd.DataFrame.from_dict(first_words_map,orient='index')
first_df

Unnamed: 0,PER,ORG,LOC
"('Aive',)",6.0,,
"('Mathers',)",1.0,,
"('Slim',)",1.0,,
"('Jimmy',)",7.0,,
"('Kim',)",16.0,,
...,...,...,...
"('Eoikjiiiiiiiiiio',)",,,1.0
"('Pioneereml',)",,,1.0
"('hihii',)",,,1.0
"('Freddy-Uni', ',')",,,1.0


Saving results to CSV files

In [28]:
last_df.to_csv('data/last_counts_lemma_5000.csv')

In [29]:
single_csv.to_csv('data/single_counts_lemma_5000.csv')

In [30]:
first_df.to_csv('data/first_counts_lemma_5000.csv')