<small><i>This notebook was put together by [Roman Prokofyev](http://prokofyev.ch)@[eXascale Infolab](http://exascale.info/). Source and license info is on [GitHub](https://github.com/dragoon/kilogram/).</i></small>

# Prerequisites

* Pandas: ``pip install pandas``
* Matplotlib and mpltools (provides the ``ggplot`` style used below): ``pip install matplotlib mpltools``

In [1]:
import matplotlib.pyplot as plt
from mpltools import style
import numpy as np
style.use('ggplot')
%matplotlib inline
import pandas as pd
import shelve
from collections import defaultdict

# Construct original counts file

In [2]:
# Build one record per (uri, label) pair from the predicted ("infer") counts,
# then overlay the organic ("organ") counts for pairs that also occur there.
# Each input line has the form "<uri>\t<label>\t<normal_count>,<lower_count>".
count_dict = {}
with open('../mapreduce/predicted_label_counts.txt') as predicted_file:
    for line in predicted_file:
        uri, label, values = line.split('\t')
        # The last field still carries the trailing newline; int() tolerates it.
        upper_count, lower_count = values.split(',')
        count_dict[(uri, label)] = {
            'infer_normal': int(upper_count),
            'infer_lower': int(lower_count),
            # NOTE(review): this counts '_'-separated tokens, but the labels shown in
            # the output below contain spaces and all report len == 1 -- confirm that
            # '_' is the intended separator.
            'len': len(label.split('_')),
            'label': label,
            # Organic counts default to 0 until the second pass fills them in.
            'organ_normal': 0,
            'organ_lower': 0,
            'uri': uri,
        }
with open('../mapreduce/organic_label_counts.txt') as organic_file:
    for line in organic_file:
        uri, label, values = line.split('\t')
        # Pairs that appear only in the organic file are deliberately ignored.
        if (uri, label) in count_dict:
            upper_count, lower_count = values.split(',')
            count_dict[(uri, label)].update({'organ_normal': int(upper_count), 'organ_lower': int(lower_count)})
# list() hands the constructor a plain sequence of records on both Python 2 and 3.
counts_df = pd.DataFrame(list(count_dict.values()))
# The dictionary is no longer needed once the frame exists; drop it to free memory.
del count_dict
counts_df.head()

Unnamed: 0,infer_lower,infer_normal,label,len,organ_lower,organ_normal,uri
0,0,12,Toni Negri,1,0,13,Antonio_Negri
1,0,38,Mike Groff,1,0,40,Mike_Groff
2,0,19,Zigzag River,1,0,7,Zigzag_River
3,0,1,St Francis Rangers,1,0,1,St_Francis_Rangers_F.C.
4,0,6,Semiha Yankı,1,0,5,Semiha_Yankı


# Generate the list of unambiguous labels (includes)

In [22]:
from __future__ import division
# Write the (label, uri) pairs that are considered unambiguous, one
# tab-separated pair per line, to ../mapreduce/unambiguous_labels.txt.
#
# We never exclude uppercase labels since we don't match at the beginning of a sentence.
with open('../mapreduce/unambiguous_labels.txt', 'w') as includes:
    for _, row in counts_df.iterrows():
        label = row['label']
        uri = row['uri']

        # All-uppercase labels are always included.
        if label.isupper():
            includes.write(label+'\t'+uri+'\n')
            continue

        # No organic normal-case occurrences: the label is treated as lowercase
        # and is kept only if it was seen organically more than once.
        if row['organ_normal'] == 0:
            if row['organ_lower'] > 1:
                includes.write(label+'\t'+uri+'\n')
            continue

        # Normal-to-lowercase ratios; a zero denominator is replaced by 1.
        infer_ratio = row['infer_normal']/(row['infer_lower'] or 1)
        orig_ratio = row['organ_normal']/(row['organ_lower'] or 1)
        if infer_ratio == 0:
            # weird label, p. ex. 中华人民共和国
            continue

        # always write a normal-case label
        includes.write(label+'\t'+uri+'\n')
        # Also emit the lowercased form when the organic ratio is less than twice
        # the inferred ratio and the label was inferred in lowercase at least once.
        if orig_ratio/infer_ratio < 2 and row['infer_lower'] > 0:
            includes.write(label.lower()+'\t'+uri+'\n')

# Generate typed n-grams
    
    hdfs dfs -cat /user/roman/wikipedia_ngrams/* | python spark_typed_ngrams_from_plain.py > typed_ngrams.txt
    hdfs dfs -put typed_ngrams.txt /user/roman/wikipedia_typed_ngrams/
    
### HBase-suitable format:

    ./run_job.py -m ./type_prediction/mapper.py -r ./type_prediction/reducer.py "/user/roman/wikipedia_typed_ngrams" /user/roman/hbase_wikipedia_typed_ngrams
    
### Put into HBase:
    
    pig -p table=typogram -p path=/user/roman/hbase_wikipedia_typed_ngrams ../extra/hbase_upload_array.pig

In [3]:
# Show every label row whose uri is 'Cicada'.
counts_df.loc[counts_df['uri'] == 'Cicada']

Unnamed: 0,infer_lower,infer_normal,label,len,organ_lower,organ_normal,uri
253275,1,0,chicharras,1,1,0,Cicada
506026,0,24,Cicadidae,1,1,19,Cicada
972905,325,0,cicada,1,118,0,Cicada
1190857,401,28,Cicadas,1,163,6,Cicada
2219444,0,2,Cicadoidea,1,0,2,Cicada


In [19]:
# Rows with a positive organic normal-case count that were inferred at least
# once in lowercase and never in normal case.
counts_df.loc[
    counts_df['organ_normal'].gt(0)
    & counts_df['infer_lower'].gt(0)
    & counts_df['infer_normal'].eq(0)
]

Unnamed: 0,infer_lower,infer_normal,label,len,organ_lower,organ_normal,uri
2333,24,0,12/50,1,0,1,Alvis_12/50
3758,278,0,1977-1988,1,0,1,Operation_Fair_Play
8528,5,0,54.40,1,0,1,54-40
9245,4,0,中华人民共和国,1,0,1,China
9371,5,0,£194,1,0,1,Pound_sterling
10012,69,0,1741-1743,1,0,1,Russo-Swedish_War_(1741–43)
12374,53,0,$139,1,0,1,Canadian_dollar
13204,35,0,-900,1,0,2,Airbus_A350_XWB
14697,9,0,£827,1,0,1,Pound_sterling
16496,4,0,#520,1,0,1,PRR_520
