## JSON Conversion (from case distribution to JSON file)

In [1]:
import pickle
import csv
import json

In [2]:
# Load the pickled corpus. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code, so only load a trusted
# rnc-data.pickle.
with open("rnc-data.pickle", "rb") as pickle_file:
    rnc = pickle.load(pickle_file)
rnc.keys()

dict_keys(['blogs_2013', 'fiction', 'public', 'science', 'speech'])

In [4]:
# load inflections csv
# newline='' lets the csv module do its own newline handling (as its docs
# require), and the explicit UTF-8 encoding keeps the Cyrillic columns from
# depending on the platform's default locale encoding.
with open('inflection.csv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile, delimiter="\t")
    # one dict per tab-separated row, keyed by the header line
    inflections = list(reader)
# load lemmas csv
with open('lemma.csv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile, delimiter="\t")
    lemmas = list(reader)

In [74]:
# map each lemma string to its id from lemma.csv
# (if a lemma appears on several rows, the last row wins)
lemma_to_id = {entry['lemma']: entry['id'] for entry in lemmas}

# find case distribution of a word
def get_occurrences(word):
    """Collect every corpus token that is an inflected form of ``word``.

    ``word`` is a lemma; its id is looked up in ``lemma_to_id`` and all rows
    of ``inflections`` with that ``lemma_id`` supply the surface forms to
    search for. Every annotated token in ``rnc`` whose form is one of those
    surface forms is reported. Matching is on the surface form only; the
    token's own lemma annotation is not consulted.

    Returns a list of ``(form, grammar)`` tuples in corpus iteration order,
    one per matching token (duplicates are kept).

    Raises ``KeyError`` if ``word`` is not a key of ``lemma_to_id``.
    """
    lemma_id = lemma_to_id[word]
    # A set keeps the per-token membership test below O(1); the corpus loop
    # runs once per token, so a list here would be scanned for every token.
    particular_forms = {
        row['form'] for row in inflections if row['lemma_id'] == lemma_id
    }

    instances = []
    for folder, files in rnc.items():
        for file, record in files.items():
            # This one file is skipped before its contents are touched.
            # NOTE(review): presumably its entry is malformed -- confirm.
            if folder == 'public' and file == '434522.xhtml':
                continue
            for grammarlist in record['contents']['annotatedlist']:
                for form, lemma, grammar in grammarlist:
                    if form in particular_forms:
                        instances.append((form, grammar))
    return instances

In [75]:
import math

# breakdown by case
def get_JSON(word, prop=True, log=False):
    """Build the radar-chart JSON structure (a list of dicts) for ``word``.

    The occurrences returned by ``get_occurrences(word)`` are tallied per
    case and per number. Only occurrences whose grammar contains ``'A,'`` or
    ``'S,'`` are counted.

    prop=True (default) divides each count by the total for its number
    (singular or plural) so the values are proportions; prop=False leaves
    raw counts. log=True additionally replaces every value ``v`` by
    ``log10(v + 1)``.

    Returns ``[{'className': 'singular', 'axes': [...]},
    {'className': 'plural', 'axes': [...]}]`` where each axes entry is
    ``{'axis': '<case> (<singular form>/<plural form>)', 'value': <number>}``.
    The form shown for a case/number is the last matching one seen, or ''
    if there was none.

    NOTE(review): tags are detected by substring search in the grammar
    value, so any longer tag that merely contains 'sg', 'pl' or a case name
    would be counted as well -- confirm against the corpus tagset.
    """
    cases = ['acc','nom','ins','dat','loc','gen']
    qtys = ['singular','plural']
    # grammar marker -> number label used in the output
    number_tags = (('sg', 'singular'), ('pl', 'plural'))

    # tallies[case][qty] = count, last_form[case][qty] = surface form
    tallies = {case: {qty: 0 for qty in qtys} for case in cases}
    last_form = {case: {qty: '' for qty in qtys} for case in cases}

    for form, grammar in get_occurrences(word):
        if not ('A,' in grammar or 'S,' in grammar):
            continue
        for case in cases:
            if case not in grammar:
                continue
            for tag, qty in number_tags:
                if tag in grammar:
                    tallies[case][qty] += 1
                    last_form[case][qty] = form

    if prop:
        # turn the counts into proportions of the per-number total
        for qty in qtys:
            total = sum(tallies[case][qty] for case in cases)
            # an all-zero column is divided by 1 to avoid ZeroDivisionError
            divisor = total if total != 0 else 1
            for case in cases:
                tallies[case][qty] = tallies[case][qty] / divisor
    if log:
        for case in cases:
            for qty in qtys:
                tallies[case][qty] = math.log10(tallies[case][qty] + 1)

    # assemble the structure consumed by the radar chart
    json_list = []
    for class_name in qtys:
        axes = []
        for case in cases:
            label = case + ' (' + last_form[case]['singular'] + '/' + last_form[case]['plural'] + ')'
            axes.append({'axis': label, 'value': tallies[case][class_name]})
        json_list.append({'className': class_name, 'axes': axes})
    return json_list

In [76]:
# using the google translate api to create file names
from googletrans import Translator
# one shared googletrans client, reused by every translate() call
translator = Translator()


def translate(text):
    """Return the ``.text`` of ``translator.translate(text)``.

    The call uses the translator's default settings; the outputs recorded in
    this notebook show English results for the Russian inputs.
    """
    result = translator.translate(text)
    return result.text

def write_JSON(word,prop=True,log=False):
    """Write the ``get_JSON`` data for ``word`` to a JSON file.

    The file name is the translation of ``word`` (via ``translate``), with a
    ``_log`` suffix when ``log`` is true and a ``.json`` extension. It is
    placed in ``../prop_data/`` when ``prop`` is true and ``../count_data/``
    otherwise; the target directory must already exist. ``prop`` and ``log``
    are passed through to ``get_JSON``.

    Prints the path once the file has been written.
    """
    directory = '../prop_data/' if prop else '../count_data/'
    suffix = '_log' if log else ''
    filepath = directory + translate(word) + suffix + '.json'
    # Build the data before opening the file: opening in 'w' mode truncates
    # it, so a failure inside get_JSON would otherwise leave an empty file
    # behind (or wipe a previously written one).
    payload = get_JSON(word, prop, log)
    with open(filepath, 'w', encoding='utf8') as json_file:
        # ensure_ascii=False keeps non-ASCII forms readable in the output
        json.dump(payload, json_file, ensure_ascii=False)
    print(filepath,'written successfully.')

In [70]:
# for word in ['вилка','ложка','нож']:
#     write_JSON(word,prop=False)

../count_data/fork.json written successfully.
../count_data/the spoon.json written successfully.
../count_data/knife.json written successfully.


In [77]:
# export the raw counts (prop=False) for the lemma 'ложка'
write_JSON('ложка',prop=False)

../count_data/the spoon.json written successfully.
