# Build Distribution Data

Build CSV file for data analysis for the Chilean public consultation.

## Source data

- Located in folder `./proc/`
    + `main_similarity_scores_0.7.csv` This file contains all topic-segment pairs with a semantic similarity at or above 0.7. Segments are either constitution sections or responses from the three administrative units that organised the consultation - Regional, Provincial, and ELA. 


### Secondary data sources

- Located in folder `./data/`
    + `ccp_institution_topics.csv` IDs of topics used in the analysis of Institution topics.
    + `ccp_rights_topics.csv` IDs of topics used in the analysis of Rights topics.
    + `ccp_ontology.csv` CCP topic data
    + `municipality_types.csv` maps municipality names onto urban, rural, and mixed types.

## Generated files

- Located in `./outputs/` folder
- `distribution_data.csv`
- `municipality_data.csv`

In [1]:

__author__      = 'Roy Gardner'
__copyright__   = 'Copyright 2023, Roy and Sally Gardner'

import csv
from datetime import datetime, timedelta

import numpy as np
import os
import time


In [2]:
# Load the source data used by the rest of the notebook.
#
# All files are read as UTF-8 with errors='ignore', so undecodable bytes are
# silently dropped rather than raising. The `with` statement closes each file,
# so no explicit close() is needed.

# Data file containing topic-segment pairs at or above a similarity threshold of 0.7.
# The first row is the header; every remaining row (including blank ones) is kept
# as-is because downstream cells skip empty rows themselves.
with open('./proc/main_similarity_scores_0.7.csv','r',encoding='utf-8',errors='ignore') as f:
    reader = csv.reader(f)
    export_header = next(reader)
    export_data = list(reader)

# List of rights topics (topic ID is the first column).
# Blank rows are skipped, and IDs are stripped so they compare equal to the
# stripped topic IDs used as lookup keys elsewhere in the notebook.
with open('./data/ccp_rights_topics.csv','r',encoding='utf-8',errors='ignore') as f:
    rights_topics = [row[0].strip() for row in csv.reader(f) if row]

# List of institution topics (same layout and handling as the rights topics).
with open('./data/ccp_institution_topics.csv','r',encoding='utf-8',errors='ignore') as f:
    institution_topics = [row[0].strip() for row in csv.reader(f) if row]

# Topic data: maps stripped topic ID (column 0) -> (column 1, column 2).
# Downstream cells read element 0 as the categories field and element 1 as the label.
topics_dict = {}
with open('./data/ccp_ontology.csv','r',encoding='utf-8',errors='ignore') as f:
    reader = csv.reader(f)
    _ = next(reader)  # Discard header row
    for row in reader:
        if len(row) > 0:
            topics_dict[row[0].strip()] = (row[1],row[2])

# Municipality type dict: maps municipality name (column 1) -> type (column 2).
municipality_type_dict = {}
with open('./data/municipality_types.csv','r',encoding='utf-8',errors='ignore') as f:
    reader = csv.reader(f)
    _ = next(reader)  # Discard header row
    for row in reader:
        # Skip blank rows, which csv.reader yields as empty lists
        if not row:
            continue
        municipality_type_dict[row[1].strip()] = row[2].strip()


## Build main distribution data

In [3]:
# Collect, per topic, the number of above-threshold segments from each source,
# then scale each count by its column total.

# Resolve column positions once instead of searching the header on every row.
topic_id_col = export_header.index('Topic ID')

# Count key -> index of the score column that signals a match for that source.
# The insertion order fixes the key order of every per-topic dict, which the
# serialisation cell relies on for its column order.
score_cols = {
    'constitution': export_header.index('Constitution score'),
    'regional': export_header.index('Regional score'),
    'provincial': export_header.index('Provincial score'),
    'ela': export_header.index('ELA score'),
}
# Only consultation responses (not constitution sections) feed the aggregated count.
response_keys = ('regional','provincial','ela')
count_keys = tuple(score_cols) + ('aggregated',)

topic_counts_dict = {}
for row in export_data:
    if len(row) == 0:
        continue
    topic_id = row[topic_id_col].strip()
    if topic_id not in topic_counts_dict:
        topic_counts_dict[topic_id] = dict.fromkeys(count_keys, 0)
    # Some topics have no above-threshold sections or responses and have row length = 4.
    # Such rows only register the topic; they carry no score columns to count.
    if len(row) <= 4:
        continue
    counts = topic_counts_dict[topic_id]
    for count_key,score_col in score_cols.items():
        # A non-empty score cell means this row holds a segment from that source
        if len(row[score_col]) > 0:
            counts[count_key] += 1
            if count_key in response_keys:
                counts['aggregated'] += 1

# Column totals
totals_dict = {}
for count_key in count_keys:
    totals_dict[count_key] = sum([v[count_key] for v in topic_counts_dict.values()])

# topic ID -> source key -> (count, count/total, percentage rounded to 2 d.p.)
distributions_dict = {}
for k,c_dict in topic_counts_dict.items():
    distributions_dict[k] = {}
    for data_id,v in c_dict.items():
        total = totals_dict[data_id]
        # A source with no matches at all has a zero total; report 0.0 instead of
        # failing with ZeroDivisionError.
        scaled = v/total if total else 0.0
        distributions_dict[k][data_id] = (v,scaled,round(scaled*100,2))

### Serialize data

In [4]:
# Serialise the topic distributions to ./outputs/distribution_data.csv.
# One row per topic: identifying columns followed by a (count, scaled, percent)
# triple for each data source.

header = ['Topic ID','Topic label','Topic categories','Rights topic','Institution topic']
for column_label in ('Constitution sections','Regional responses','Provincial responses',\
                     'ELA responses','Aggregated responses'):
    header.extend(['# ' + column_label,'Scaled ' + column_label,'% ' + column_label])

csv_row_list = []
csv_row_list.append(header)

# Sets give constant-time membership tests inside the loop
rights_topic_set = set(rights_topics)
institution_topic_set = set(institution_topics)

for topic_id,distribution_data in distributions_dict.items():

    # topics_dict values are (categories field, label)
    categories_field,topic_label = topics_dict[topic_id]

    # The categories field holds newline-separated entries; keep the part of each
    # entry before the first '/' and drop duplicates. Sorting makes the output
    # reproducible, since set iteration order for strings varies between runs.
    topic_categories = set([cat.split('/')[0] for cat in categories_field.split('\n') if len(cat) > 0])
    cat_str = ','.join(sorted(topic_categories))

    csv_row = [topic_id,topic_label,cat_str]
    # 1/0 flags marking membership of the rights and institution topic lists
    csv_row.append(1 if topic_id in rights_topic_set else 0)
    csv_row.append(1 if topic_id in institution_topic_set else 0)

    # Each value is a (count, scaled, percent) triple, in header order
    for v in distribution_data.values():
        csv_row.extend([v[0],v[1],v[2]])

    csv_row_list.append(csv_row)

file_name = 'distribution_data.csv'
# newline='' stops the csv writer's '\r\n' row terminator being translated again
# on platforms that rewrite '\n' (which would produce blank rows); the explicit
# encoding matches the UTF-8 input files. The with statement closes the file.
with open('./outputs/' + file_name, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(csv_row_list)
    


## Build municipality (municipal) distribution data

In [5]:
# Collect, per topic, the number of Provincial and ELA responses broken down by
# the type of the municipality they came from, then scale by column totals.

# Resolve column positions once instead of searching the header on every row.
topic_id_col = export_header.index('Topic ID')
# (count key prefix, municipality name column, score column) for each response source.
# NOTE(review): the original comment here said "Note typo in headers"; presumably it
# refers to the column names in the source file - confirm against the CSV.
municipality_sources = (
    ('provincial_',export_header.index('Provincial response comuna'),export_header.index('Provincial score')),
    ('ela_',export_header.index('ELA response comuna'),export_header.index('ELA score')),
)
# Key order fixes the column order used when the data is serialised
municipality_count_keys = ('provincial_urban','provincial_rural','provincial_mixed',\
                           'ela_urban','ela_rural','ela_mixed')

municipality_counts_dict = {}

for row in export_data:
    if len(row) == 0:
        continue
    topic_id = row[topic_id_col].strip()

    if not topic_id in municipality_counts_dict:
        municipality_counts_dict[topic_id] = dict.fromkeys(municipality_count_keys, 0)

    # Some topics have no above-threshold sections or responses and have row length = 4
    if len(row) <= 4:
        continue

    for key_prefix,municipality_col,score_col in municipality_sources:
        municipality = row[municipality_col].strip()
        # Count only rows that name a municipality with a known type and carry a score
        if len(municipality) > 0 and municipality in municipality_type_dict \
                and len(row[score_col]) > 0:
            # A type other than urban/rural/mixed raises KeyError here rather
            # than being dropped silently.
            key = key_prefix + municipality_type_dict[municipality].lower()
            municipality_counts_dict[topic_id][key] += 1

# Column totals
municipality_totals_dict = {}
for count_key in municipality_count_keys:
    municipality_totals_dict[count_key] = sum([v[count_key] for v in municipality_counts_dict.values()])

# topic ID -> count key -> (count, count/total, percentage rounded to 2 d.p.)
municipality_distributions_dict = {}
for k,c_dict in municipality_counts_dict.items():
    municipality_distributions_dict[k] = {}
    for data_id,v in c_dict.items():
        total = municipality_totals_dict[data_id]
        # A source/type combination with no responses has a zero total; report
        # 0.0 instead of failing with ZeroDivisionError.
        scaled = v/total if total else 0.0
        municipality_distributions_dict[k][data_id] = (v,scaled,round(scaled*100,2))


### Serialise as CSV

In [6]:
# Serialise the municipality distributions to ./outputs/municipality_data.csv.
# One row per topic: identifying columns followed by a (count, scaled, percent)
# triple for each source/municipality-type combination.

header = ['Topic ID','Topic label','Topic categories','Rights topic','Institution topic']
for column_label in ('Provincial Urban','Provincial Rural','Provincial Mixed',\
                     'ELA Urban','ELA Rural','ELA Mixed'):
    header.extend(['# ' + column_label,'Scaled ' + column_label,'% ' + column_label])

csv_row_list = []
csv_row_list.append(header)

# Sets give constant-time membership tests inside the loop
rights_topic_set = set(rights_topics)
institution_topic_set = set(institution_topics)

for topic_id,distribution_data in municipality_distributions_dict.items():

    # topics_dict values are (categories field, label)
    categories_field,topic_label = topics_dict[topic_id]

    # The categories field holds newline-separated entries; keep the part of each
    # entry before the first '/' and drop duplicates. Sorting makes the output
    # reproducible, since set iteration order for strings varies between runs.
    topic_categories = set([cat.split('/')[0] for cat in categories_field.split('\n') if len(cat) > 0])
    cat_str = ','.join(sorted(topic_categories))

    csv_row = [topic_id,topic_label,cat_str]
    # 1/0 flags marking membership of the rights and institution topic lists
    csv_row.append(1 if topic_id in rights_topic_set else 0)
    csv_row.append(1 if topic_id in institution_topic_set else 0)

    # Each value is a (count, scaled, percent) triple, in header order
    for v in distribution_data.values():
        csv_row.extend([v[0],v[1],v[2]])

    csv_row_list.append(csv_row)

file_name = 'municipality_data.csv'
# newline='' stops the csv writer's '\r\n' row terminator being translated again
# on platforms that rewrite '\n' (which would produce blank rows); the explicit
# encoding matches the UTF-8 input files. The with statement closes the file.
with open('./outputs/' + file_name, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(csv_row_list)
    
