In [None]:
import json

# Read the exported list of event/company records from disk.
jsonPath = "project_folder/breradesignweek_2019_events_20200122.json"
with open(jsonPath, mode='r', encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)

# Partition the records by the value of their 'pacchetto_comunicazione' field;
# records carrying any other value end up in none of the three lists.
baseCompanies, plusCompanies, sponsorCompanies = (
    [entry for entry in data if entry['pacchetto_comunicazione'] == package]
    for package in ('base', 'plus', 'sponsor')
)

# Number of 'espositori' entries listed by each "base" company.
espositori = [len(cc['espositori']) for cc in baseCompanies]

# Three disjoint buckets that together cover every company: >10, 6-10, <=5.
# The middle bucket is inclusive of 10 so that a company with exactly
# 10 espositori is counted (it matches the "6/10" label printed below).
more_10 = sum(1 for x in espositori if x > 10)
more_05 = sum(1 for x in espositori if 5 < x <= 10)
less_03 = sum(1 for x in espositori if x <= 5)

print('Companies: ' , len(espositori))
print('Companies with more than 10 espositori: ', more_10)
print('Companies with 6/10 espositori: ', more_05)
print('Companies with 5 espositori or less: ', less_03)

# Number of 'mini-eventi' entries listed by each "base" company.
miniEventi = [len(cc['mini-eventi']) for cc in baseCompanies]

# Buckets are defined to match the labels printed below and to partition
# every company exactly once: >10, 6-10, <=5.
more_10 = sum(1 for x in miniEventi if x > 10)
more_05 = sum(1 for x in miniEventi if 5 < x <= 10)
less_03 = sum(1 for x in miniEventi if x <= 5)

# Companies with more than 3 mini-eventi; computed but not printed here.
more_03 = sum(1 for x in miniEventi if x > 3)

print('Companies: ' , len(miniEventi))
print('Companies with more than 10 miniEventi: ', more_10)
print('Companies with 6/10 miniEventi: ', more_05)
print('Companies with 5 miniEventi or less: ', less_03)

# Collect the distinct 'tipo_attivita' values found in the mini-eventi of
# every company (base, plus and sponsor), in sorted order.
# A set removes duplicates in a single pass instead of three copy-pasted
# loops with linear membership checks; this assumes the values are hashable
# (presumably strings - TODO confirm against the JSON export).
vals = sorted({
    jj['tipo_attivita']
    for group in (baseCompanies, plusCompanies, sponsorCompanies)
    for cc in group
    for jj in cc['mini-eventi']
})
vals


In [None]:
#https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from collections import Counter


def clusterize(km, companies):
    """Group companies whose locations lie within ``km`` of one another.

    Runs DBSCAN with the haversine metric and ``min_samples=1`` over the
    latitude/longitude of every company, so each company is assigned to
    exactly one cluster. The number of clusters found is printed.

    Parameters
    ----------
    km : float
        Neighbourhood radius handed to DBSCAN, expressed in kilometres.
    companies : list of dict
        Each item must provide ``'id'`` and a ``'location'`` dict with the
        keys ``'latitudine'`` and ``'longitudine'``. The coordinates are
        converted with ``np.radians``, i.e. they are treated as degrees.

    Returns
    -------
    list of list
        One list of company ids per cluster. Ids are sorted inside each
        cluster and the clusters themselves are sorted. An empty input
        yields an empty list.
    """
    # DBSCAN cannot be fitted on zero samples; an empty input has no clusters.
    if len(companies) == 0:
        print('Number of clusters: {}'.format(0))
        return []

    coords = pd.DataFrame([{
        'lat': cc['location']['latitudine'],
        'lon': cc['location']['longitudine']
    } for cc in companies]).to_numpy()

    # Convert the radius from km to radians (mean Earth radius in km),
    # because the haversine metric works on angles.
    kms_per_radian = 6371.0088
    epsilon = km / kms_per_radian

    # Find locations within the km radius. With min_samples=1 every point is
    # a core point, so no company is left unassigned.
    db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_

    num_clusters = len(set(cluster_labels))
    print('Number of clusters: {}'.format(num_clusters))

    # labels_ is aligned with the input order, so ids can be grouped by label
    # directly instead of re-matching coordinates against every company.
    idsByLabel = {}
    for clusterLabel, cc in zip(cluster_labels, companies):
        idsByLabel.setdefault(clusterLabel, []).append(cc['id'])

    # Sort the ids in each cluster, then sort the clusters themselves.
    clusters = [sorted(clusterIds) for clusterIds in idsByLabel.values()]
    clusters.sort()
    return clusters

    
# Reload the full company list from the exported events JSON.
jsonPath = "project_folder/breradesignweek_2019_events_20200122.json"
with open(jsonPath, mode='r', encoding='utf-8') as jsonFile:
    companies = data = json.load(jsonFile)

# Cluster the companies using a 0.04 km (40 m) neighbourhood radius.
clusters = clusterize(km=0.04, companies=data)

In [None]:
# Build one label row per company (ordered by id) and save them as a
# tab-separated file.
sortedCompanies = sorted(companies, key=lambda d: d['id'])

# Index every multi-member cluster by its first id, so each company needs a
# single lookup instead of a scan over all clusters. If two clusters shared
# the same first id the later one wins, as in a sequential scan.
clusterByFirstId = {
    eachCluster[0]: eachCluster
    for eachCluster in clusters
    if len(eachCluster) > 1
}

labels = []
for eachCompany in sortedCompanies:
    cluster = clusterByFirstId.get(eachCompany['id'])
    if cluster is None:
        # Company is alone in its cluster, or is not the first id of its
        # cluster: no links. The literal string 'None' is written to the CSV.
        links = 'None'
        stackWdt = 'None'
    else:
        # The first id of a cluster lists the remaining ids, space-separated.
        links = ' '.join(str(memberId) for memberId in cluster[1:])
        stackWdt = 2

    label = {
        'id' : eachCompany['id'],
        'position': 'btmRgt',
        'offsetX': 0,
        'offsetY': 0,
        'link': links,
        'stackWdt': stackWdt
    }
    labels.append(label)

df = pd.DataFrame(labels)
df.to_csv('labels.csv', sep='\t', index=False)
df


In [4]:
'''
saving csv with fake ids and original company id
'''

import importlib
import file_IO
importlib.reload(file_IO)
from file_IO import loadCompanies
import pandas as pd
JSON_PATH = "project_folder/breradesignweek_2019_events_20200122.json"

# loadCompanies is provided by the project's file_IO module; here it is
# unpacked into the full list plus the base / plus / sponsor groups.
allCompanies, baseCompanies, plusCompanies, sponsorCompanies = loadCompanies(JSON_PATH)

# First fake id of each group when the groups are numbered consecutively
# from 1 in the order sponsor, plus, base.
sponsorStartIndex = 1
plusStartIndex = sponsorStartIndex + len(sponsorCompanies)
baseStartIndex = plusStartIndex + len(plusCompanies)

# Re-order the companies so that sponsors come first, then plus, then base.
allCompanies = sponsorCompanies + plusCompanies + baseCompanies

# Pair every original company id with its 1-based position in that ordering.
fakeIDs = [
    {
        'ID': company['id'],
        'fakeID' : position
    }
    for position, company in enumerate(allCompanies, start=1)
]

pd.DataFrame(fakeIDs).to_csv('fakeIDs.csv', sep='\t', index=False)