# Homework 3 - MongoDB

In [1]:
import json
import os
import pymongo
from pymongo import ASCENDING, DESCENDING
import pandas as pd
from datetime import date, datetime, timedelta

## Lettura dei dati

In [2]:
dataset_dir = './dataset/'
dati_covid_path = os.path.join(dataset_dir, 'dpc-covid19-ita-regioni.json')
dati_vaccini_path = os.path.join(dataset_dir, 'somministrazioni-vaccini-latest.json')

dati_covid = []
with open(dati_covid_path, 'r') as j:
    dati_covid = json.load(j)

dati_vaccini = []
with open(dati_vaccini_path, 'r') as j:
    dati_vaccini = json.load(j)
dati_vaccini = dati_vaccini['data']

## Creazione del db
Si crea il database e due collection contenenti rispettivamente dati relativi al Covid-19 e alle vaccinazioni.

In [3]:
client = pymongo.MongoClient('localhost', 27017)

db = client['homework3']

collection_covid = db.dati_covid
collection_vaccini = db.dati_vaccini

print('Available database: ', client.list_database_names())
print('Available collections: ', db.list_collection_names())

Available database:  ['admin', 'config', 'homework3', 'local']
Available collections:  ['dati_covid', 'dati_vaccini']


Si inseriscono i dati nelle rispettive collection (solo se il database è stato appena creato).

In [4]:
# collection_covid.insert_many(dati_covid)
# collection_vaccini.insert_many(dati_vaccini)

In [5]:
print('Available database: ', client.list_database_names())
print('Available collections: ', db.list_collection_names())

Available database:  ['admin', 'config', 'homework3', 'local']
Available collections:  ['dati_covid', 'dati_vaccini']


## Preprocessing
Si convertono i campi relativi alle date dal formato `string` al formato `date`.

In [6]:
collection_covid.aggregate([
    { "$addFields": {
        "data": {
            "$toDate": "$data"
            }
        } 
    },
    { "$out" : "dati_covid" }
])

<pymongo.command_cursor.CommandCursor at 0x187c9d231d0>

In [7]:
collection_vaccini.aggregate([
    { "$addFields": {
        "data_somministrazione": {
            "$toDate": "$data_somministrazione"
            }
        } 
    },
    { "$out" : "dati_vaccini" }
])

<pymongo.command_cursor.CommandCursor at 0x187d444c5f8>

## Query

### Dati Covid

#### Attuali positivi e terapie intensive per regione

In [8]:
days_ago = 10
date = datetime.today() - timedelta(days=days_ago)

start = date.replace(hour=0, minute=0, second=0)
end = date.replace(hour=23, minute=59, second=59)

filter = {
    'data': {
        '$gt': start,
        '$lt': end
    }
}

project = {
    'data': 1, 
    'denominazione_regione': 1, 
    'totale_positivi': 1,
    'terapia_intensiva': 1,
    '_id': 0
}

sort = list({
    'totale_positivi': -1
}.items())

result = collection_covid.find(
    filter=filter,
    projection=project,
    sort=sort
)

result_list = list(result)
# len(result_list)

In [9]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,data,denominazione_regione,terapia_intensiva,totale_positivi
0,2021-05-12 17:00:00,Campania,116,82418
1,2021-05-12 17:00:00,Puglia,161,41549
2,2021-05-12 17:00:00,Lombardia,448,39871
3,2021-05-12 17:00:00,Lazio,244,35575
4,2021-05-12 17:00:00,Emilia-Romagna,178,26531
5,2021-05-12 17:00:00,Sicilia,125,20035
6,2021-05-12 17:00:00,Veneto,110,17635
7,2021-05-12 17:00:00,Toscana,188,15916
8,2021-05-12 17:00:00,Sardegna,38,14952
9,2021-05-12 17:00:00,Calabria,28,12958


#### Nuovi casi e tamponi nell'ultima settimana per regione

In [10]:
weeks_ago = 2
date = datetime.today() - timedelta(weeks=weeks_ago)

result = collection_covid.aggregate([
    { "$match": { "data": { "$gte": date } } },

    { "$group": {
        "_id": "$denominazione_regione",
        "nuovi_casi": { "$sum": "$nuovi_positivi" },
        "tamponi" : { "$sum": "$tamponi"}
    }},
    
    { "$sort" : { "nuovi_casi": -1 } }
])

result_list = list(result)
# len(result_list)

In [11]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,_id,nuovi_casi,tamponi
0,Campania,8055,31998263
1,Lombardia,8035,69543808
2,Lazio,5095,45307050
3,Sicilia,4611,29007050
4,Puglia,4316,16345523
5,Piemonte,4261,29154217
6,Emilia-Romagna,4130,40948173
7,Toscana,3912,30549597
8,Veneto,3107,52265132
9,Calabria,1911,5674956


#### Totale casi e deceduti ad ogni mese

In [12]:
result = collection_covid.aggregate([
    { "$group" : {
        "_id": {
            "anno" : {"$year": "$data"},
            "mese" : {"$month": "$data"},
#             "settimana" : {"$week": "$data"},
            "regione" : "$denominazione_regione"
        },
        "deceduti": { "$max": "$deceduti" },
        "casi": { "$max": "$totale_casi" }
    }},
    
    { "$project": {
        "anno": "$_id.anno",
        "mese": "$_id.mese",
        "regione": "$_id.regione",
        "deceduti": 1,
        "casi": 1,
        "_id": 0
    }},
    
    { "$group" : {
        "_id": {
            "anno" : "$anno",
            "mese" : "$mese",
        },
        "deceduti": { "$sum": "$deceduti" },
        "casi": { "$sum": "$casi" }
    }},
    
    { "$sort" : {"_id": -1} }
])

result_list = list(result)
# len(result_list)

In [13]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,_id,casi,deceduti
0,"{'anno': 2021, 'mese': 5}",4146722,123927
1,"{'anno': 2021, 'mese': 4}",4022653,120807
2,"{'anno': 2021, 'mese': 3}",3583444,109346
3,"{'anno': 2021, 'mese': 2}",2925265,97699
4,"{'anno': 2021, 'mese': 1}",2553032,88516
5,"{'anno': 2020, 'mese': 12}",2107166,74159
6,"{'anno': 2020, 'mese': 11}",1601554,55576
7,"{'anno': 2020, 'mese': 10}",679430,38618
8,"{'anno': 2020, 'mese': 9}",314861,35894
9,"{'anno': 2020, 'mese': 8}",269214,35483


### Dati Vaccini

#### Totale prima e seconda dose per regione

In [14]:
result = collection_vaccini.aggregate([
    { "$group": {
        "_id": "$nome_area",
        "prima_dose": { "$sum": "$prima_dose" },
        "seconda_dose" : { "$sum": "$seconda_dose"}
    }},
    
    { "$sort" : { "prima_dose": -1 } }
])

result_list = list(result)
# len(result_list)

In [15]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,_id,prima_dose,seconda_dose
0,Lombardia,3333093,1254243
1,Lazio,1741912,801151
2,Campania,1735583,657414
3,Veneto,1601028,644776
4,Emilia-Romagna,1399348,673899
5,Piemonte,1331135,647416
6,Sicilia,1279452,598271
7,Puglia,1262044,542430
8,Toscana,1067067,567157
9,Calabria,521179,229618


#### Totale prima e seconda dose per fascia anagrafica

In [15]:
result = collection_vaccini.aggregate([
    { "$group": {
        "_id": "$fascia_anagrafica",
        "prima_dose": { "$sum": "$prima_dose" },
        "seconda_dose" : { "$sum": "$seconda_dose"}
    }},
    
    { "$sort" : { "prima_dose": -1 } }
])

result_list = list(result)
# len(result_list)

In [17]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,_id,prima_dose,seconda_dose
0,70-79,4553747,1196929
1,60-69,4115406,904939
2,80-89,3266208,2873577
3,50-59,2555592,940924
4,40-49,1460247,679053
5,30-39,970612,486228
6,90+,743029,616487
7,20-29,700466,353475
8,16-19,50862,17421


#### Totale dosi somministrate per categoria

In [16]:
result = collection_vaccini.aggregate([
    { "$group": {
        "_id": "null",
        "personale_scolastico": { "$sum": "$categoria_personale_scolastico" },
        "soggetti_fragili": { "$sum": "$categoria_soggetti_fragili" },
        "operatori_sanitari_sociosanitari": { "$sum": "$categoria_operatori_sanitari_sociosanitari" },
        "personale_non_sanitario": { "$sum": "$categoria_personale_non_sanitario" },
        "60_69": { "$sum": "$categoria_60_69" },
        "over80": { "$sum": "$categoria_over80" },
        "altro": { "$sum": "$categoria_altro" },
        "ospiti_rsa": { "$sum": "$categoria_ospiti_rsa" },
        "70_79": { "$sum": "$categoria_70_79" },
        "forze_armate": { "$sum": "$categoria_forze_armate" }
    }},
    
    { "$project": {
        "_id": 0
    }}
])

result_list = list(result)
# len(result_list)

In [17]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,60_69,70_79,altro,forze_armate,operatori_sanitari_sociosanitari,ospiti_rsa,over80,personale_non_sanitario,personale_scolastico,soggetti_fragili
0,2642444,4089924,1017063,400998,3367525,682137,6763142,944150,1336497,5241322


#### Fornitore massimo per fascia anagrafica

In [18]:
result = collection_vaccini.aggregate([
    { "$group" : {
        "_id": {
            "fascia_anagrafica": "$fascia_anagrafica",
            "fornitore": "$fornitore"
        },
        "prima_dose": { "$sum": "$prima_dose" },
        "seconda_dose" : { "$sum": "$seconda_dose"}
    }},
    
    { "$addFields": {
        "totale_dosi": { "$sum": ["$prima_dose", "$seconda_dose"]}
    }},
    
    { "$project": {
        "fascia_anagrafica": "$_id.fascia_anagrafica",
        "fornitore": "$_id.fornitore",
        "totale_dosi": 1,
        "_id": 0
    }},
    
    { "$sort" : {
        "fascia_anagrafica": -1, 
        "totale_dosi": -1
    }},
    
    { "$group" : {
        "_id": "$fascia_anagrafica",
        "totale_dosi": {"$first": "$totale_dosi"},
        "fornitore": {"$first": "$fornitore" }
    }}, 
    
    { "$sort" : {
        "_id": -1, 
    }},
])

result_list = list(result)
# len(result_list)

In [19]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,_id,fornitore,totale_dosi
0,90+,Pfizer/BioNTech,1137305
1,80-89,Pfizer/BioNTech,5437376
2,70-79,Pfizer/BioNTech,3109782
3,60-69,Pfizer/BioNTech,2927722
4,50-59,Pfizer/BioNTech,2509766
5,40-49,Pfizer/BioNTech,1450754
6,30-39,Pfizer/BioNTech,1024311
7,20-29,Pfizer/BioNTech,784369
8,16-19,Pfizer/BioNTech,59554


#### Fascia anagrafica più vaccinata per ogni mese

In [20]:
result = collection_vaccini.aggregate([
    { "$group" : {
        "_id": {
            "anno" : {"$year": "$data_somministrazione"},
            "mese" : {"$month": "$data_somministrazione"},
            "fascia_anagrafica" : "$fascia_anagrafica"
        },
        "prima_dose": { "$sum": "$prima_dose" },
        "seconda_dose" : { "$sum": "$seconda_dose"}
    }},
    
    { "$addFields": {
        "totale_dosi": { "$sum": ["$prima_dose", "$seconda_dose"]}
    }},    
        
    { "$project": {
        "anno": "$_id.anno",
        "mese": "$_id.mese",
        "fascia_anagrafica": "$_id.fascia_anagrafica",
        "totale_dosi": 1,
        "_id": 0
    }},
    
    { "$sort" : {
        "anno": -1,
        "mese": 1,        
        "totale_dosi": -1
    }},
    
    { "$group" : {
        "_id": {
            "anno": "$anno",
            "mese": "$mese"
        },
        "totale_dosi": {"$first": "$totale_dosi"},
        "fascia_anagrafica": {"$first": "$fascia_anagrafica" }
    }}, 
    
    { "$sort" : {
        "_id": -1, 
    }},
    
#     { "$sort" : {"_id.anno": -1, "_id.mese": -1} }
])

result_list = list(result)
# len(result_list)

In [21]:
df = pd.DataFrame(result_list)
df

Unnamed: 0,_id,fascia_anagrafica,totale_dosi
0,"{'anno': 2021, 'mese': 5}",60-69,2144753
1,"{'anno': 2021, 'mese': 4}",70-79,3409215
2,"{'anno': 2021, 'mese': 3}",80-89,2422442
3,"{'anno': 2021, 'mese': 2}",80-89,613353
4,"{'anno': 2021, 'mese': 1}",50-59,523036
5,"{'anno': 2020, 'mese': 12}",50-59,11265


## Indexes

In [22]:
print(list(collection_covid.list_indexes()))

[SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')])]


### Query without index

In [37]:
days_ago = 10
date = datetime.today() - timedelta(days=days_ago)

start = date.replace(hour=0, minute=0, second=0)
end = date.replace(hour=23, minute=59, second=59)

filter = {
    'data': {
        '$gt': start,
        '$lt': end
    }
}

project = {
    'data': 1, 
    'denominazione_regione': 1, 
    'totale_positivi': 1,
    'terapia_intensiva': 1,
    '_id': 0
}

sort = list({
    'totale_positivi': -1
}.items())

result = collection_covid.find(
    filter=filter,
    projection=project,
    sort=sort
)

explain_no_ind = result.explain()
explain_no_ind = explain_no_ind['executionStats']

In [44]:
print('executionTimeMillis:', explain_no_ind['executionTimeMillis'])
print('nReturned:', explain_no_ind['nReturned'])
print('totalDocsExamined:', explain_no_ind['totalDocsExamined'])

executionTimeMillis: 16
nReturned: 21
totalDocsExamined: 9366


### Query with index

In [39]:
collection_covid.create_index([("data", ASCENDING)], 
                              name="date_index",
                              background=True,
                              unique=False,
                              sparse=False)

'date_index'

In [40]:
print(list(collection_covid.list_indexes()))

[SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')]), SON([('v', 2), ('key', SON([('data', 1)])), ('name', 'date_index'), ('background', True), ('sparse', False)])]


In [41]:
days_ago = 10
date = datetime.today() - timedelta(days=days_ago)

start = date.replace(hour=0, minute=0, second=0)
end = date.replace(hour=23, minute=59, second=59)

filter = {
    'data': {
        '$gt': start,
        '$lt': end
    }
}

project = {
    'data': 1, 
    'denominazione_regione': 1, 
    'totale_positivi': 1,
    'terapia_intensiva': 1,
    '_id': 0
}

sort = list({
    'totale_positivi': -1
}.items())

result = collection_covid.find(
    filter=filter,
    projection=project,
    sort=sort
)

explain_ind = result.explain()
explain_ind = explain_ind['executionStats']

In [43]:
print('executionTimeMillis:', explain_ind['executionTimeMillis'])
print('nReturned:', explain_ind['nReturned'])
print('totalDocsExamined:', explain_ind['totalDocsExamined'])

executionTimeMillis: 9
nReturned: 21
totalDocsExamined: 21
