In [1]:
import pymongo

In [2]:
# Conexión al servidor local de mongodb
conex = pymongo.MongoClient()
conex.list_database_names()

['BDClientes',
 'BDPacientes',
 'BDSuCasa',
 'Provincias',
 'admin',
 'clasesabadoBD',
 'config',
 'local',
 'practica',
 'prueba']

In [3]:
# Conexión a la base de datos e inserción en la colección de documentos
db = conex.practica

In [4]:
db.list_collection_names()

['autores', 'documentos']

In [5]:
db.get_collection('autores').estimated_document_count()

1599879

In [6]:
db.get_collection('documentos').estimated_document_count()

2295473

### C1

Listado de todas las publicaciones de un autor determinado

In [53]:
%%time
c1 = db.get_collection('autores').find({"_id":"A Ozuna"},{"publications":1})

CPU times: user 63 µs, sys: 6 µs, total: 69 µs
Wall time: 73.4 µs


In [10]:
print(list(c1))

[{'_id': 'A Ozuna', 'publications': ['journals/bioinformatics/OzunaLJAN20']}]


### C2

Número de publicaciones de un autor determinado

In [49]:
pipeline_c2 = [{"$match":{"_id":"(Max) Zong-Ming Cheng"}},
               {"$project": {"number_publications":{"$size":"$publications"}}}]

In [52]:
%%time
c2 = db.autores.aggregate(pipeline_c2)

CPU times: user 1.19 ms, sys: 110 µs, total: 1.3 ms
Wall time: 935 µs


In [51]:
print(list(c2))

[{'_id': '(Max) Zong-Ming Cheng', 'number_publications': 2}]


### C3

Número de artículos en revista para el año 2018

In [25]:
pipeline_c3 = [
    {"$match":{"$and":[{"year":"2018"}, {"type":"article"}]}},
    {"$count":"number_articles_2018"}
]

In [26]:
%%time
c3 = db.documentos.aggregate(pipeline_c3)

CPU times: user 0 ns, sys: 3.57 ms, total: 3.57 ms
Wall time: 1.1 s


In [27]:
print(list(c3))

[{'number_articles_2018': 179805}]


### C4

In [73]:
pipeline_c4 = [
    {"$project": {"number_publications":{"$size":"$publications"}}},
    {"$match": {"number_publications":{"$gte":5}}},
    {"$count": "number_authors_with_5_or_more_publications"}
]

In [75]:
%%time
c4 = db.autores.aggregate(pipeline_c4)

CPU times: user 1.76 ms, sys: 174 µs, total: 1.93 ms
Wall time: 2.11 s


In [76]:
print(list(c4))

[{'number_authors_with_5_or_more_publications': 294192}]


Check

In [90]:
pipeline_c4 = [
    {"$project": {"number_publications":{"$size":"$publications"}, "publications":1}},
    {"$match": {"number_publications":{"$gte":5}}},
    {"$sample": {"size": 2}}
]

In [91]:
%%time
c4 = db.autores.aggregate(pipeline_c4)

CPU times: user 1.77 ms, sys: 176 µs, total: 1.94 ms
Wall time: 2.09 s


In [92]:
print(list(c4))

[{'_id': 'Fatemeh Jalali', 'publications': ['journals/pnc/HintonJM15', 'journals/sigmetrics/JalaliAVHAT14', 'journals/sj/JalaliZ20', 'journals/jsac/VishwanathJHAAT15', 'journals/jsac/JalaliHAAT16', 'journals/corr/abs-1808-05283', 'journals/mr/JalaliKE12', 'journals/mr/KhodadoustanJE11', 'journals/jsa/YousefpourFNKJN19'], 'number_publications': 9}, {'_id': 'Yun Shi', 'publications': ['journals/cea/ZhangZZWS19', 'journals/remotesensing/SheZWWS19', 'journals/remotesensing/DuanSSMIS15', 'journals/remotesensing/ZhaoSLHDS19', 'journals/remotesensing/JiZXSD18', 'journals/ivc/ShiJSYWSS15', 'journals/ijis/ShiNRK09', 'journals/isci/ShiRK07', 'journals/www/BiFCZYDS19', 'journals/ior/ShiCYL15', 'journals/access/LiuWGS19', 'journals/access/BiYZSXWZ18', 'journals/jors/CuiLLS17', 'journals/tgrs/YangSWZCZST07', 'journals/symmetry/LiLSWQL19', 'journals/tits/ShaoSZLS14', 'journals/tac/WuGLS19', 'journals/mmor/ShiLC17', 'journals/corr/abs-1709-05529', 'journals/ijgs/ShiGK13', 'journals/eor/CuiGSZ19', 'jo

### C5

In [143]:
pipeline_c5 = [
    {"$match": {"_id": {"$ne":None}}},
    {"$project": {"number_publications":{"$size":"$publications"}, "publications":1}},
    {"$sort": {"number_publications":-1}},
    {"$limit": 10},
    {"$unwind": "$publications"},
    {"$lookup": {"from": "documentos",
                "localField": "publications",
                "foreignField": "_id",
                "as": "publication_info"}},
    {"$match": {"$or":[{"publication_info.type": "article"},
                       {"publication_info.type": "inproceedings"}]}},
    {"$project": {"is_article": {"$cond": [{"$publication_info.type":"article"},1,0]},
                 "is_inpro": {"$cond": [{"$publication_info.type":"inproceedings"},1,0]}}},
    {"$group": "_id",
    "number_articles": {"$sum": "$is_article"},
    "number_inpro": {"$sum": "$is_inpro"}}
]

In [168]:
pipeline_c5 = [
    {"$match": {"_id": {"$ne":None}}},
    {"$project": {"number_publications":{"$size":"$publications"}, "publications":1}},
    {"$sort": {"number_publications":-1}},
    {"$limit": 10},
    {"$unwind": "$publications"},
    {"$lookup": {"from": "documentos",
                "localField": "publications",
                "foreignField": "_id",
                "as": "publication_info"}},

    {"$match": {"$or":[{"publication_info.type": "inproceedings"}]}},
    {"$project": {"type":"$publication_info.type"}},
]


In [169]:
%%time
c5 = db.autores.aggregate(pipeline_c5, allowDiskUse=True)

CPU times: user 2.1 ms, sys: 248 µs, total: 2.34 ms
Wall time: 3.36 s


In [170]:
print(list(c5))

[]
