# Importo tutti i dati Netflix da MongoDB Atlas

Prendo i dati da Atlas relativi a tutti i mesi.

In [1]:
import json
import os
from pathlib import Path

import pandas as pd
import pymongo

In [2]:
# Connect to the Atlas cluster.
# The connection string (user, password, host) is read from the MONGODB_URI
# environment variable so that credentials never live in the notebook, e.g.
#   MONGODB_URI="mongodb+srv://<user>:<password>@<cluster-host>/<db>?retryWrites=true&w=majority"
# A missing variable raises KeyError here instead of failing later on the first query.
# NOTE(review): any password that was ever committed in this notebook should be rotated.
client = pymongo.MongoClient(os.environ["MONGODB_URI"])

Prendo il database generale, poi la collezione

In [3]:
# Handle to the 'exam' database on the connected cluster
db = client.get_database('exam')

In [4]:
# List the names of all the collections contained in the db
db.list_collection_names()

['api_netflix_aprile_1',
 'api_netflix_gennaio',
 'api_netflix',
 'api_netflix_marzo',
 'api_netflix_aprile_2',
 'api_netflix_febbraio',
 'api_netflix_aprile_3']

In [5]:
# Handles to the monthly collections, one per month.
dicembre = db.api_netflix            # December (the collection without a month suffix)
gennaio = db.api_netflix_gennaio     # January
febbraio = db.api_netflix_febbraio   # February
marzo = db.api_netflix_marzo         # March

# The list keeps the months in the order December -> March.
# April is not included here: it is added later.
collections = [dicembre, gennaio, febbraio, marzo]

Conto il numero di documenti di cui è fatta ogni collezione 

In [6]:
# Print the estimated number of documents of every collection, one per line
limit = len(collections)

for collezione in collections:
    print(collezione.estimated_document_count())

31
31
28
31


### Ottenere i documenti
Salvo in un cursore il risultato della query che trova tutti i documenti, poi converto il cursore in una lista (la lista dei documenti); infine elaboro la lista per trasformarla in una lista di dizionari (cioè oggetti JSON): per farlo devo eliminare la chiave `_id` (e di conseguenza il suo valore) da ogni documento.

In [7]:
# Number of collections whose documents will be downloaded
n_collezioni = len(collections)

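__data__ sarà una lista con un elemento per ogni collezione caricata (4 in questo notebook, da dicembre a marzo; 7 una volta aggiunte le tre collezioni di aprile): ciascun elemento è una lista di dizionari, cioè i documenti di ogni giorno del mese; ogni elemento della lista corrisponde infatti a una collezione nel database Mongo.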

In [8]:
# Accumulator filled by the next cell: one list of documents per collection
data = []

In [9]:
# Download the documents of every collection and append them to `data`
# (one list of dictionaries per collection, in the same order as `collections`).
# Gotcha: this cell appends to `data`, so re-run the cell that resets
# `data = []` before running it a second time.
for indice_collezione in range(n_collezioni):

    # Query all the documents. The projection {'_id': 0} excludes MongoDB's
    # "_id" field (an ObjectId, which json.dumps cannot serialize), so there
    # is no need to delete it from each document afterwards.
    cursore = collections[indice_collezione].find({}, {'_id': 0})
    documenti = list(cursore)

    # Round-trip through a JSON string: json.dumps raises TypeError if any
    # value is not JSON-serializable, and json.loads gives back a list of
    # plain dictionaries (one per day of the month).
    stringa = json.dumps(documenti)
    data.append(json.loads(stringa))

In [10]:
# Sanity check on what was downloaded: print the number of documents of each
# collection, then a total of 10 rows per document.
# The accumulator is deliberately NOT called `sum`: that name would shadow the
# built-in sum() for every later cell of the notebook.
RIGHE_PER_DOCUMENTO = 10  # presumably the size of one top-ten ranking -- confirm

totale_righe = 0

for documenti_mese in data:
    totale_righe += len(documenti_mese) * RIGHE_PER_DOCUMENTO
    print(len(documenti_mese))

print(totale_righe)

31
31
28
31
1210


Per esempio: __data[0][0]__ si riferisce al primo giorno di dicembre

### Creazione del DataFrame
Ora posso finalmente creare il DataFrame con le classifiche di ogni giorno: prima guardo la struttura del primo documento (il 1° dicembre, cioè data[0][0]), poi ripeto lo stesso procedimento per tutti i giorni di tutti i mesi, cioè per tutti gli elementi di ciascuna lista contenuta in data.

In [11]:
data[0][0].keys()  # the entry of interest is data[0][0]['list']

dict_keys(['set', 'streaming', 'region', 'from', 'to', 'country', 'year', 'type', 'genre', 'company', 'filter', 'query', 'api_key', 'api_limit', 'api_count', 'page', 'results', 'list', '_links'])

Creazione di DataFrame vuoti a cui concatenare i successivi

In [12]:
# Column names of the ranking tables
colonne = ['result', 'id', 'name', 'url', 'imdb', 'tmdb', 'premiere',
           'type_id', 'type', 'country_id', 'country', 'genre_id', 'genre',
           'company_id', 'company', 'key', 'note', 'set', 'region', 'streaming',
           'from', 'to', 'ranking', 'ranking_last', 'value', 'value_last',
           'value_total', 'countries', 'days']

# Full table: an empty DataFrame with one empty column per name
netflix = pd.DataFrame(data = {colonna : [] for colonna in colonne})

# The overall top ten, the movies and the TV series tables all start out
# bound to that same empty DataFrame object
general = movies = tv_series = netflix

In [13]:
# Fill the four tables from the downloaded documents.
# Each document carries its rows under the 'list' key; rows 0-9 go to the
# overall top ten, rows 10-19 to the movies table, rows 20+ to the TV table.
# The daily frames are collected in lists and concatenated ONCE per table:
# calling pd.concat inside the loop re-copies the whole accumulated table at
# every iteration.
# Gotcha: netflix/general/movies/tv_series are both input and output of this
# cell, so re-run the cell that creates the empty frames before re-running it.
n_mesi = len(data)

giorni_netflix = []
giorni_general = []
giorni_movies = []
giorni_tv_series = []

for mese in data:

    # Repeat for every document (= day) of the collection
    for giorno in mese:

        # Partial DataFrame with all the rows of this day
        df = pd.DataFrame(data = giorno['list'])

        giorni_netflix.append(df)               # every row
        giorni_general.append(df.iloc[:10])     # overall top ten
        giorni_movies.append(df.iloc[10:20])    # movies top ten
        giorni_tv_series.append(df.iloc[20:])   # TV series top ten

# The existing (seed) frame stays first in each concat so that column order
# and resulting dtypes are the same as when appending day by day.
netflix = pd.concat([netflix, *giorni_netflix], ignore_index = True)
general = pd.concat([general, *giorni_general], ignore_index = True)
movies = pd.concat([movies, *giorni_movies], ignore_index = True)
tv_series = pd.concat([tv_series, *giorni_tv_series], ignore_index = True)

In [24]:
# Preview the first 3 rows of the full table
netflix.head(3)

Unnamed: 0,result,id,name,url,imdb,tmdb,premiere,type_id,type,country_id,...,streaming,from,to,ranking,ranking_last,value,value_last,value_total,countries,days
0,1.0,97483.0,The Queen&#39;s Gambit,https://flixpatrol.com/title/the-queens-gambit,10048342,87739,2020.0,2.0,TV,4672.0,...,656.0,2020-12-01,2020-12-01,1.0,0.0,10.0,0.0,0.0,0.0,0.0
1,2.0,72764.0,Virgin River,https://flixpatrol.com/title/virgin-river,9077530,88324,2019.0,2.0,TV,4672.0,...,656.0,2020-12-01,2020-12-01,2.0,0.0,9.0,0.0,0.0,0.0,0.0
2,3.0,99577.0,The Beast,https://flixpatrol.com/title/the-beast-2020,11499506,654905,2020.0,1.0,Movie,93.0,...,656.0,2020-12-01,2020-12-01,3.0,0.0,8.0,0.0,0.0,0.0,0.0


In [25]:
# Preview the first 3 rows of the overall top-ten table
general.head(3)

Unnamed: 0,result,id,name,url,imdb,tmdb,premiere,type_id,type,country_id,...,streaming,from,to,ranking,ranking_last,value,value_last,value_total,countries,days
0,1.0,97483.0,The Queen&#39;s Gambit,https://flixpatrol.com/title/the-queens-gambit,10048342,87739,2020.0,2.0,TV,4672.0,...,656.0,2020-12-01,2020-12-01,1.0,0.0,10.0,0.0,0.0,0.0,0.0
1,2.0,72764.0,Virgin River,https://flixpatrol.com/title/virgin-river,9077530,88324,2019.0,2.0,TV,4672.0,...,656.0,2020-12-01,2020-12-01,2.0,0.0,9.0,0.0,0.0,0.0,0.0
2,3.0,99577.0,The Beast,https://flixpatrol.com/title/the-beast-2020,11499506,654905,2020.0,1.0,Movie,93.0,...,656.0,2020-12-01,2020-12-01,3.0,0.0,8.0,0.0,0.0,0.0,0.0


In [26]:
# Preview the first 3 rows of the movies top-ten table
movies.head(3)

Unnamed: 0,result,id,name,url,imdb,tmdb,premiere,type_id,type,country_id,...,streaming,from,to,ranking,ranking_last,value,value_last,value_total,countries,days
0,11.0,99577.0,The Beast,https://flixpatrol.com/title/the-beast-2020,11499506,654905,2020.0,1.0,Movie,93.0,...,656.0,2020-12-01,2020-12-01,1.0,1.0,10.0,0.0,0.0,0.0,0.0
1,12.0,67174.0,Diego Maradona,https://flixpatrol.com/title/diego-maradona,5433114,536841,2019.0,1.0,Movie,89.0,...,656.0,2020-12-01,2020-12-01,2.0,3.0,9.0,0.0,0.0,0.0,0.0
2,13.0,73731.0,The Christmas Chronicles: Part Two,https://flixpatrol.com/title/the-christmas-chr...,11057644,654028,2020.0,1.0,Movie,333.0,...,656.0,2020-12-01,2020-12-01,3.0,2.0,8.0,0.0,0.0,0.0,0.0


In [27]:
# Preview the first 3 rows of the TV-series top-ten table
tv_series.head(3)

Unnamed: 0,result,id,name,url,imdb,tmdb,premiere,type_id,type,country_id,...,streaming,from,to,ranking,ranking_last,value,value_last,value_total,countries,days
0,21.0,97483.0,The Queen&#39;s Gambit,https://flixpatrol.com/title/the-queens-gambit,10048342,87739,2020.0,2.0,TV,4672.0,...,656.0,2020-12-01,2020-12-01,1.0,1.0,10.0,0.0,0.0,0.0,0.0
1,22.0,72764.0,Virgin River,https://flixpatrol.com/title/virgin-river,9077530,88324,2019.0,2.0,TV,4672.0,...,656.0,2020-12-01,2020-12-01,2.0,2.0,9.0,0.0,0.0,0.0,0.0
2,23.0,35210.0,The Crown,https://flixpatrol.com/title/the-crown,4786824,65494,2016.0,2.0,TV,89.0,...,656.0,2020-12-01,2020-12-01,3.0,3.0,8.0,0.0,0.0,0.0,0.0


Esporto i dati ottenuti in file csv

In [29]:
# Export the four tables as CSV files (the DataFrame index is written as the
# first, unnamed column, which is the to_csv default).
# The output folder is created first if it is missing: to_csv does not create
# directories and would raise on a fresh checkout.
cartella_output = Path("Data_flixpatrol/Dati_netflix_dicembre-aprile")
cartella_output.mkdir(parents=True, exist_ok=True)

netflix.to_csv(cartella_output / "netflix_dic-mar2021.csv")
general.to_csv(cartella_output / "top_ten_dic-mar2021.csv")
movies.to_csv(cartella_output / "top_ten_film_dic-mar2021.csv")
tv_series.to_csv(cartella_output / "top_ten_serie_dic-mar2021.csv")

In [19]:
# Rebuild the table of a single document (data[0][20]) to inspect it by hand
df = pd.DataFrame(data = data[0][20]['list'])

In [20]:
# Rows from position 20 onward, i.e. the slice used for the TV-series table
df[20:]

Unnamed: 0,result,id,name,url,imdb,tmdb,premiere,type_id,type,country_id,...,streaming,from,to,ranking,ranking_last,value,value_last,value_total,countries,days
20,21,99253,Tiny Pretty Things,https://flixpatrol.com/title/tiny-pretty-things,10767748,113246,2020,2,TV,4672,...,656,2020-12-21,2020-12-21,1,1,10,0,0,0,0
21,22,98975,The Mess You Leave Behind,https://flixpatrol.com/title/the-mess-you-leav...,9731242,94413,2020,2,TV,90,...,656,2020-12-21,2020-12-21,2,2,9,0,0,0,0
22,23,97483,The Queen&#39;s Gambit,https://flixpatrol.com/title/the-queens-gambit,10048342,87739,2020,2,TV,4672,...,656,2020-12-21,2020-12-21,3,3,8,0,0,0,0
23,24,72750,Home for Christmas,https://flixpatrol.com/title/home-for-christmas,10069398,95612,2019,2,TV,6847,...,656,2020-12-21,2020-12-21,4,4,7,0,0,0,0
24,25,97944,Alice in Borderland,https://flixpatrol.com/title/alice-in-borderland,10795658,110316,2020,2,TV,6840,...,656,2020-12-21,2020-12-21,5,5,6,0,0,0,0
25,26,35210,The Crown,https://flixpatrol.com/title/the-crown,4786824,65494,2016,2,TV,89,...,656,2020-12-21,2020-12-21,6,7,5,0,0,0,0
26,27,42910,Star Trek: Discovery,https://flixpatrol.com/title/star-trek-discovery,5171438,67198,2017,2,TV,4672,...,656,2020-12-21,2020-12-21,7,6,4,0,0,0,0
27,28,29093,The 100,https://flixpatrol.com/title/the-100,2661044,48866,2014,2,TV,4672,...,656,2020-12-21,2020-12-21,8,9,3,0,0,0,0
28,29,99287,The Ripper,https://flixpatrol.com/title/the-ripper,13492374,113372,2020,2,TV,89,...,656,2020-12-21,2020-12-21,9,8,2,0,0,0,0
29,30,72764,Virgin River,https://flixpatrol.com/title/virgin-river,9077530,88324,2019,2,TV,4672,...,656,2020-12-21,2020-12-21,10,10,1,0,0,0,0


In [21]:
# Show the first document of the first collection in full.
# NOTE(review): the dump includes the document's 'api_key' field -- clear this
# output before sharing the notebook.
data[0][0]

{'set': 123,
 'streaming': 656,
 'region': 93,
 'from': '2020-12-1',
 'to': '2020-12-1',
 'country': 0,
 'year': 0,
 'type': 0,
 'genre': 0,
 'company': 0,
 'filter': None,
 'query': None,
 'api_key': '7dhd6194kdk4',
 'api_limit': 1000,
 'api_count': 12,
 'page': 1,
 'results': 30,
 'list': [{'result': 1,
   'id': 97483,
   'name': 'The Queen&#39;s Gambit',
   'url': 'https://flixpatrol.com/title/the-queens-gambit',
   'imdb': '10048342',
   'tmdb': '87739',
   'premiere': 2020,
   'type_id': 2,
   'type': 'TV',
   'country_id': 4672,
   'country': 'USA',
   'genre_id': 6824,
   'genre': 'Drama',
   'company_id': 656,
   'company': 'Netflix',
   'key': '80234304',
   'note': '80234304',
   'set': 1,
   'region': 93,
   'streaming': 656,
   'from': '2020-12-01',
   'to': '2020-12-01',
   'ranking': 1,
   'ranking_last': 0,
   'value': 10,
   'value_last': 0,
   'value_total': 0,
   'countries': 0,
   'days': 0},
  {'result': 2,
   'id': 72764,
   'name': 'Virgin River',
   'url': 'https

In [23]:
# Show only the name, type and set columns of df
df[['name','type','set']]

Unnamed: 0,name,type,set
0,Tiny Pretty Things,TV,1
1,The Mess You Leave Behind,TV,1
2,Rose Island,Movie,1
3,A California Christmas,Movie,1
4,The Queen&#39;s Gambit,TV,1
5,Home for Christmas,TV,1
6,Alice in Borderland,TV,1
7,The Crown,TV,1
8,How the Grinch Stole Christmas,Movie,1
9,The Christmas Chronicles: Part Two,Movie,1
