# Lecture 8: Assessing Gender Gap Off-Screen in Serial Production (Part 2)
Date: December 1, 2023
Duration: 3 hours

## Outline


### Data analysis techniques (1 hour)
- Descriptive statistics, hypothesis testing

In [None]:
import pandas as pd
import numpy as np
import json
import xmltodict
import dask.dataframe as dd
import pymongo
import multiprocessing
from imdb import IMDb
# Shared IMDb client object; get_main() and the infoset listing cell call it.
ia = IMDb()
# CPU count reported by multiprocessing; used as the Dask partition count
# in dask_impl() and app().
CPU_COUNT = multiprocessing.cpu_count()

In [None]:
# Read the list of title codes from a file and remove the first two characters of each code.
def get_data(filename):
    """Load title codes from the first column of a CSV file.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read. Its first row is treated as a header and is
        replaced by the column name ``_id``.

    Returns
    -------
    pandas.DataFrame
        One column, ``_id``, holding each code with its first two
        characters removed.
    """
    codes = pd.read_csv(filename, header=0, names=['_id'], usecols=[0])
    # Keep everything from the third character onwards.
    codes['_id'] = codes['_id'].str.slice(start=2)
    return codes

In [None]:
# Print what the IMDb client returns from get_movie_infoset(); these names are
# what get_main() passes as `info=` when downloading a title.
print(ia.get_movie_infoset())

In [None]:
# Download the record for the given title identifier (title)
# and return it serialised as JSON.
def get_main(title):
    """Fetch one title from IMDb and return its record as a JSON string.

    Parameters
    ----------
    title : str
        Title identifier, as found in the ``_id`` column built by get_data().

    Returns
    -------
    str
        JSON text produced by converting the record's XML representation
        with xmltodict and dumping the resulting mapping.
    """
    # The two info sets must be separate items. The previous adjacent
    # literals 'main''full credits' were concatenated by Python into the
    # single, non-existent info set 'mainfull credits'.
    mv = ia.get_movie(title, info=['main', 'full credits'])
    return json.dumps(xmltodict.parse(mv.asXML()))

In [None]:
# Apply get_main() to every title identifier contained in the dataframe.
# The frame is split into CPU_COUNT partitions so Dask can work on them in parallel.
def dask_impl(df):
    """Download the IMDb record for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``_id`` column of title identifiers.

    Returns
    -------
    pandas.Series
        One JSON string per input row, as returned by get_main().
    """
    partitioned = dd.from_pandas(df, npartitions=CPU_COUNT)
    records = partitioned.apply(
        lambda row: get_main(row._id),
        axis=1,
        # get_main() returns JSON text, so the result is declared as an
        # unnamed object-dtype Series. The previous `meta=(int)` was just
        # `int` and described the output as integers.
        meta=(None, 'object'),
    )
    return records.compute()

In [None]:
def connect():
    """Return the MongoDB collection that receives the downloaded records.

    The connection string is read from the ``MONGODB_URI`` environment
    variable instead of being written in the notebook, so that user name
    and password are never committed or shared together with the code.
    The credentials that used to be embedded here should be considered
    exposed and rotated.

    Returns
    -------
    pymongo.collection.Collection
        The ``Italiani1`` collection of the ``ProvaLezione`` database.

    Raises
    ------
    RuntimeError
        If ``MONGODB_URI`` is not set or is empty.
    """
    import os  # local import keeps this cell self-contained

    uri = os.environ.get('MONGODB_URI')
    if not uri:
        raise RuntimeError(
            'Set the MONGODB_URI environment variable to your MongoDB '
            'connection string before calling connect().'
        )
    client = pymongo.MongoClient(uri)
    return client['ProvaLezione']['Italiani1']

In [None]:
def to_mongo(mov):
    """Insert one JSON-encoded title record into the MongoDB collection.

    Parameters
    ----------
    mov : str
        JSON text as returned by get_main(); it must contain the key
        ``movie`` with an ``@id`` entry.

    Raises
    ------
    KeyError
        If the decoded record has no ``movie`` / ``@id`` entry.
    """
    # Decode first, so malformed input fails before a connection is opened.
    record = json.loads(mov)
    # Use the title id carried by the record as the document's primary key.
    record['_id'] = record['movie']['@id']

    collection = connect()
    try:
        collection.insert_one(record)
    finally:
        # connect() builds a new MongoClient on every call; close it so that
        # one client per inserted row is not left open.
        collection.database.client.close()

In [None]:
def app(df):
    """Store every record of ``df`` in MongoDB by applying to_mongo() with Dask.

    Parameters
    ----------
    df : pandas.Series
        JSON strings, one per title (main() passes the output of dask_impl()).

    Returns
    -------
    None
        The function is called for its side effect only.
    """
    partitioned = dd.from_pandas(df, npartitions=CPU_COUNT)
    # to_mongo() returns None, so the output is declared as an unnamed
    # object-dtype Series. The previous `meta=(int)` was just `int`.
    partitioned.apply(to_mongo, meta=(None, 'object')).compute()

In [None]:
def main(filename='imdb_codes.csv'):
    """Download the IMDb record of every listed title and store it in MongoDB.

    Parameters
    ----------
    filename : str, optional
        CSV file whose first column holds the title codes. Defaults to
        ``'imdb_codes.csv'``, the file this notebook originally read.
    """
    titles = get_data(filename)
    # Discard missing results before handing the records to the insert step.
    records = dask_impl(titles).dropna()
    app(records)

In [None]:
# Run the download-and-store pipeline (the main() defined in the cell above).
main()

### Data visualization (30 minutes)
- Tools and techniques

In [None]:
def aggregation(field):
    """Flatten one credit field of the title collection into the crew collection.

    The pipeline:

    1. keeps documents where ``field`` exists and is not an empty array;
    2. keeps only ``field`` and ``year``;
    3. unwinds ``field`` so each of its elements becomes its own document;
    4. adds ``code`` (copy of the element's ``_id``), ``role`` (the field
       name) and ``year`` to the element;
    5. promotes the element to the document root;
    6. drops ``_id``;
    7. merges the result into ``crew-collection``.

    Parameters
    ----------
    field : str
        Name of the credit field to process (e.g. ``'director'``).

    Returns
    -------
    None
        The result is written to MongoDB by the ``$merge`` stage.
    """
    # Insert your MongoDB connection string (with your own user name and password) here.
    # The context manager closes the client when the pipeline has run, so
    # calling this function in a loop does not leave one client open per field.
    with pymongo.MongoClient('mongodb-striga-connessione') as client:
        # Set here the database and collection names generated by IMDb2Mongo.
        client['database']['collection'].aggregate([
            {
                '$match': {
                    f'{field}': {
                        '$exists': True,
                        '$ne': []
                    }
                }
            }, {
                '$project': {
                    '_id': 0,
                    f'{field}': 1,
                    'year': 1
                }
            }, {
                '$unwind': {
                    'path': f'${field}'
                }
            }, {
                '$addFields': {
                    f"{field}.code": f'${field}._id',
                    f"{field}.role": f'{field}',
                    f"{field}.year": '$year'
                }
            }, {
                '$replaceRoot': {
                    'newRoot': f'${field}'
                }
            }, {
                '$project': {
                    '_id': 0
                }
            }, {
                '$merge': {
                    # Set here the name of the crew collection that will be saved on MongoDB.
                    'into': 'crew-collection'
                }
            }
        ])

In [None]:
def main():
    """Run aggregation() once for each credit field listed below.

    Note: this definition reuses the name ``main`` from the download
    section; once this cell has run, ``main`` refers to this function.
    """
    # The fields should be reviewed and any new ones added.
    credit_fields = (
        'art department',
        'art direction',
        'assistant director',
        'camera and electrical department',
        'cast',
        'casting department',
        'casting director',
        'cinematographer',
        'composer',
        'costume department',
        'costume designer',
        'creator',
        'director',
        'editor',
        'editorial department',
        'location management',
        'make up',
        'miscellaneous crew',
        'music department',
        'producer',
        'production design',
        'production manager',
        'script department',
        'set decoration',
        'sound crew',
        'special effects',
        'stunt performer',
        'visual effects',
        'writer',
    )
    for credit_field in credit_fields:
        aggregation(credit_field)

In [None]:
# Run the per-field aggregation loop (the main() defined in the cell above).
main()

## Genderize dataset

In [None]:
# Load the crew table and the name/gender lookup table.
# Both files are comma-separated, which is read_csv's default delimiter.
crew = pd.read_csv("Crew_significant03112022.csv")
gender = pd.read_csv("names_gendered_rev_ita.14.09.2022.csv")

In [None]:
# Rename the 'code' column to 'nconst'. Rebinding the name (instead of
# mutating in place) gives the same table.
crew = crew.rename(columns={'code': 'nconst'})

In [None]:
# Left join: every crew row is kept. No `on=` key is given, so pandas joins on
# all column names the two tables share.
# NOTE(review): presumably the shared key is 'nconst' -- confirm against the CSV headers.
crew_gender = pd.merge(crew, gender, how='left')

In [None]:
# Preview the first rows of the merged table.
crew_gender.head()

In [None]:
# Remove the 'name' column from the merged table.
crew_gender = crew_gender.drop(columns='name')

In [None]:
# Preview the table again after the 'name' column has been dropped.
crew_gender.head()

In [None]:
# Write the merged table to CSV; index=False leaves the row index out of the file.
crew_gender.to_csv(r"crew_significant_gender.03.11.2022.csv", index=False)

### Q&A and discussion (30 minutes)