# 1. Introduction

In [117]:
#Import libraries
import pandas as pd
import numpy as np

from pymongo import MongoClient

In [118]:
#Load data files into pandas dataframes
movies_df = pd.read_csv('contentDataPrime.csv')
genres_df = pd.read_csv('contentDataGenre.csv')
regions_df = pd.read_csv('contentDataRegion.csv')

# 2. Data Loading and Preparation

## 2.1. Treat Movies Data

For some columns in the movies dataframe, missing values are assigned with "-1".  Our first step will be to replace these values with nulls. This will help us later on to handle all missing values in similar fashion. The columns in question are:

* length
* releaseYear
* endYear
* gross

In [119]:
#Handle missing values assigned as "-1"
columns_to_treat = ['length', 'releaseYear', 'endYear', 'gross']

for col in columns_to_treat:
    index = movies_df[movies_df[col] == -1].index
    movies_df.loc[index, col] = np.nan

In [120]:
#Identify duplicates
duplicate_movies = movies_df['dataId'].value_counts()
duplicate_movies = list(duplicate_movies[duplicate_movies > 1].index)

movies_df[movies_df['dataId'].isin(duplicate_movies)]

Unnamed: 0,dataId,contentType,title,length,releaseYear,endYear,votes,rating,gross,certificate,description
94850,6986,tvSeries,Bob Hearts Abishola,21,2019.0,,4617,6.9,,TV-PG,An American guy falls in love with his Nigeria...
95790,6986,tvSeries,Kung Fu,-1,2021.0,,4615,5.5,,TV-14,A quarter-life crisis causes a young Chinese-A...
101104,0,movie,The Shawshank Redemption,142,1994.0,,2715939,9.3,28340000.0,R,"Over the course of several years, two convicts..."
101605,0,movie,The Shawshank Redemption,142,1994.0,,2715939,9.3,28340000.0,R,"Over the course of several years, two convicts..."


In [121]:
#Remove duplicates on dataId
movies_df.drop_duplicates(subset='dataId', inplace=True)

In [122]:
#Rename dataId fields to _id (will be used as index in MongoDB)
rename_dict = {'dataId': '_id'}
movies_df.rename(columns=rename_dict, inplace=True)

## 2.2. Consolidate Data

In [123]:
#Consolidate data into a single dataframe
consolidated_df = movies_df.join(on='_id', other=genres_df.set_index('dataId'))
consolidated_df = consolidated_df.join(on='_id', other=regions_df.set_index('dataId'))

consolidated_df['genre'].fillna('', inplace=True)
consolidated_df['region'].fillna('', inplace=True)

In [124]:
#Group by genres and regions data
group_columns = list(movies_df.columns)

consolidated_df = consolidated_df.groupby(by=group_columns, as_index=False, dropna=False).agg({
    'region': lambda x: list(set(x)),
    'genre': lambda x: list(set(x))
})

In [127]:
#Create functions to remove keys with null values or empty lists from dictionary
def remove_null_from_dict(input_dict):
    return {
        k: input_dict[k] 
        for k in input_dict 
        if (
            input_dict[k] == input_dict[k] and 
            input_dict[k] != [] and 
            input_dict[k] != ['']
        )
    }

In [128]:
#Transform data into a list of dictionaries
mongodb_input_data = [
    remove_null_from_dict(record) for record in consolidated_df.to_dict(orient='records')
]

# 3. Insert Data Into MongoDB 

In [129]:
#Set up connection to MongoDB
mongodb_uri = 'mongodb://localhost:27017'
db_name = 'imdb'

client = MongoClient(mongodb_uri)
db = client[db_name]

In [130]:
#Insert data into the movies_and_tvshows collection
db.movies_and_tvshows.insert_many(mongodb_input_data)

<pymongo.results.InsertManyResult at 0x7f9b4019da00>