# 1. Introduction

In [71]:
#Import libraries
import pandas as pd
import numpy as np

In [54]:
#Read the three source CSV files (movies, genres, regions) into dataframes
movies_df, genres_df, regions_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'contentDataPrime.csv',
        'contentDataGenre.csv',
        'contentDataRegion.csv',
    )
)

# 2. Data Loading and Preparation

## 2.1. Treat Movies Data

For some columns in the movies dataframe, missing values are encoded as "-1". Our first step is to replace these sentinel values with nulls, so that all missing values can later be handled in the same way. The columns in question are:

* length
* releaseYear
* endYear
* gross

In [91]:
#Handle missing values assigned as "-1"
columns_to_treat = ['length', 'releaseYear', 'endYear', 'gross']

# The sentinel can be stored as the number -1 or as the text '-1' when a
# column holds strings, so both forms are matched. A bare `== -1` comparison
# never matches string values.
missing_markers = [-1, '-1']

for col in columns_to_treat:
    is_missing = movies_df[col].isin(missing_markers)
    # mask() puts NaN wherever the condition is True; integer columns are
    # upcast to float so they can hold NaN.
    movies_df[col] = movies_df[col].mask(is_missing)

## 2.2. Consolidate Data

In [56]:
#Consolidate data into a single dataframe
# Left-join genres and regions onto the movies by dataId, then turn the
# missing genre/region of unmatched movies into empty strings.
# fillna is applied to the frame itself rather than through
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
consolidated_df = (
    movies_df
    .join(genres_df.set_index('dataId'), on='dataId')
    .join(regions_df.set_index('dataId'), on='dataId')
    .fillna({'genre': '', 'region': ''})
)

In [62]:
#Inspect the consolidated row(s) for a single title
consolidated_df.loc[consolidated_df['dataId'] == 102795]

Unnamed: 0,dataId,contentType,title,length,releaseYear,endYear,votes,rating,gross,certificate,description,genre,region
0,102795,movie,Ratha Kanneer,154,1954,-1,349,8.5,-1,,"The story revolves around Mohanasundaram, a re...",Drama,India


In [76]:
#Group by genres and regions data
# Collapse the joined rows back to one row per movie, gathering the distinct
# regions and genres of each movie into lists.
# NOTE: this cell overwrites consolidated_df; re-run the consolidation cell
# before re-running this one, because the list-valued columns it produces
# cannot be de-duplicated a second time.
group_columns = list(movies_df.columns)

# dict.fromkeys de-duplicates while keeping first-seen order, so the lists
# come out the same on every run (iteration order of a set of strings can
# change between kernel restarts).
consolidated_df = consolidated_df.groupby(
    by=group_columns, as_index=False, dropna=False
).agg({
    'region': lambda values: list(dict.fromkeys(values)),
    'genre': lambda values: list(dict.fromkeys(values)),
})

In [73]:
#Create function to remove keys with null values or empty lists from dictionary
def remove_null_from_dict(input_dict):
    """Return a new dict containing only the non-empty entries of ``input_dict``.

    An entry is dropped when its value is:

    * a missing scalar (``None``, ``NaN``, ``NaT`` or ``pd.NA``),
    * an empty list ``[]``, or
    * the list ``['']`` (a list holding only an empty string).

    Every other value is kept, including falsy ones such as ``0`` and ``''``.
    The input dictionary is not modified.

    Parameters
    ----------
    input_dict : dict
        Mapping of field name to value, e.g. one record produced by
        ``DataFrame.to_dict(orient='records')``.

    Returns
    -------
    dict
        The entries of ``input_dict`` whose values are not empty.
    """
    def _is_empty(value):
        if isinstance(value, list):
            return value in ([], [''])
        # Restrict pd.isna to scalars: on containers it works element-wise.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    return {
        key: value
        for key, value in input_dict.items()
        if not _is_empty(value)
    }

In [80]:
#Convert each dataframe row to a dictionary and strip its empty fields
mongodb_input_data = list(
    map(remove_null_from_dict, consolidated_df.to_dict(orient='records'))
)

In [81]:
#Preview the first few records instead of dumping the entire list
mongodb_input_data[:3]

[{'dataId': 0,
  'contentType': 'movie',
  'title': 'The Shawshank Redemption',
  'length': '142',
  'releaseYear': 1994,
  'endYear': -1,
  'votes': 2715939,
  'rating': 9.3,
  'gross': 28340000,
  'certificate': 'R',
  'description': 'Over the course of several years, two convicts form a friendship, seeking consolation and, eventually, redemption through basic compassion.',
  'region': ['United States'],
  'genre': ['Drama']},
 {'dataId': 1,
  'contentType': 'movie',
  'title': 'The Dark Knight',
  'length': '152',
  'releaseYear': 2008,
  'endYear': -1,
  'votes': 2688665,
  'rating': 9.0,
  'gross': 534860000,
  'certificate': 'PG-13',
  'description': 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
  'region': ['United States', 'United Kingdom'],
  'genre': ['Action', 'Crime', 'Drama']},
 {'dataId': 2,
  'contentType': 'movie',
  'title':