In [None]:
# Install simplejson and the MongoDB driver packages used by this notebook.
!pip install simplejson
!pip install pymongo
!pip install dnspython
!pip install 'pymongo[srv]'

import re
import sys
import json
import simplejson
import pandas as pd

from google.colab import drive
from pymongo import MongoClient
# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Kaggle dataset stored on Google Drive

In [None]:
# Copy the dataset archive from the mounted Drive into /content/ and extract it.
# NOTE(review): the relative "bruv.zip" path assumes the working directory is /content - confirm.
!cp "/content/drive/MyDrive/projects/PlacementSavior/bruv.zip" "/content/"
!unzip "bruv.zip"

Archive:  bruv.zip
  inflating: credits.csv             
  inflating: keywords.csv            
  inflating: links.csv               
  inflating: links_small.csv         
  inflating: movies_metadata.csv     
  inflating: ratings.csv             
  inflating: ratings_small.csv       


# Base class 
 All other classes inherit from this one; it holds the common methods they share.

In [None]:
class BaseDataParser():
  """Common CSV-to-MongoDB plumbing shared by the parser classes.

  The constructor loads a CSV file into a DataFrame and opens a handle on the
  target MongoDB collection. Documents to upload are accumulated in
  ``self.jsons`` (a list of dicts) and sent with ``writeToMongo``.
  """

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV and bind the MongoDB collection.

    Args:
      csvPath: path of the CSV file handed to ``pd.read_csv``.
      collectionName: name of the collection looked up on the database.
      dbName: name of the database looked up on the client.
      mongoURI: connection string handed to ``MongoClient``.
    """
    self.df = pd.read_csv(csvPath)
    self.jsons = []
    self.myclient = MongoClient(mongoURI)
    self.db = self.myclient[dbName]
    self.collection = self.db[collectionName]
    self.columns = self.df.columns

  def varIsNan(self,var):
    """Return True when ``var`` is not equal to itself, which is how NaN behaves."""
    return var!=var

  def splicerArray(self,document,attribute,requiredSubAttribute):
    """Reduce a list of dicts to the list of one of their fields.

    Args:
      document: mapping holding the attribute.
      attribute: key whose value is a list of dicts (or NaN).
      requiredSubAttribute: key extracted from every dict of that list.

    Returns:
      The list of extracted values, or the original NaN value untouched.
    """
    value = document[attribute]
    if self.varIsNan(value):
      return value
    return [data[requiredSubAttribute] for data in value]

  def splicerSingle(self,document,attribute,requiredSubAttribute):
    """Reduce a nested dict to one of its fields.

    Returns ``document[attribute][requiredSubAttribute]``, or the original
    value untouched when ``document[attribute]`` is NaN.
    """
    value = document[attribute]
    if self.varIsNan(value):
      return value
    return value[requiredSubAttribute]

  def jsonParser(self,input):
    """Parse a cell that holds a Python literal written as text.

    Only literals (lists, dicts, numbers, strings, booleans, ``None``) are
    recognised. The text is never executed, so a cell such as ``print`` or
    ``__import__('os')`` stays a plain string instead of being evaluated.

    Args:
      input: the raw cell value.

    Returns:
      The parsed literal, or ``input`` unchanged when it is not a string or
      does not hold a literal.
    """
    # Function-scope import: ast is not part of the notebook's import cell.
    import ast

    if not isinstance(input, str):
      return input
    try:
      parsed = ast.literal_eval(input)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
      return input
    # "..." parses to Ellipsis, which cannot be serialised to JSON; keep the text.
    if parsed is Ellipsis:
      return input
    return parsed

  def renameColumn(self,json,oldColumnName,newColumnName):
    """Move ``json[oldColumnName]`` to ``json[newColumnName]`` in place and return ``json``.

    Raises:
      KeyError: when ``oldColumnName`` is absent.
    """
    json[newColumnName] = json.pop(oldColumnName)
    return json

  def getSingleJson(self,index):
    """Return the document stored at position ``index`` of ``self.jsons``."""
    return self.jsons[index]

  def removeJsonNaNs(self):
    """Replace NaN values in every document by ``None``.

    Each document is serialised with simplejson using ``ignore_nan=True`` and
    parsed back with ``json.loads``. Documents that cannot be serialised are
    reported and dropped from ``self.jsons``.
    """
    refinedJsons = []
    for document in self.jsons:
      try:
        text = simplejson.dumps(document, indent=4, sort_keys=True,ignore_nan=True)
        refinedJsons.append(json.loads(text))
      except Exception as e:
        print("Code has run into damar: "+str(e))
    self.jsons = refinedJsons

  def printJson(self,index):
    """Pretty-print the document stored at position ``index`` of ``self.jsons``."""
    print(simplejson.dumps(self.jsons[index], indent=4, sort_keys=True,ignore_nan=True))

  def getAllJson(self):
    """Return the list of documents accumulated so far."""
    return self.jsons

  def writeToMongo(self):
    """Insert every document of ``self.jsons`` into the bound collection.

    Documents are inserted one at a time so that a failing document is
    reported without stopping the upload. Progress is printed on one line.
    """
    count = 0
    length = len(self.jsons)
    for item in self.jsons:
      try:
        self.collection.insert_one(item)
      except Exception as e:
        print(e)
      count+=1
      print(
        '\r'," Percentage Uploaded "+str(count*100/length),
        end=''
      )

In [None]:
class MoviesMetaDataParser(BaseDataParser):
  """Builds one MongoDB document per CSV row and uploads them.

  Nested attributes (genres, collection, production companies and countries,
  spoken languages) are reduced to their identifiers or codes.
  """

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV, build the documents, replace NaNs and upload.

    Args:
      csvPath: path of the CSV file to read.
      collectionName: target MongoDB collection name.
      dbName: target MongoDB database name.
      mongoURI: MongoDB connection string.
    """
    # The base constructor takes (csvPath, collectionName, dbName, mongoURI);
    # dbName must be forwarded, otherwise the call raises a TypeError.
    super().__init__(csvPath,collectionName,dbName,mongoURI)
    self.prep()
    self.removeJsonNaNs()
    self.writeToMongo()

  def prep(self):
    """Fill ``self.jsons`` with one document per row.

    A row that cannot be converted is reported with ``print`` and skipped.
    """
    for index,row in self.df.iterrows():
      try:
        x = { column: self.jsonParser(row[column]) for column in self.columns}

        # Keep only the identifier (or ISO code) of every nested entry.
        x['genres'] = self.splicerArray(x,'genres','id')
        x['belongs_to_collection'] = self.splicerSingle(x,'belongs_to_collection','id')
        x['production_companies'] = self.splicerArray(x,'production_companies','id')
        x['production_countries'] = self.splicerArray(x,'production_countries','iso_3166_1')
        x['spoken_languages'] = self.splicerArray(x,'spoken_languages','iso_639_1')
        # Taken from the raw row rather than the parsed value so the date is stored as text.
        x['release_date'] = str(row['release_date'])

        # Rename the reduced attributes to say that they now hold ids.
        x = self.renameColumn(x,'genres','genre_ids')
        x = self.renameColumn(x,'belongs_to_collection','collection_id')
        x = self.renameColumn(x,'production_companies','production_company_ids')

        self.jsons.append(x)
      except Exception as e:
        print(e)


In [None]:
class GenreIdMappings(BaseDataParser):
  """Uploads the distinct entries found in the ``genres`` column."""

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV, collect the distinct genres and upload them.

    Args:
      csvPath: path of the CSV file to read.
      collectionName: target MongoDB collection name.
      dbName: target MongoDB database name.
      mongoURI: MongoDB connection string.
    """
    # The base constructor needs all four arguments; forwarding only two
    # raised a TypeError before anything was read.
    super().__init__(csvPath,collectionName,dbName,mongoURI)
    self.prep()
    self.writeToMongo()

  def prep(self):
    """Fill ``self.jsons`` with every distinct genre entry.

    Entries are de-duplicated through their key-sorted JSON text, kept in
    ``self.genres``. A row that fails is reported with ``print`` and skipped.
    """
    self.genres = set()
    for index,row in self.df.iterrows():
      try:
        entries = self.jsonParser(row['genres'])
        # A missing (NaN) or unparseable cell is not a list: nothing to collect.
        if not isinstance(entries, list):
          continue
        for item in entries:
          self.genres.add(simplejson.dumps(item, indent=4, sort_keys=True,ignore_nan=True))
      except Exception as e:
        print(e)
    self.jsons = [simplejson.loads(item) for item in self.genres]


In [None]:
class ProductionCompaniesMappings(BaseDataParser):
  """Uploads the distinct entries found in the ``production_companies`` column."""

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV, collect the distinct production companies and upload them.

    Args:
      csvPath: path of the CSV file to read.
      collectionName: target MongoDB collection name.
      dbName: target MongoDB database name.
      mongoURI: MongoDB connection string.
    """
    # The base constructor needs all four arguments; forwarding only two
    # raised a TypeError before anything was read.
    super().__init__(csvPath,collectionName,dbName,mongoURI)
    self.prep()
    self.writeToMongo()

  def prep(self):
    """Fill ``self.jsons`` with every distinct production company entry.

    Entries are de-duplicated through their key-sorted JSON text. The set is
    stored in ``self.genres``; the attribute name is kept as it was even
    though it holds companies. A row that fails is reported and skipped.
    """
    self.genres = set()
    for index,row in self.df.iterrows():
      try:
        entries = self.jsonParser(row['production_companies'])
        # A missing (NaN) or unparseable cell is not a list: nothing to collect.
        if not isinstance(entries, list):
          continue
        for item in entries:
          self.genres.add(simplejson.dumps(item, indent=4, sort_keys=True,ignore_nan=True))
      except Exception as e:
        print(e)
    self.jsons = [simplejson.loads(item) for item in self.genres]


In [None]:
class ProductionCountriesMappings(BaseDataParser):
  """Uploads the distinct entries found in the ``production_countries`` column."""

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV, collect the distinct production countries and upload them.

    Args:
      csvPath: path of the CSV file to read.
      collectionName: target MongoDB collection name.
      dbName: target MongoDB database name.
      mongoURI: MongoDB connection string.
    """
    # The base constructor needs all four arguments; forwarding only two
    # raised a TypeError before anything was read.
    super().__init__(csvPath,collectionName,dbName,mongoURI)
    self.prep()
    self.writeToMongo()

  def prep(self):
    """Fill ``self.jsons`` with every distinct country entry.

    The ``iso_3166_1`` key of each entry is renamed to ``code``. Entries are
    de-duplicated through their key-sorted JSON text, kept in
    ``self.productionCountries``. A row that fails is reported and skipped.
    """
    self.productionCountries = set()
    for index,row in self.df.iterrows():
      try:
        entries = self.jsonParser(row['production_countries'])
        # A missing (NaN) or unparseable cell is not a list: nothing to collect.
        if not isinstance(entries, list):
          continue
        for item in entries:
          item = self.renameColumn(item,'iso_3166_1','code')
          self.productionCountries.add(simplejson.dumps(item, indent=4, sort_keys=True,ignore_nan=True))
      except Exception as e:
        print(e)
    self.jsons = [simplejson.loads(item) for item in self.productionCountries]

In [None]:
class SpokenLanguagesMappings(BaseDataParser):
  """Uploads the distinct entries found in the ``spoken_languages`` column."""

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV, collect the distinct spoken languages and upload them.

    Args:
      csvPath: path of the CSV file to read.
      collectionName: target MongoDB collection name.
      dbName: target MongoDB database name.
      mongoURI: MongoDB connection string.
    """
    # The base constructor needs all four arguments; forwarding only two
    # raised a TypeError before anything was read.
    super().__init__(csvPath,collectionName,dbName,mongoURI)
    self.prep()
    self.writeToMongo()

  def prep(self):
    """Fill ``self.jsons`` with every distinct language entry.

    The ``iso_639_1`` key of each entry is renamed to ``code``. Entries are
    de-duplicated through their key-sorted JSON text, kept in
    ``self.spokenLanguages``. A row that fails is reported and skipped.
    """
    self.spokenLanguages = set()
    for index,row in self.df.iterrows():
      try:
        entries = self.jsonParser(row['spoken_languages'])
        # A missing (NaN) or unparseable cell is not a list: nothing to collect.
        if not isinstance(entries, list):
          continue
        for item in entries:
          item = self.renameColumn(item,'iso_639_1','code')
          self.spokenLanguages.add(simplejson.dumps(item, indent=4, sort_keys=True,ignore_nan=True))
      except Exception as e:
        print(e)
    self.jsons = [simplejson.loads(item) for item in self.spokenLanguages]

In [None]:
class CollectionMappings(BaseDataParser):
  """Uploads the distinct entries found in the ``belongs_to_collection`` column."""

  def __init__(self,csvPath,collectionName,dbName,mongoURI):
    """Load the CSV, collect the distinct collections and upload them.

    Args:
      csvPath: path of the CSV file to read.
      collectionName: target MongoDB collection name.
      dbName: target MongoDB database name.
      mongoURI: MongoDB connection string.
    """
    # The base constructor needs all four arguments; forwarding only two
    # raised a TypeError before anything was read.
    super().__init__(csvPath,collectionName,dbName,mongoURI)
    self.prep()
    self.writeToMongo()

  def prep(self):
    """Fill ``self.jsons`` with every distinct collection entry.

    Entries are de-duplicated through their key-sorted JSON text. The set is
    stored in ``self.productionCountries``; the attribute name is kept as it
    was even though it holds collections. The number of rows that raised an
    error is printed at the end.
    """
    self.productionCountries = set()
    count = 0
    for index,row in self.df.iterrows():
      try:
        item = self.jsonParser(row['belongs_to_collection'])
        # Rows without a collection hold NaN, which would serialise to null and
        # come back as None - a value that cannot be inserted as a document.
        if not isinstance(item, dict):
          continue
        self.productionCountries.add(simplejson.dumps(item, indent=4, sort_keys=True,ignore_nan=True))
      except Exception:
        count+=1
    self.jsons = [simplejson.loads(item) for item in self.productionCountries]
    print(count)

Replace the placeholder arguments below with your collection name, database name, and MongoDB connection URI.

In [None]:
# Constructing the parser does all the work: its __init__ reads the CSV, builds the documents and uploads them.
moviesMetaData = MoviesMetaDataParser("movies_metadata.csv","collection_name_here","db_name_here","mongo URI here")

  exec(code_obj, self.user_global_ns, self.user_ns)


'float' object is not subscriptable
'float' object is not subscriptable
'float' object is not subscriptable
Code has run into damar: Object of type ellipsis is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON serializable
Code has run into damar: Object of type builtin_function_or_method is not JSON ser

In [None]:
#collectionMapping = CollectionMappings('movies_metadata.csv',"collection_name_here","db_name_here","mongo URI here")
#spokenLanguagesMapping = SpokenLanguagesMappings('movies_metadata.csv',"collection_name_here","db_name_here","mongo URI here")
#productionCountriesMapping = ProductionCountriesMappings('movies_metadata.csv',"collection_name_here","db_name_here","mongo URI here")
#productionCompaniesMapping = ProductionCompaniesMappings('movies_metadata.csv',"collection_name_here","db_name_here","mongo URI here")
#genresMapping = GenreIdMappings("movies_metadata.csv","collection_name_here","db_name_here","mongo URI here")
#moviesMetaData = MoviesMetaDataParser("movies_metadata.csv","collection_name_here","db_name_here","mongo URI here")