In [1]:
import pandas as pd
import json
import re
import os
import numpy as np
import sqlite3

## JSON File Loading and Data Cleaning

In [2]:
file_path = r"C:\Users\rahul\python\Data Assignment - Reunion\JSON"
clean_dict = {}
for dir,_,files in os.walk(file_path):
    for file in files:
        df = pd.read_json(dir+"/"+file)
        df_copy = df.copy()
        movie_names = df_copy.columns

        max_len = 0
        for movie in movie_names:
            data = df_copy[movie].loc[0]
            if len(data) > max_len:
                max_len = len(data)
                max_len_movie = movie
        
        data = df_copy[max_len_movie].loc[0]
        data = {re.sub(r'\s','',key).lower():value for key, value in data.items()}
        for key in data:
            data[key] = []
        data_dict = {'moviename':[]}|data
        
        for movie in movie_names:
            df_dict = df_copy[movie].loc[0]
            df_dict = {re.sub(r'\s','',key).lower():value for key, value in df_dict.items()}
            data_dict['moviename'].append(movie)
            for key in data:
                data_dict[key].append(df_dict.get(key))
        
        clean_dict[file] = pd.DataFrame(data_dict)
        clean_dict[file].replace([None,'unknown','unkown','unrated'],np.nan,inplace=True)

In [3]:
# Stack the per-file frames into a single table with a fresh 0..n-1 index.
per_file_frames = list(clean_dict.values())
all_movies_df = pd.concat(per_file_frames, ignore_index=True)

## Data Normalization:

In [4]:
# Convert the three score columns to numeric dtypes. pd.to_numeric is left in
# its default strict mode, so an unparseable score raises instead of being
# silently turned into NaN.
for score_col in ('imdbmetascore', 'popcornscore', 'tomatoscore'):
    all_movies_df[score_col] = pd.to_numeric(all_movies_df[score_col])


def _gross_to_number(value):
    """Reduce one raw gross value to something pd.to_numeric can parse.

    Values that are already numeric (NaN included) are returned unchanged;
    pushing them through str() would turn 1234.0 into the digits "12340".
    Anything else is stringified and only its digit runs are kept, which strips
    currency symbols and thousands separators. A value with no digits is NaN.

    NOTE(review): a decimal point or magnitude suffix inside a string is also
    stripped ("1.5M" would become 15) - confirm the sources never use that form.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    digits = "".join(re.findall(r'\d+', str(value)))
    return digits if digits else np.nan


all_movies_df['gross'] = pd.to_numeric(all_movies_df['gross'].apply(_gross_to_number))

In [5]:
# Discard rows that are identical in every column. The surviving rows keep
# their original index labels (the index is not reset here).
all_movies_df = all_movies_df.drop_duplicates()
all_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 0 to 106
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   moviename      77 non-null     object 
 1   genre          34 non-null     object 
 2   gross          21 non-null     float64
 3   imdbmetascore  34 non-null     float64
 4   popcornscore   63 non-null     float64
 5   rating         63 non-null     object 
 6   tomatoscore    62 non-null     float64
dtypes: float64(4), object(3)
memory usage: 4.8+ KB


In [6]:
# One row per distinct title, with gross summed over that title's rows.
# NOTE(review): sum() yields 0 (not NaN) for a title whose gross values are all
# missing, and adds up gross for a title that appears in several rows - confirm
# both are intended before relying on this column.
movies_df = all_movies_df.groupby('moviename', as_index=False)['gross'].sum()
# Surrogate key: 1..n in the (sorted-by-title) group order.
movies_df = movies_df.assign(movieID=movies_df.index + 1)

In [7]:
# Lookup table of distinct genres. Rows with a missing genre are dropped first:
# otherwise NaN survives drop_duplicates() as a "genre" of its own and is given
# a genreID like any real value.
genre_df = (
    all_movies_df[['genre']]
    .dropna(subset=['genre'])
    .drop_duplicates()
    .reset_index(drop=True)
)
# Surrogate key: 1..n in first-seen order.
genre_df['genreID'] = genre_df.index+1

In [8]:
# Columns that make up one rating record (moviename is only the join key).
rating = ['moviename','imdbmetascore', 'popcornscore','rating', 'tomatoscore']

# Distinct rating records. ratingID is derived from the retained (non-reset)
# index labels, so the IDs are unique but not contiguous.
unique_ratings = all_movies_df.loc[:, rating].drop_duplicates()
unique_ratings = unique_ratings.assign(ratingID=unique_ratings.index + 1)

# Swap the title for its movieID so the table references movies by key.
rating_df = (
    movies_df[['movieID', 'moviename']]
    .merge(unique_ratings, on='moviename', how='inner')
    .drop(columns='moviename')
)

In [9]:
# Distinct (title, genre) pairs. Rows without a genre are removed before the
# joins: pandas merge matches NaN keys to each other, which would otherwise
# link every genre-less movie to a NaN row in genre_df.
movie_genre_df = (
    all_movies_df[['moviename','genre']]
    .dropna(subset=['genre'])
    .drop_duplicates()
)
# Resolve genre -> genreID, then moviename -> movieID.
movie_genre_df = genre_df.merge(movie_genre_df,on='genre',how='inner')
movie_genre_df = movies_df.merge(movie_genre_df,on='moviename',how='inner')
# Keep only the two key columns of the link table.
movie_genre_df = movie_genre_df.drop(['moviename','gross','genre'],axis=1)

## Connect to SQL:

In [10]:
import getpass

import psycopg2

# Connection settings come from the standard PG* environment variables, falling
# back to the original local defaults. The password is deliberately NOT stored
# in the notebook: it is read from PGPASSWORD, or prompted for when unset.
conn = psycopg2.connect(
    host = os.environ.get('PGHOST', 'localhost'),
    database = os.environ.get('PGDATABASE', 'moviedb'),
    user = os.environ.get('PGUSER', 'postgres'),
    password = os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: ')
)

# Single cursor shared by the insert cells below.
cursor = conn.cursor()

In [11]:
# Load the Movies table as one transaction: `with conn` commits once after the
# loop and rolls back if any insert fails, so a failure neither leaves a
# half-loaded table nor an aborted transaction on the shared connection.
with conn:
    for idx,row in movies_df.iterrows():
        cursor.execute(
            '''INSERT INTO Movies (movieid,Title,gross) VALUES (%s, %s, %s) RETURNING movieid''',
            (row['movieID'],row['moviename'],row['gross'])
        )

In [12]:
# Load the genres table as one transaction: `with conn` commits once after the
# loop and rolls back if any insert fails, instead of committing row by row.
with conn:
    for idx,row in genre_df.iterrows():
        cursor.execute(
            '''INSERT INTO genres (genreID,GenreName) VALUES (%s, %s) RETURNING genreID''',
            (row['genreID'],row['genre'])
        )

In [13]:
# Load the ratings table as one transaction (`with conn` commits once, or rolls
# back on error). Missing values are sent as None so they are stored as SQL
# NULL; passing NaN through would store a float NaN for the scores and the
# text 'NaN' for the rating.
with conn:
    for idx,row in rating_df.iterrows():
        values = (row['ratingID'],row['movieID'],row['imdbmetascore'],row['popcornscore'],row['rating'],row['tomatoscore'])
        cursor.execute(
            '''INSERT INTO ratings (RatingID,MovieID,IMDB_Metascore,Popcorn_Score,Rating,Tomato_Score) VALUES (%s, %s, %s, %s, %s, %s) RETURNING RatingID''',
            tuple(None if pd.isna(value) else value for value in values)
        )

In [14]:
# convert_dtypes() switches the key columns to pandas' nullable dtypes.
# NOTE(review): presumably needed so iterrows() hands psycopg2 plain Python ints
# rather than numpy integers it cannot adapt - confirm.
movie_genre_df = movie_genre_df.convert_dtypes()

# Load the link table as one transaction: `with conn` commits once after the
# loop and rolls back if any insert fails, instead of committing row by row.
with conn:
    for idx,row in movie_genre_df.iterrows():
        cursor.execute(
            '''INSERT INTO MovieGenres (MovieID,GenreID) VALUES (%s, %s)''',
            (row['movieID'],row['genreID'])
        )