## Basic Imports

In [1]:
import os
import pandas as pd
import numpy as np
from decouple import config
from tabulate import tabulate

## Disable FutureWarning from pandas

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Build the path to the Parquet file

In [None]:
# The dataset path is resolved relative to the current working directory,
# i.e. the directory the kernel was started from.
current_path = os.getcwd()
parquetfile_location = f'{current_path}/data/movies.local.parquet.updated.gzip'
print(parquetfile_location)

## Load Parquet File

In [None]:
# Read the Parquet file into a DataFrame, then report which file was used.
df = pd.read_parquet(parquetfile_location)
print(f'Parquet file loaded from {parquetfile_location}')

## Get unrated movies from the data

In [5]:
# Rows whose rating is the literal placeholder string "N/A".
unrated_movies = df.loc[df['imdbrating'] == 'N/A']

## Get the rated movies and cast the ratings to float

In [6]:
# Keep only the rows that carry a real rating. The .copy() gives an independent
# frame, so the column assignment below does not write through to `df`.
rated_movies = df.query('imdbrating != "N/A"').copy()
ratings = rated_movies['imdbrating'].astype(float)
# Replace the column by name. `rated_movies.loc[:, 'imdbrating'] = ratings`
# writes into the existing object-dtype column on pandas >= 2.0, leaving the
# floats stored as Python objects instead of converting the column to float64.
rated_movies['imdbrating'] = ratings

## Keep the movies rated below 5.5, sorted from highest to lowest rating

In [7]:
# Low-rated titles only (strictly below 5.5), highest-rated of those first.
filtered_by_rating = rated_movies.loc[rated_movies['imdbrating'] < 5.5]
sorted_data = filtered_by_rating.sort_values('imdbrating', ascending=False)

## Convert the size in bytes to MB and GB (rounded up)

In [8]:
# Coerce the byte counts to a numeric dtype before doing arithmetic on them.
sorted_data["size_in_bytes"] = pd.to_numeric(sorted_data["size_in_bytes"])
# Decimal units (1 MB = 10**6 bytes, 1 GB = 10**9 bytes); every value is
# rounded up to the next whole unit.
sorted_data["size_in_mb"] = np.ceil(sorted_data["size_in_bytes"] / 1_000_000)
sorted_data["size_in_gb"] = np.ceil(sorted_data["size_in_bytes"] / 1_000_000_000)

## Remove the "size_in_bytes" column

In [9]:
# Remove the raw byte-count column from the frame.
# Rebinding (rather than inplace=True) together with errors="ignore" lets this
# cell be re-run without raising KeyError once the column is already gone.
sorted_data = sorted_data.drop(columns="size_in_bytes", errors="ignore")

## Print the sorted dataframe

In [None]:
# Lift pandas' row and column display limits for this one display call; the
# previous settings are restored when the `with` block exits. Further
# option/value pairs can be appended to the argument list.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
):
    display(sorted_data)

In [None]:
# Total size, in decimal GB, of every movie in sorted_data.
# `size_in_gb` stores each movie rounded UP to a whole GB, so summing that
# column overstates the total by up to 1 GB per row. Summing the MB column and
# converting once keeps the rounding error below 1 MB per row.
total_size_gb = sorted_data["size_in_mb"].sum() / 1000
print(f"Total size in GB: {total_size_gb}")


In [None]:
# Search df for movies whose title starts with "N".
# na=False counts missing titles as non-matches; without it the mask holds NaN
# for those rows and boolean indexing raises ValueError.
df[df['title'].str.startswith('N', na=False)]
