In [1]:
import ast
import json
import pandas as pd
import boto3
from io import StringIO
import s3_file_operations as s3_ops

In [5]:
# Model the proposed schema

bucket = "de-masterclass-ricknmorty"  # S3 bucket name

# Read data from S3
print("Reading Character data from S3...")
characters_df = s3_ops.read_csv_from_s3(bucket, 'Rick&Morty/Untransformed/Character.csv')

print("Reading Episode data from S3...")
episodes_df = s3_ops.read_csv_from_s3(bucket, 'Rick&Morty/Untransformed/Episode.csv')

print("Reading Location data from S3...")
location_df = s3_ops.read_csv_from_s3(bucket, 'Rick&Morty/Untransformed/Location.csv')


# Check if data is loaded successfully
if characters_df is None or episodes_df is None or location_df is None:
    print("Error in loading data from S3")
else:
    print(f"Characters DataFrame shape: {characters_df.shape}")
    print(f"Episodes DataFrame shape: {episodes_df.shape}")
    print(f"Locations DataFrame shape: {location_df.shape}")


print("Data loaded successfully from S3")

Reading Character data from S3...
An error occurred (InvalidAccessKeyId) when calling the GetObject operation: The AWS Access Key Id you provided does not exist in our records.
Reading Episode data from S3...
An error occurred (InvalidAccessKeyId) when calling the GetObject operation: The AWS Access Key Id you provided does not exist in our records.
Reading Location data from S3...
An error occurred (InvalidAccessKeyId) when calling the GetObject operation: The AWS Access Key Id you provided does not exist in our records.


AttributeError: 'bool' object has no attribute 'shape'

In [None]:
# Transform origin_id and location_id into just ints and not json then drop the old columns

def character_trans(characters_df):
    # Function to extract the ID from a URL
    extract_id = lambda x: x.split('/')[-1] if x else None

    # Using list comprehension to extract origin_id and location_id
    characters_df['origin_id'] = [
        extract_id(ast.literal_eval(record)['url']) if isinstance(record, str) else None
        for record in characters_df['origin']
    ]

    characters_df['location_id'] = [
        extract_id(ast.literal_eval(record)['url']) if isinstance(record, str) else None
        for record in characters_df['location']
    ]
    
    # Drop and rename columns
    print("Dropping and renaming columns...")
    characters_df = characters_df.drop(columns=['origin', 'location', 'episode'])

    characters_df.info()

### 2. Appearance Dataframe Creation
Here we will need to perform the following operations;
- Load the Episodes df that will be our baseline table for the appearance dataframe
- Extract character ids from the character column that consists of a list of urls belonging to all characters that appeared in that particular episode
- Explode the resulting dataframe so that we now have each episode with its respective character as a new row.
- Reset the index of the resulting dataframe so as to generate a new incremental column that will act as the primary key of the new column
- Rename the new columns to resemble the proposed schema,
    1. new_index -> ***id***
    2. id -> ***episode_id***
    3. character_ids -> ***character_id***
- Once this is done we can now drop all other pre-existing columns that belonged to the episodes dataframe and only retain the three columns generated.

In [None]:
# Appearance Table 
def appearance(episodes_df):
    appearance_df = episodes_df.copy()

    # Function to extract the ID from a URL
    character_func = lambda x: [url.split('/')[-1] for url in ast.literal_eval(x)] if isinstance(x, str) else None

    # Using list comprehension to extract character_ids
    appearance_df['character_ids'] = [
        character_func(record) if record else None
        for record in appearance_df['characters']
    ]

    # Explode the 'character_ids' column to create a row for each character ID
    expanded_df = appearance_df.explode('character_ids')

    # Reset the index to create a new 'id' column
    expanded_df = expanded_df.reset_index(drop=True).reset_index().rename(columns={'index': 'id_new'})

    # Rename columns to match the desired output
    expanded_df = expanded_df.rename(columns={'id_new': 'id', 'id': 'episode_id', 'character_ids': 'character_id'})

    # Select only the relevant columns
    expanded_df = expanded_df[['id', 'episode_id', 'character_id']]

    print(expanded_df.head())

In [None]:
# Episodes dataframe (Only drop the character column since we can fetch it using the appearance table)

