# Imports

In [1]:
import copy
import json
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests as rq
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Definitions

In [2]:
# Every category is served from its own path under the same API root.
base_urls = {
    name: f"https://swapi.dev/api/{name}/"
    for name in ("films", "people", "planets", "species", "starships", "vehicles")
}

# Category names, in the insertion order of `base_urls`.
categories = [*base_urls]
categories

['films', 'people', 'planets', 'species', 'starships', 'vehicles']

Each category has several fields whose values are URLs pointing to other resources. For each category, I will extract the resource id from those URLs.

In [3]:
# For each category, the keys whose values are links to other resources.
fields = dict(
    films=["characters", "planets", "starships", "vehicles", "species"],
    people=["homeworld", "films", "species", "vehicles", "starships"],
    planets=["residents", "films"],
    species=["people", "films", "homeworld"],
    vehicles=["pilots", "films"],
    starships=["pilots", "films"],
)

# Consume the API

In [4]:
def scrape_category(url, timeout=30):
    """Download every item of a paginated category.

    Starting at ``url``, follows the ``next`` link of each page until it is
    empty and collects the entries found under ``results``.

    Parameters
    ----------
    url : str
        Address of the first page of the category.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of dict or None
        The items of all pages, without their ``created`` / ``edited`` keys.
        ``None`` when a page answers 404 (a message is printed).

    Raises
    ------
    Exception
        Whatever ``response.raise_for_status()`` raises for an error status,
        or ``RuntimeError`` for any other non-200 answer.
    """
    next_url = url
    items_list = []

    while next_url:
        response = rq.get(next_url, timeout=timeout)

        if response.status_code == 404:
            print(f'{url} not found!')
            return

        # Any other failure must stop the loop: reusing the previous page's
        # content would request the same `next` link forever.
        if response.status_code != 200:
            response.raise_for_status()
            raise RuntimeError(
                f'Unexpected status {response.status_code} for {next_url}'
            )

        content = response.json()
        next_url = content['next']

        for item in content['results']:
            # Drop each bookkeeping key independently; either may be absent.
            item.pop('created', None)
            item.pop('edited', None)
            items_list.append(item)

    return items_list

## Scrape all the categories and store in *starwars_raw.json*
(it takes 12.9 seconds)

In [None]:
# Scrape the API only when no cached copy exists; otherwise reuse the json file.
if not os.path.exists('../data/starwars_raw.json'):
    raw_dict = {cat : scrape_category(base_urls[cat]) for cat in categories}

    # exist_ok=True: the folder may already exist even though the json file
    # does not, and a plain makedirs would raise FileExistsError in that case.
    os.makedirs('../data', exist_ok=True)
    # store into a json file
    with open('../data/starwars_raw.json', 'w') as file:
        json.dump(raw_dict, file, indent=4)
    print('Content from Star Wars API stored in a json file!')

else:
    print('The content already exists in a json file!')
    with open('../data/starwars_raw.json', 'r') as file:
        raw_dict = json.load(file)

Content from Star Wars API stored in a json file!


Function to process a single item of a category,
e.g. one character, one planet or one film.

In [6]:
def _link_id(link):
    """Return the integer id that ends a resource link (trailing '/' optional)."""
    return int(link.rstrip('/').split('/')[-1])


def process_item(item, fields):
    """Replace the resource links of an item by the ids they point to.

    Works on a deep copy, so ``item`` itself is left untouched.

    Parameters
    ----------
    item : dict
        One raw item. Must contain ``url`` and every key listed in ``fields``.
    fields : iterable of str
        Keys of ``item`` that hold links.

    Returns
    -------
    dict
        Copy of ``item`` where

        * ``homeworld`` is an int id, or ``()`` when empty;
        * ``species`` is ``1`` when empty, an int when there is a single link
          and a tuple of ints when there are several links;
        * every other listed field is a tuple of int ids (``()`` when empty);
        * ``id`` holds the id parsed from ``url``;
        * ``created`` and ``edited`` are removed when present.
    """
    item = copy.deepcopy(item)

    for field in fields:
        raw_value = item[field]

        if field == 'homeworld':
            # a single link rather than a list of links
            item[field] = _link_id(raw_value) if raw_value else ()

        elif field == 'species':
            species_ids = tuple(_link_id(link) for link in raw_value or ())
            if not species_ids:
                # an empty species list is treated as human (species 1)
                item[field] = 1
            elif len(species_ids) == 1:
                # single species: keep a plain integer
                item[field] = species_ids[0]
            else:
                # several species: keep all of them instead of silently
                # discarding everything after the first link
                item[field] = species_ids

        else:
            # tuples are hashable, lists are not
            item[field] = tuple(_link_id(link) for link in raw_value or ())

    item['id'] = _link_id(item['url'])

    # Remove each bookkeeping key on its own: a missing 'created' must not
    # prevent 'edited' from being removed.
    item.pop('created', None)
    item.pop('edited', None)

    return item

# Store the processed data

In [None]:
# Build the processed items once and cache them; afterwards reload the cache.
if not os.path.exists('../data/starwars_processed_items.json'):
    # dictionary to store the processed categories
    processed_dict = {}

    # process each item for all the categories
    for k, v in raw_dict.items():
        items_processed = []
        # `v or []`: a category whose download failed is stored as None
        for item in v or []:
            try:
                items_processed.append(process_item(item, fields[k]))
            except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
                # report what went wrong instead of hiding it
                print(f'Error in {k}: {e!r}')
        # always register the category, even when it ended up empty
        processed_dict[k] = items_processed

    # store the information in a json file
    with open('../data/starwars_processed_items.json', 'w') as file:
        json.dump(processed_dict, file, indent = 4)

# the file already exists, so load it
else:
    with open('../data/starwars_processed_items.json', 'r') as file:
        processed_dict= json.load(file)

        # json has no tuples: turn the lists back into tuples after reading
        for cat in categories:
            for item in processed_dict[cat]:
                for field in fields[cat]:
                    try:
                        item[field] = tuple(item[field])
                    except TypeError:
                        # the field holds a single integer, which cannot be
                        # passed to tuple(); leave it as an integer
                        pass

    print('Processed data already existed, so the *categories_dict_processed* dictionary will be created from json file.')

# Dataframes

## Create the dataframes
Create a dictionary to store the dataframes from each category

In [8]:
# One slot per category, filled in by the next cell.
dataframes = {cat: None for cat in categories}

In [None]:
for cat in categories:
    df = pd.DataFrame(processed_dict[cat])

    # link fields get an '_id' suffix; the item's own id becomes '<category>_id'
    id_column = f'{cat}_id'
    rename_dict = {'id': id_column}
    for field in fields[cat]:
        rename_dict[field] = f'{field}_id'
    df.rename(columns=rename_dict, inplace=True)

    # place the id column first and keep the others in their current order
    sorted_columns = [id_column]
    sorted_columns.extend(col for col in df.columns if col != id_column)
    dataframes[cat] = df[sorted_columns]

## Rename some columns
Rename some columns so that identifier columns use consistent, singular names across tables.

In [12]:
# Per category: current column name -> final column name.
col_rename_dict = dict(
    films=dict(
        characters_id='character_id',
        films_id='film_id',
        planets_id='planet_id',
        episode_id='episode',
        starships_id='starship_id',
        vehicles_id='vehicle_id',
    ),
    people=dict(
        people_id='character_id',
        films_id='film_id',
        vehicles_id='vehicle_id',
        starships_id='starship_id',
    ),
    planets=dict(
        planets_id='planet_id',
        films_id='film_id',
    ),
    species=dict(
        people_id='character_id',
        films_id='film_id',
    ),
    vehicles=dict(
        pilots_id='pilot_id',
        vehicles_id='vehicle_id',
        films_id='film_id',
    ),
    starships=dict(
        pilots_id='pilot_id',
        starships_id='starship_id',
        films_id='film_id',
    ),
)

In [13]:
# Apply the per-category renames directly on the stored frames.
for cat in categories:
    mapping = col_rename_dict[cat]
    frame = dataframes[cat]
    frame.rename(columns=mapping, inplace=True)

## Clean the datasets

### Clean people

In [14]:
# `df` is the very object stored in `dataframes` (no copy is made), so the
# column assignments in the following cells change the stored people frame.
df = dataframes['people']
# Random preview: no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,character_id,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld_id,film_id,species_id,vehicle_id,starship_id,url
68,70,Zam Wesell,168,55,blonde,"fair, green, yellow",yellow,unknown,female,54,"(5,)",30,"(45,)",(),https://swapi.dev/api/people/70/
45,47,Ratts Tyerel,79,15,none,"grey, blue",unknown,unknown,male,38,"(4,)",16,(),(),https://swapi.dev/api/people/47/
34,36,Jar Jar Binks,196,66,none,orange,orange,52BBY,male,8,"(4, 5)",12,(),(),https://swapi.dev/api/people/36/
51,53,Kit Fisto,196,87,none,green,black,unknown,male,44,"(4, 5, 6)",21,(),(),https://swapi.dev/api/people/53/
47,49,Gasgano,122,unknown,none,"white, blue",black,unknown,male,40,"(4,)",18,(),(),https://swapi.dev/api/people/49/


In [15]:
# Show the dtype of every column of the people frame.
df.dtypes

character_id     int64
name            object
height          object
mass            object
hair_color      object
skin_color      object
eye_color       object
birth_year      object
gender          object
homeworld_id     int64
film_id         object
species_id       int64
vehicle_id      object
starship_id     object
url             object
dtype: object

In [16]:
# 'unknown' -> NaN, strip the thousands separator, then cast to float.
mass_text = df['mass'].replace('unknown', np.nan)
mass_text = mass_text.str.replace(',', '', regex=False)
df['mass'] = mass_text.astype('float')

In [17]:
# 'unknown' heights become NaN so the column can be cast to float.
height_values = df['height'].replace('unknown', np.nan)
df['height'] = height_values.astype('float')

In [18]:
# Treat 'n/a' hair colour as a missing value.
df['hair_color'] = df['hair_color'].replace('n/a', np.nan)

In [19]:
# Insert a space before the era suffix, e.g. '52BBY' -> '52 BBY'.
df['birth_year'] = df['birth_year'].str.replace('BBY', ' BBY', regex=False)

### Clean films

In [20]:
# `df` now refers to the films frame stored in `dataframes` (same object, no copy).
df = dataframes['films']
# Random preview: no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,film_id,title,episode,opening_crawl,director,producer,release_date,character_id,planet_id,starship_id,vehicle_id,species_id,url
4,5,Attack of the Clones,2,There is unrest in the Galactic\r\nSenate. Sev...,George Lucas,Rick McCallum,2002-05-16,"(2, 3, 6, 7, 10, 11, 20, 21, 22, 33, 35, 36, 4...","(1, 8, 9, 10, 11)","(21, 32, 39, 43, 47, 48, 49, 52, 58)","(4, 44, 45, 46, 50, 51, 53, 54, 55, 56, 57)",1,https://swapi.dev/api/films/5/
3,4,The Phantom Menace,1,Turmoil has engulfed the\r\nGalactic Republic....,George Lucas,Rick McCallum,1999-05-19,"(2, 3, 10, 11, 16, 20, 21, 32, 33, 34, 35, 36,...","(1, 8, 9)","(31, 32, 39, 40, 41)","(33, 34, 35, 36, 37, 38, 42)",1,https://swapi.dev/api/films/4/
2,3,Return of the Jedi,6,Luke Skywalker has returned to\r\nhis home pla...,Richard Marquand,"Howard G. Kazanjian, George Lucas, Rick McCallum",1983-05-25,"(1, 2, 3, 4, 5, 10, 13, 14, 16, 18, 20, 21, 22...","(1, 5, 7, 8, 9)","(2, 3, 10, 11, 12, 15, 17, 22, 23, 27, 28, 29)","(8, 16, 18, 19, 24, 25, 26, 30)",1,https://swapi.dev/api/films/3/
5,6,Revenge of the Sith,3,War! The Republic is crumbling\r\nunder attack...,George Lucas,Rick McCallum,2005-05-19,"(1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 20, 21, ...","(1, 2, 5, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19)","(2, 32, 48, 59, 61, 63, 64, 65, 66, 68, 74, 75)","(33, 50, 53, 56, 60, 62, 67, 69, 70, 71, 72, 7...",1,https://swapi.dev/api/films/6/
0,1,A New Hope,4,It is a period of civil war.\r\nRebel spaceshi...,George Lucas,"Gary Kurtz, Rick McCallum",1977-05-25,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15...","(1, 2, 3)","(2, 3, 5, 9, 10, 11, 12, 13)","(4, 6, 7, 8)",1,https://swapi.dev/api/films/1/


In [21]:
# List the distinct values found in the films' species_id column.
df.species_id.unique()

array([1])

In [22]:
# Show the dtype of every column of the films frame.
df.dtypes

film_id           int64
title            object
episode           int64
opening_crawl    object
director         object
producer         object
release_date     object
character_id     object
planet_id        object
starship_id      object
vehicle_id       object
species_id        int64
url              object
dtype: object

In [23]:
# Parse the release date text into datetime values.
df['release_date'] = pd.to_datetime(df['release_date'])

### Clean planets

In [24]:
# `df` now refers to the planets frame stored in `dataframes` (same object, no copy).
df = dataframes['planets']
# Random preview: no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,planet_id,name,rotation_period,orbital_period,diameter,climate,gravity,terrain,surface_water,population,residents_id,film_id,url
23,24,Nal Hutta,87,413,12150,temperate,1 standard,"urban, oceans, swamps, bogs",unknown,7000000000,"(16,)",(),https://swapi.dev/api/planets/24/
18,19,Saleucami,26,392,14920,hot,unknown,"caves, desert, mountains, volcanoes",unknown,1400000000,(),"(6,)",https://swapi.dev/api/planets/19/
55,56,Skako,27,384,unknown,temperate,1,"urban, vines",unknown,500000000000,"(76,)",(),https://swapi.dev/api/planets/56/
42,43,Cerea,27,386,unknown,temperate,1,verdant,20,450000000,"(52,)",(),https://swapi.dev/api/planets/43/
30,31,Mon Cala,21,398,11030,temperate,1,"oceans, reefs, islands",100,27000000000,"(27,)",(),https://swapi.dev/api/planets/31/


In [25]:
# Show the dtype of every column of the planets frame.
df.dtypes

planet_id           int64
name               object
rotation_period    object
orbital_period     object
diameter           object
climate            object
gravity            object
terrain            object
surface_water      object
population         object
residents_id       object
film_id            object
url                object
dtype: object

In [26]:
# Numeric planet columns arrive as text; 'unknown' becomes NaN before the cast.
for col in ('rotation_period', 'orbital_period', 'diameter', 'surface_water', 'population'):
    without_unknown = df[col].replace('unknown', np.nan)
    df[col] = without_unknown.astype('float')

In [27]:
# Express the population in millions and state the unit in the column name.
df['population'] = df['population'] / 1E6
df.rename(columns={'population': 'population_millions'}, inplace=True)

In [28]:
# List the distinct gravity values to see which text forms need cleaning.
df.gravity.unique()

array(['1 standard', '1.1 standard', 'N/A',
       '1.5 (surface), 1 standard (Cloud City)', '0.85 standard',
       '0.9 standard', '0.56 standard', '0.75 standard', 'unknown',
       '0.62 standard', '1', '1.56', '0.9', '0.98'], dtype=object)

In [29]:
# Normalise the gravity text.
# The two-valued entry is replaced as a whole *before* ' standard' is stripped:
# once the suffix is gone the original text no longer occurs in the column, so
# replacing it afterwards would never match. Matching the literal also avoids
# depending on the row position of that planet.
gravity = df['gravity'].replace({
    '1.5 (surface), 1 standard (Cloud City)': '1.5',
    'unknown': np.nan,
})
df['gravity'] = gravity.str.replace(' standard', '', regex=False)

### Clean species

In [30]:
# `df` now refers to the species frame stored in `dataframes` (same object, no copy).
df = dataframes['species']
# Random preview: no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,species_id,name,classification,designation,average_height,skin_colors,hair_colors,eye_colors,average_lifespan,homeworld_id,language,character_id,film_id,url
23,24,Iktotchi,unknown,sentient,180,pink,none,orange,unknown,47,Iktotchese,"(56,)","(4, 6)",https://swapi.dev/api/species/24/
9,10,Sullustan,mammal,sentient,180,pale,none,black,unknown,33,Sullutese,"(31,)","(3,)",https://swapi.dev/api/species/10/
25,26,Kel Dor,unknown,sentient,180,"peach, orange, red",none,"black, silver",70,49,Kel Dor,"(58,)","(4, 6)",https://swapi.dev/api/species/26/
0,1,Human,mammal,sentient,180,"caucasian, black, asian, hispanic","blonde, brown, black, red","brown, blue, green, hazel, grey, amber",120,9,Galactic Basic,"(66, 67, 68, 74)","(1, 2, 3, 4, 5, 6)",https://swapi.dev/api/species/1/
32,33,Skakoan,mammal,sentient,unknown,"grey, green",none,unknown,unknown,56,Skakoan,"(76,)","(5, 6)",https://swapi.dev/api/species/33/


In [31]:
# Show the dtype of every column of the species frame.
df.dtypes

species_id           int64
name                object
classification      object
designation         object
average_height      object
skin_colors         object
hair_colors         object
eye_colors          object
average_lifespan    object
homeworld_id        object
language            object
character_id        object
film_id             object
url                 object
dtype: object

In [32]:
# 'unknown' and 'n/a' both mean "no value"; afterwards the column is numeric.
height_text = df['average_height'].replace('unknown', np.nan)
height_text = height_text.replace('n/a', np.nan)
df['average_height'] = height_text.astype('float')

In [33]:
# 'unknown' -> NaN; 'indefinite' is encoded with the sentinel 9999.
lifespan = df['average_lifespan'].replace('unknown', np.nan)
lifespan = lifespan.replace('indefinite', 9999)
df['average_lifespan'] = lifespan.astype('float')

In [34]:
# A species without homeworld carries an empty tuple in this column. Detect
# those cells by value instead of by a hard-coded row label, so the cleaning
# does not depend on the order of the rows, then cast the column to float.
no_homeworld = df['homeworld_id'].map(
    lambda value: isinstance(value, (tuple, list)) and len(value) == 0
)
df['homeworld_id'] = df['homeworld_id'].mask(no_homeworld, np.nan).astype('float')

### Clean vehicles

In [35]:
# `df` now refers to the vehicles frame stored in `dataframes` (same object, no copy).
df = dataframes['vehicles']
# Random preview: no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,vehicle_id,name,model,manufacturer,cost_in_credits,length,max_atmosphering_speed,crew,passengers,cargo_capacity,consumables,vehicle_class,pilot_id,film_id,url
36,72,Corporate Alliance tank droid,NR-N99 Persuader-class droid enforcer,Techno Union,49000,10.96,100,0,4,none,none,droid tank,(),"(6,)",https://swapi.dev/api/vehicles/72/
24,51,LAAT/c,Low Altitude Assault Transport/carrier,Rothana Heavy Engineering,unknown,28.82,620,1,0,40000,unknown,gunship,(),"(5,)",https://swapi.dev/api/vehicles/51/
1,6,T-16 skyhopper,T-16 skyhopper,Incom Corporation,14500,10.4,1200,1,1,50,0,repulsorcraft,(),"(1,)",https://swapi.dev/api/vehicles/6/
30,60,Tsmeu-6 personal wheel bike,Tsmeu-6 personal wheel bike,Z-Gomot Ternbuell Guppat Corporation,15000,3.5,330,1,1,10,none,wheeled walker,"(79,)","(6,)",https://swapi.dev/api/vehicles/60/
14,34,Multi-Troop Transport,Multi-Troop Transport,Baktoid Armor Workshop,138000,31.0,35,4,112,12000,unknown,repulsorcraft,(),"(4,)",https://swapi.dev/api/vehicles/34/


In [36]:
# Unknown prices become NaN so the column can be numeric.
cost = df['cost_in_credits'].replace('unknown', np.nan)
df['cost_in_credits'] = cost.astype('float')

In [37]:
# 'unknown' and 'none' both mean "no value" in these numeric columns.
for col in ('max_atmosphering_speed', 'crew', 'passengers', 'cargo_capacity'):
    cleaned = df[col].replace('unknown', np.nan)
    cleaned = cleaned.replace('none', np.nan)
    df[col] = cleaned.astype('float')

In [38]:
# Unknown lengths become NaN; the rest is cast to float.
length_values = df['length'].replace('unknown', np.nan)
df['length'] = length_values.astype('float')

In [39]:
# Write the '0' entries the same way as the existing 'none' entries.
df['consumables'] = df['consumables'].replace('0', 'none')

### Clean starships

In [40]:
# `df` now refers to the starships frame stored in `dataframes` (same object, no copy).
df = dataframes['starships']
# Random preview: no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,starship_id,name,model,manufacturer,cost_in_credits,length,max_atmosphering_speed,crew,passengers,cargo_capacity,consumables,hyperdrive_rating,MGLT,starship_class,pilot_id,film_id,url
18,39,Naboo fighter,N-1 starfighter,Theed Palace Space Vessel Engineering Corps,200000,11.0,1100.0,1,0,65,7 days,1.0,unknown,Starfighter,"(11, 35, 60)","(4, 5)",https://swapi.dev/api/starships/39/
20,41,Scimitar,Star Courier,Republic Sienar Systems,55000000,26.5,1180.0,1,6,2500000,30 days,1.5,unknown,Space Transport,"(44,)","(4,)",https://swapi.dev/api/starships/41/
13,27,Calamari Cruiser,MC80 Liberty type Star Cruiser,Mon Calamari shipyards,104000000,1200.0,,5400,1200,unknown,2 years,1.0,60,Star Cruiser,(),"(3,)",https://swapi.dev/api/starships/27/
17,32,Droid control ship,Lucrehulk-class Droid Control Ship,"Hoersch-Kessel Drive, Inc.",unknown,3170.0,,175,139000,4000000000,500 days,2.0,unknown,Droid control ship,(),"(4, 5, 6)",https://swapi.dev/api/starships/32/
4,10,Millennium Falcon,YT-1300 light freighter,Corellian Engineering Corporation,100000,34.37,1050.0,4,6,100000,2 months,0.5,75,Light freighter,"(13, 14, 25, 31)","(1, 2, 3)",https://swapi.dev/api/starships/10/


In [41]:
# Manual override: set the crew of the row labelled 0 to the integer 165.
# NOTE(review): presumably the raw value of that row cannot be cast to a
# number as it is - confirm against the raw data. The override depends on
# the row order of the frame.
df.loc[0, 'crew'] = 165

In [42]:
# Strip placeholders, thousands separators and the 'km' unit from the numeric
# starship columns (the cast to float happens in the next cell).
for col in ['cost_in_credits', 'length', 'max_atmosphering_speed', 'crew', 'passengers', 'cargo_capacity', 'hyperdrive_rating', 'MGLT']:
    try:
        cleaned = df[col].replace('unknown', np.nan).replace('none', np.nan).replace('n/a', np.nan)
        # Only touch real strings. The `.str` accessor returns NaN for every
        # non-string cell, which would erase numbers already stored in the
        # column (such as a manually assigned crew value) and would fail on a
        # column that has already been converted.
        df[col] = cleaned.map(
            lambda value: value.replace(',', '').replace('km', '')
            if isinstance(value, str) else value
        )
    except KeyError as e:
        print(f'Error in {col}: {e}')

In [43]:
# Cast the cleaned starship columns to float. A column that still holds
# non-numeric text is reported and left unchanged.
for col in ['cost_in_credits', 'length', 'max_atmosphering_speed', 'crew', 'passengers', 'cargo_capacity', 'hyperdrive_rating', 'MGLT']:
    try:
        df[col] = df[col].astype('float')
    # only conversion problems are expected here; anything else (including a
    # keyboard interrupt) must not be swallowed
    except (KeyError, TypeError, ValueError):
        print(f'error with {col}')

## Export clean datasets into csv files

In [None]:
# Write every cleaned (not yet normalized) frame to <data_path>/csv/,
# skipping files that already exist.
data_path = '../data'
os.makedirs(f'{data_path}/csv/', exist_ok=True)  # once, not once per category
for cat in categories:
    filename = f'{cat}_dataframe.csv'
    if os.path.exists(f'{data_path}/csv/{filename}'):
        print(f'File {filename} already exist!')
    else:
        dataframes[cat].to_csv(f'{data_path}/csv/{filename}', index = False)
# These frames are the cleaned ones; the normalized ones are exported later.
print(f'Dataframes of each category are stored in {data_path}/csv/ as csv files!')

Dataframes of each normalized category are stored in ./data/csv/ as csv files!


# Junction tables

(many-to-many relationships in the database)

1. **films_people**: Links films to the characters that appeared in them.

    - character_id: Foreign Key referencing the `people` table.
    - film_id: Foreign Key referencing the films table.

2. **films_planets**: Links films to the planets that appeared in them.

    - planet_id: Foreign Key referencing the `planets` table.
    - film_id: Foreign Key referencing the `films` table.

3. **films_starships**: Links films to the starships that appeared in them.

    - starship_id: Foreign Key referencing the `starships` table.
    - film_id: Foreign Key referencing the `films` table.

4. **films_vehicles**: Links films to the vehicles that appeared in them.

    - vehicle_id: Foreign Key referencing the `vehicles` table.
    - film_id: Foreign Key referencing the `films` table.

5. **films_species**: Links films to the species that appeared in them.

    - species_id: Foreign Key referencing the `species` table.
    - film_id: Foreign Key referencing the `films` table.

6. **people_starships**: Links people (pilots) to the starships they have piloted.

    - character_id: Foreign Key referencing the `people` table.
    - starship_id: Foreign Key referencing the `starships` table.

7. **people_vehicles**: Links people (pilots) to the vehicles they have piloted.

    - character_id: Foreign Key referencing the `people` table.
    - vehicle_id: Foreign Key referencing the vehicles table.

In [45]:
# Names of the many-to-many tables: first the pilot links, then the film links.
junction_tables = [
    'people_vehicles',
    'people_starships',
    'films_people',
    'films_planets',
    'films_starships',
    'films_vehicles',
    'films_species',
]

# One empty slot per junction table, filled in by the cells below.
junction_tables_dict = dict.fromkeys(junction_tables)

## Junction tables for people:

In [46]:
columns = ['vehicle_id', 'starship_id']
data = dataframes['people'].loc[:, [*columns, 'character_id']]

for col in columns:
    # 'vehicle_id' -> 'people_vehicles', 'starship_id' -> 'people_starships'
    table_name = f'people_{col}'.replace('_id', 's')

    # one row per (character, linked id); characters without links drop out
    pairs = data.loc[:, ['character_id', col]]
    pairs = pairs.explode(col).explode('character_id')
    pairs = pairs.dropna().reset_index(drop = True)
    junction_tables_dict[table_name] = pairs.sort_values('character_id')

## Junction tables for films:

In [47]:
columns = ['character_id', 'species_id', 'planet_id', 'vehicle_id', 'starship_id']
data = dataframes['films'].loc[:, [*columns, 'film_id']]

for col in columns:
    # derive the junction table name from the column name
    if col == 'character_id':
        table_name = 'films_people'
    elif col == 'species_id':
        table_name = f'films_{col}'.replace('_id', '')
    else:
        table_name = f'films_{col}'.replace('_id', 's')

    # one row per (film, linked id)
    pairs = data.loc[:, ['film_id', col]].explode(col)
    pairs = pairs.reset_index(drop = True)
    junction_tables_dict[table_name] = pairs.sort_values('film_id')

# Normalization
Next step is to normalize the datasets in order to create the database.

In [48]:
# Independent copies, so dropping columns below leaves `dataframes` untouched.
dataframes_normalized = copy.deepcopy(dict(dataframes))

In [49]:
# Columns removed from each table when normalizing.
columns_to_drop = dict(
    films=['character_id', 'planet_id', 'species_id', 'vehicle_id', 'starship_id'],
    people=['film_id', 'vehicle_id', 'starship_id'],
    planets=['residents_id', 'film_id'],
    species=['character_id', 'film_id'],
    starships=['pilot_id', 'film_id'],
    vehicles=['pilot_id', 'film_id'],
)

### Drop the corresponding columns in order to normalize the tables

In [50]:
# Remove the listed columns from each copied frame, in place.
for cat in dataframes_normalized:
    frame = dataframes_normalized[cat]
    frame.drop(columns_to_drop[cat], axis='columns', inplace = True)

## Store the normalized dataframes

In [None]:
# Write every normalized frame to <data_path>/csv_normalized/, skipping files
# that already exist.
data_path = '../data'
os.makedirs(f'{data_path}/csv_normalized/', exist_ok=True)  # once, not per category
for cat in categories:
    # The existence check must use the very name that is written; checking
    # '<cat>_dataframe.csv' while writing '<cat>_dataframe_normalized.csv'
    # never finds the file and rewrites it on every run.
    filename = f'{cat}_dataframe_normalized.csv'
    if os.path.exists(f'{data_path}/csv_normalized/{filename}'):
        print(f'File {filename} already exist!')
    else:
        dataframes_normalized[cat].to_csv(f'{data_path}/csv_normalized/{filename}', index = False)
print(f'Dataframes of each normalized category are stored in {data_path}/csv_normalized/ as csv files!')

Dataframes of each normalized category are stored in ./data/csv_normalized/ as csv files!


# Insert data into the database

## Load database parameters from `.env` file

In [53]:
load_dotenv()

# Connection settings come from the environment; a missing variable yields None.
DB_USER, DB_PASS, DB_HOST, DB_PORT, DB_NAME = (
    os.environ.get(variable)
    for variable in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
)

## Create the db connection

In [54]:
# User and password are URL-escaped: characters such as '@', ':' or '/' in a
# credential would otherwise be read as URL delimiters.
connection_string = (
    f'mysql+pymysql://{quote_plus(str(DB_USER))}:{quote_plus(str(DB_PASS))}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)

# Create the SQLAlchemy engine
try:
    engine = create_engine(connection_string)
    print("SQLAlchemy Engine created successfully. 🛠️")
except Exception as e:
    print(f"Error creating engine: {e}")
    # Without an engine every later cell fails with a NameError, so stop here
    # with the real cause instead.
    raise

SQLAlchemy Engine created successfully. 🛠️


## Populate the data into the database

The order of tables to be filled must be:
1. planets
2. species
3. vehicles
4. starships
5. films
6. people

In [55]:
# Order in which the category tables are visited by the insertion loop.
categories_sorted = ['planets', 'species', 'vehicles', 'starships', 'films', 'people']

### Insert the category tables into the database

In [56]:
def insert_category(cat, dictionary):
    """Append one DataFrame to the database table of the same name.

    Uses the module-level ``engine`` as connection. A failing insert is
    printed, not raised, so the remaining tables can still be loaded.

    Parameters
    ----------
    cat : str
        Key of ``dictionary`` and name of the target table.
    dictionary : dict
        Maps table names to the DataFrames to insert.

    Returns
    -------
    bool
        ``True`` when the rows were inserted, ``False`` when the insert failed.

    Raises
    ------
    KeyError
        If ``cat`` is not a key of ``dictionary``.
    """
    df = dictionary[cat]
    try:
        df.to_sql(name=cat, con=engine, if_exists='append', index=False)
    except Exception as e:
        print(f"\\ Error inserting DataFrame for category '{cat}': \n{e}\n\n")
        return False
    print(f"DataFrame for category '{cat}' inserted successfully into the database. ✅\n")
    return True

In [57]:
for cat in categories_sorted:
    query = f'select * from {cat} limit 1 ;'
    first_row = pd.read_sql(query, con = engine)
    # a table that already returns a row is left as it is
    if len(first_row) != 0:
        print(f'{cat} table already exists in database!')
        continue
    # empty table: load the normalized frame into it
    insert_category(cat, dataframes_normalized)

DataFrame for category 'planets' inserted successfully into the database. ✅

DataFrame for category 'species' inserted successfully into the database. ✅

DataFrame for category 'vehicles' inserted successfully into the database. ✅

DataFrame for category 'starships' inserted successfully into the database. ✅

DataFrame for category 'films' inserted successfully into the database. ✅

DataFrame for category 'people' inserted successfully into the database. ✅



### Insert junction tables

In [58]:
for table in junction_tables_dict:
    query = f'select * from {table} limit 1 ;'
    first_row = pd.read_sql(query, con = engine)
    # a table that already returns a row is left as it is
    if len(first_row) != 0:
        print(f'{table} table already exists in database!') 
        continue
    # empty table: load the junction frame into it
    insert_category(table, junction_tables_dict)

DataFrame for category 'people_vehicles' inserted successfully into the database. ✅

DataFrame for category 'people_starships' inserted successfully into the database. ✅

DataFrame for category 'films_people' inserted successfully into the database. ✅

DataFrame for category 'films_planets' inserted successfully into the database. ✅

DataFrame for category 'films_starships' inserted successfully into the database. ✅

DataFrame for category 'films_vehicles' inserted successfully into the database. ✅

DataFrame for category 'films_species' inserted successfully into the database. ✅



In [59]:
# Printed unconditionally: insert_category only prints insertion errors, so
# this line is reached even when an insert failed.
print('All the process finished successfully!!!')

All the process finished successfully!!!
