# Imports

In [1]:
import requests as rq
import pandas as pd
import os
import numpy as np
import json
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [2]:
def show_cols(dict_):
    for cat in dict_.keys():
        print(f'\n\nCategory: {cat}')
        print(*[ i for i in dict_[cat].columns], sep = '\n')

## Definitions

In [3]:
base_urls = {
    "films": "https://swapi.dev/api/films/",
    "people": "https://swapi.dev/api/people/",
    "planets": "https://swapi.dev/api/planets/",
    "species": "https://swapi.dev/api/species/",
    "starships": "https://swapi.dev/api/starships/",
    "vehicles": "https://swapi.dev/api/vehicles/"
}

In [4]:
categories = list(base_urls.keys())
categories

['films', 'people', 'planets', 'species', 'starships', 'vehicles']

In [5]:
fields = {
    'people' : ["homeworld", "films", "species", "vehicles", "starships"],
    'planets' : ['residents', 'films'],
    'films' : ["characters", "planets", "starships", "vehicles", "species"],
    'species' : ['people', 'films', 'homeworld'],
    'vehicles' : ['pilots', 'films'],
    'starships' : ['pilots', 'films']
    }

# Consume the API

In [6]:
def get_page_items(url, fields):
        
    # get the content of the url
    response = rq.get(url)

    # success
    if response.status_code == 200:
        content = response.json()
    elif response.status_code == 404:
        print(f'{url} not found!')
        return
    
    items_list = []

    next = content['next']
    items = content['results']

    for item in items:

        for field in fields:
            id_values = []
                           
            if item[field]:  # if the field is not empty
                # parse the links from starships, vehicles and starships
                if field != 'homeworld':  
                    for link in item[field]:
                        # parse the id value in the link    
                        id_values.append(int(link.split('/')[-2]))
                    # add the id values into the corresponding field key
                    # convert list into tuple, as tuples are hashable
                    # each character belongs to only 1 species
                    if field != 'species':
                        item[field] = tuple(id_values)
                    else:
                        item[field] = id_values[0]
                        
                # parse the homeworld (just a single string value)
                else:
                    # get the homeworld id
                    item['homeworld'] = int(item['homeworld'].split('/')[-2])
            
            # parse species field
            # in case of human characters, the species field is an empty list
            elif field == 'species' and not item[field]:
                item[field] = 1
            
            # field has no values (empty list)
            else:
                item[field] = ()
                  
        # remove created and edited fields
        try:
            del(item['created'])
            del(item['edited'])
        except:
            pass
        
        items_list.append(item)

    return next, items_list

Scrape all the information from the Star Wars API, for all the available categories

In [7]:
filepath = './data/starwars.json'
if not os.path.exists(filepath):
    items = dict.fromkeys(categories)

    for category in categories:
        items_list = []
        url = base_urls[category]
        category_fields = fields[category]
        while url:
            url, page_items = get_page_items(url, category_fields)
            items_list.extend(page_items)
        
        items[category] = items_list

        print(f'{category} successfully scrapped!')

    print('\n\nWhole database fully scrapped!')
    print('\nNow the information will be stored in a json file...')

    # store the information in a json file
    if not os.path.exists(filepath):
        with open(filepath, 'w') as file:
            json.dump(items, file, indent=4)

        # remove the carriage return character
        with open(filepath, 'r') as file:
            content = file.readlines()

        # replace the \\r\\n (the codes are escaped) string with just \\n
            for index, line in enumerate(content):
                content[index] = line.replace('\\r\\n', '\\n')

        # after replacement, store its content
        with open(filepath, 'w') as file:
            file.writelines(content)
        
        print(f'Scrapped content stored at: {filepath}')
# The file already exists and will be read
else:
    print('starwars.json file already exists!')
    print('Information will be read and stored in items dictionary.')
    items = {}
    with open(filepath, 'r') as file:
        items = json.load(file)
    print(f'Scrapped content will be stored at {filepath}')

starwars.json file already exists!
Information will be read and stored in items dictionary.
Scrapped content will be stored at ./data/starwars.json


# Store information
Create a dictionary to store the dataframes from each category

In [8]:
categories_dataframes = dict.fromkeys(categories)

### Generate the dataframes

In [9]:
for cat in categories:
    df = pd.DataFrame(items[cat])
    df['id'] = df.index + 1

    # rename columns to add '_id' to the "fields"
    rename_dict = {field : f'{field}_id' for field in fields[cat]}
    rename_dict.update({'id' : f'{cat}_id'})
    df.rename(columns = rename_dict, inplace = True)

    # reorder the columns to place id in first place
    all_columns_but_cat_id = [col for col in df.columns if col != f'{cat}_id']
    sorted_columns = [f'{cat}_id'] + all_columns_but_cat_id
    categories_dataframes[cat] = df[sorted_columns]
#df.to_csv('./data/starwars_characters.csv', index = False)

Make some renamings to the column names

In [10]:
col_rename_dict= {
    
    'films': {
        'characters_id': 'character_id',
        'films_id': 'film_id',
        'planets_id': 'planet_id',
        'episode_id': 'episode',
        'starships_id': 'starship_id',
        'vehicles_id': 'vehicle_id',
    },
    
    'people': {
        'people_id': 'character_id',
        'films_id': 'film_id',
        'vehicles_id': 'vehicle_id',
        'starships_id': 'starship_id'
    },

    'planets': {
        'planets_id': 'planet_id',
        'films_id': 'film_id'
    },

    'species': {
        'people_id': 'character_id',
        'films_id': 'film_id'
    },

    'vehicles': {
        'pilots_id': 'pilot_id',
        'vehicles_id': 'vehicle_id',
        'films_id': 'film_id'
    },

    'starships': {
        'pilots_id': 'pilot_id',
        'starships_id': 'starship_id',
        'films_id': 'film_id'
    }
}

In [11]:
for cat in categories:
    categories_dataframes[cat].rename(columns=col_rename_dict[cat], inplace=True)

## Clean the datasets

### Clean people

In [12]:
df = categories_dataframes['people']
df.sample(5)

Unnamed: 0,character_id,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld_id,film_id,species_id,vehicle_id,starship_id,url
7,8,R5-D4,97,32,,"white, red",red,unknown,,1,[1],2,[],[],https://swapi.dev/api/people/8/
20,21,Boba Fett,183,78.2,black,fair,brown,31.5BBY,male,10,"[2, 3, 5]",1,[],[21],https://swapi.dev/api/people/22/
14,15,Greedo,173,74,,green,black,44BBY,male,23,[1],4,[],[],https://swapi.dev/api/people/15/
79,80,Raymus Antilles,188,79,brown,light,brown,unknown,male,2,"[1, 6]",1,[],[],https://swapi.dev/api/people/81/
72,73,Jocasta Nu,167,unknown,white,fair,blue,unknown,female,9,[5],1,[],[],https://swapi.dev/api/people/74/


In [13]:
df.dtypes

character_id     int64
name            object
height          object
mass            object
hair_color      object
skin_color      object
eye_color       object
birth_year      object
gender          object
homeworld_id     int64
film_id         object
species_id       int64
vehicle_id      object
starship_id     object
url             object
dtype: object

In [14]:
df.mass = df.mass.replace('unknown', np.nan)
df.mass = df.mass.str.replace(',', '', regex=False)
df.mass = df.mass.astype('float') 

In [15]:
df.height = df.height.replace('unknown', np.nan).astype('float')

In [16]:
df.hair_color = df.hair_color.replace('n/a', np.nan)

In [17]:
df.birth_year = df.birth_year.str.replace('BBY', ' BBY')

### Clean films

In [18]:
df = categories_dataframes['films']
df.sample(5)

Unnamed: 0,film_id,title,episode,opening_crawl,director,producer,release_date,character_id,planet_id,starship_id,vehicle_id,species_id,url
2,3,Return of the Jedi,6,Luke Skywalker has returned to\nhis home plane...,Richard Marquand,"Howard G. Kazanjian, George Lucas, Rick McCallum",1983-05-25,"[1, 2, 3, 4, 5, 10, 13, 14, 16, 18, 20, 21, 22...","[1, 5, 7, 8, 9]","[2, 3, 10, 11, 12, 15, 17, 22, 23, 27, 28, 29]","[8, 16, 18, 19, 24, 25, 26, 30]",1,https://swapi.dev/api/films/3/
1,2,The Empire Strikes Back,5,It is a dark time for the\nRebellion. Although...,Irvin Kershner,"Gary Kurtz, Rick McCallum",1980-05-17,"[1, 2, 3, 4, 5, 10, 13, 14, 18, 20, 21, 22, 23...","[4, 5, 6, 27]","[3, 10, 11, 12, 15, 17, 21, 22, 23]","[8, 14, 16, 18, 19, 20]",1,https://swapi.dev/api/films/2/
0,1,A New Hope,4,It is a period of civil war.\nRebel spaceships...,George Lucas,"Gary Kurtz, Rick McCallum",1977-05-25,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15...","[1, 2, 3]","[2, 3, 5, 9, 10, 11, 12, 13]","[4, 6, 7, 8]",1,https://swapi.dev/api/films/1/
4,5,Attack of the Clones,2,There is unrest in the Galactic\nSenate. Sever...,George Lucas,Rick McCallum,2002-05-16,"[2, 3, 6, 7, 10, 11, 20, 21, 22, 33, 35, 36, 4...","[1, 8, 9, 10, 11]","[21, 32, 39, 43, 47, 48, 49, 52, 58]","[4, 44, 45, 46, 50, 51, 53, 54, 55, 56, 57]",1,https://swapi.dev/api/films/5/
3,4,The Phantom Menace,1,Turmoil has engulfed the\nGalactic Republic. T...,George Lucas,Rick McCallum,1999-05-19,"[2, 3, 10, 11, 16, 20, 21, 32, 33, 34, 35, 36,...","[1, 8, 9]","[31, 32, 39, 40, 41]","[33, 34, 35, 36, 37, 38, 42]",1,https://swapi.dev/api/films/4/


In [19]:
df.dtypes

film_id           int64
title            object
episode           int64
opening_crawl    object
director         object
producer         object
release_date     object
character_id     object
planet_id        object
starship_id      object
vehicle_id       object
species_id        int64
url              object
dtype: object

In [20]:
df.release_date = pd.to_datetime(df.release_date)

### Clean planets

In [21]:
df = categories_dataframes['planets']
df.sample(5)

Unnamed: 0,planet_id,name,rotation_period,orbital_period,diameter,climate,gravity,terrain,surface_water,population,residents_id,film_id,url
10,11,Geonosis,30,256,11370,"temperate, arid",0.9 standard,"rock, desert, mountain, barren",5,100000000000,[63],[5],https://swapi.dev/api/planets/11/
8,9,Coruscant,24,368,12240,temperate,1 standard,"cityscape, mountains",unknown,1000000000000,"[34, 55, 74]","[3, 4, 5, 6]",https://swapi.dev/api/planets/9/
15,16,Mygeeto,12,167,10088,frigid,1 standard,"glaciers, mountains, ice canyons",unknown,19000000,[],[6],https://swapi.dev/api/planets/16/
40,41,Tund,48,1770,12190,unknown,unknown,"barren, ash",unknown,0,[50],[],https://swapi.dev/api/planets/41/
54,55,Ojom,unknown,unknown,unknown,frigid,unknown,"oceans, glaciers",100,500000000,[71],[],https://swapi.dev/api/planets/55/


In [22]:
df.dtypes

planet_id           int64
name               object
rotation_period    object
orbital_period     object
diameter           object
climate            object
gravity            object
terrain            object
surface_water      object
population         object
residents_id       object
film_id            object
url                object
dtype: object

In [23]:
for col in ['rotation_period', 'orbital_period', 'diameter', 'surface_water', 'population']:
    df[col] = df[col].replace('unknown', np.nan).astype('float')

In [24]:
df.population = df.population / 1E6
df.rename(columns = {'population' : 'population_millions'}, inplace = True)

In [25]:
df.gravity.unique()

array(['1 standard', '1.1 standard', 'N/A',
       '1.5 (surface), 1 standard (Cloud City)', '0.85 standard',
       '0.9 standard', '0.56 standard', '0.75 standard', 'unknown',
       '0.62 standard', '1', '1.56', '0.9', '0.98'], dtype=object)

In [26]:
df.gravity = df.gravity.str.replace(' standard', '').str.replace(df.gravity[5], '1.5')
df.gravity = df.gravity.replace('unknown', np.nan)

## Clean species

In [27]:
df = categories_dataframes['species']
df.sample(5)

Unnamed: 0,species_id,name,classification,designation,average_height,skin_colors,hair_colors,eye_colors,average_lifespan,homeworld_id,language,character_id,film_id,url
13,14,Dug,mammal,sentient,100,"brown, purple, grey, red",none,"yellow, blue",unknown,35,Dugese,[41],[4],https://swapi.dev/api/species/14/
31,32,Kaminoan,amphibian,sentient,220,"grey, blue",none,black,80,10,Kaminoan,"[72, 73]",[5],https://swapi.dev/api/species/32/
19,20,Cerean,mammal,sentient,200,pale pink,"red, blond, black, white",hazel,unknown,43,Cerean,[52],"[4, 6]",https://swapi.dev/api/species/20/
33,34,Muun,mammal,sentient,190,"grey, white",none,black,100,57,Muun,[77],"[5, 6]",https://swapi.dev/api/species/34/
10,11,Neimodian,unknown,sentient,180,"grey, green",none,"red, pink",unknown,18,Neimoidia,[33],[4],https://swapi.dev/api/species/11/


In [28]:
df.dtypes

species_id           int64
name                object
classification      object
designation         object
average_height      object
skin_colors         object
hair_colors         object
eye_colors          object
average_lifespan    object
homeworld_id        object
language            object
character_id        object
film_id             object
url                 object
dtype: object

In [29]:
df.average_height = df.average_height.replace('unknown', np.nan).replace('n/a', np.nan).astype('float')

In [30]:
df.average_lifespan = df.average_lifespan.replace('unknown', np.nan).replace('indefinite', 9999).astype('float')

In [31]:
df.loc[1, 'homeworld_id'] = np.nan
df.homeworld_id = df.homeworld_id.astype('float')

## Clean vehicles

In [32]:
df = categories_dataframes['vehicles']
df.sample(5)

Unnamed: 0,vehicle_id,name,model,manufacturer,cost_in_credits,length,max_atmosphering_speed,crew,passengers,cargo_capacity,consumables,vehicle_class,pilot_id,film_id,url
36,37,Corporate Alliance tank droid,NR-N99 Persuader-class droid enforcer,Techno Union,49000,10.96,100,0,4,none,none,droid tank,[],[6],https://swapi.dev/api/vehicles/72/
9,10,Sail barge,Modified Luxury Sail Barge,Ubrikkian Industries Custom Vehicle Division,285000,30.0,100,26,500,2000000,Live food tanks,sail barge,[],[3],https://swapi.dev/api/vehicles/24/
34,35,Raddaugh Gnasp fluttercraft,Raddaugh Gnasp fluttercraft,Appazanna Engineering Works,14750,7.0,310,2,0,20,none,air speeder,[],[6],https://swapi.dev/api/vehicles/70/
19,20,Sith speeder,FC-20 speeder bike,Razalon,4000,1.5,180,1,0,2,unknown,speeder,[44],[4],https://swapi.dev/api/vehicles/42/
6,7,AT-AT,All Terrain Armored Transport,"Kuat Drive Yards, Imperial Department of Milit...",unknown,20.0,60,5,40,1000,unknown,assault walker,[],"[2, 3]",https://swapi.dev/api/vehicles/18/


In [33]:
df.cost_in_credits = df.cost_in_credits.replace('unknown', np.nan)
df.cost_in_credits = df.cost_in_credits.astype('float')

In [34]:
for col in ['max_atmosphering_speed' ,'crew', 'passengers', 'cargo_capacity']:
    df[col] = df[col].replace('unknown', np.nan).replace('none', np.nan)
    df[col] = df[col].astype('float')

In [35]:
df.length = df.length.replace('unknown', np.nan)
df.length = df.length.astype('float')

In [36]:
df.consumables = df.consumables.replace('0', 'none')

## Clean starships

In [37]:
df = categories_dataframes['starships']
df.sample(5)

Unnamed: 0,starship_id,name,model,manufacturer,cost_in_credits,length,max_atmosphering_speed,crew,passengers,cargo_capacity,consumables,hyperdrive_rating,MGLT,starship_class,pilot_id,film_id,url
12,13,EF76 Nebulon-B escort frigate,EF76 Nebulon-B escort frigate,Kuat Drive Yards,8500000,300.0,800,854,75,6000000,2 years,2.0,40,Escort ship,[],"[2, 3]",https://swapi.dev/api/starships/23/
25,26,Republic Assault ship,Acclamator I-class assault ship,Rothana Heavy Engineering,unknown,752.0,unknown,700,16000,11250000,2 years,0.6,unknown,assault ship,[],[5],https://swapi.dev/api/starships/52/
20,21,Scimitar,Star Courier,Republic Sienar Systems,55000000,26.5,1180,1,6,2500000,30 days,1.5,unknown,Space Transport,[44],[4],https://swapi.dev/api/starships/41/
2,3,Sentinel-class landing craft,Sentinel-class landing craft,"Sienar Fleet Systems, Cyngus Spaceworks",240000,38.0,1000,5,75,180000,1 month,1.0,70,landing craft,[],[1],https://swapi.dev/api/starships/5/
17,18,Droid control ship,Lucrehulk-class Droid Control Ship,"Hoersch-Kessel Drive, Inc.",unknown,3170.0,,175,139000,4000000000,500 days,2.0,unknown,Droid control ship,[],"[4, 5, 6]",https://swapi.dev/api/starships/32/


In [38]:
df.loc[0, 'crew'] = 165

In [39]:
for col in ['cost_in_credits', 'length', 'max_atmosphering_speed', 'crew', 'passengers', 'cargo_capacity', 'hyperdrive_rating', 'MGLT']:
    try:
        df[col] = df[col].replace('unknown', np.nan).replace('none', np.nan).replace('n/a', np.nan)
        df[col] = df[col].str.replace(',', '', regex = False).str.replace('km', '')
        #df[col] = df[col].astype('float')
    except Exception as e:
        print(f'Error in {col}: {e}')

In [40]:
for col in ['cost_in_credits', 'length', 'max_atmosphering_speed', 'crew', 'passengers', 'cargo_capacity', 'hyperdrive_rating', 'MGLT']:
    try:
        df[col] = df[col].astype('float')
    except:
        print(f'error with {col}')

In [41]:
data_path = './data'
for cat in categories:
    filename = f'{cat}_dataframe.csv'
    if os.path.exists(f'{data_path}/csv/{filename}'):
        print(f'File {filename} already exist!')
        pass
    else:
        os.makedirs(f'{data_path}/csv/', exist_ok=True)
        df = categories_dataframes[cat]
        df.to_csv(f'{data_path}/csv/{cat}_dataframe.csv', index = False)
print(f'Dataframes of each normalized category are stored in {data_path}/csv/ as csv files!')

File films_dataframe.csv already exist!
File people_dataframe.csv already exist!
File planets_dataframe.csv already exist!
File species_dataframe.csv already exist!
File starships_dataframe.csv already exist!
File vehicles_dataframe.csv already exist!
Dataframes of each normalized category are stored in ./data/csv/ as csv files!


# Junction tables

(many-to-many relationships in the database)

1. **films_people_junction**: Links films to the characters that appeared in them.

    - character_id: Foreign Key referencing the `people` table.
    - film_id: Foreign Key referencing the films table.

2. **films_planets_junction**: Links films to the planets that appeared in them.

    - planet_id: Foreign Key referencing the `planets` table.
    - film_id: Foreign Key referencing the `films` table.

3. **films_starships_junction**: Links films to the starships that appeared in them.

    - starship_id: Foreign Key referencing the `starships` table.
    - film_id: Foreign Key referencing the `films` table.

4. **films_vehicles_junction**: Links films to the vehicles that appeared in them.

    - vehicle_id: Foreign Key referencing the `vehicles` table.
    - film_id: Foreign Key referencing the `films` table.

5. **films_species_junction**: Links films to the species that appeared in them.

    - species_id: Foreign Key referencing the `species` table.
    - film_id: Foreign Key referencing the `films` table.

6. **people_starships_junction**: Links people (pilots) to the starships they have piloted.

    - character_id: Foreign Key referencing the `people` table.
    - starship_id: Foreign Key referencing the `starships` table.

7. **people_vehicles_junction**: Links people (pilots) to the vehicles they have piloted.

    - character_id: Foreign Key referencing the `people` table.
    - vehicle_id: Foreign Key referencing the vehicles table.

In [42]:
junction_tables = ['people_films', 'people_vehicles', 'people_starships',
                   'films_starships', 'films_vehicles', 'films_species']

junction_tables = [f'{table}_junction' for table in junction_tables]
junction_tables

['people_films_junction',
 'people_vehicles_junction',
 'people_starships_junction',
 'films_starships_junction',
 'films_vehicles_junction',
 'films_species_junction']

In [43]:
junction_tables_dict = {i:None for i in junction_tables}

## Junction tables for people:

In [44]:
data = categories_dataframes['films'].loc[:, ['species_id', 'film_id', 'vehicle_id', 'starship_id']]

# junction table for people and films
junction_tables_dict['films_species_junction'] = data.explode('species_id').drop(['vehicle_id', 'starship_id'], axis = 1).drop_duplicates()

# junction table for people and vehicles
junction_tables_dict['films_vehicles_junction'] = data.explode('vehicle_id').drop(['species_id', 'starship_id'], axis = 1).drop_duplicates()

# junction table for people and starships
junction_tables_dict['films_starships_junction'] = data.explode('starship_id').drop(['species_id', 'vehicle_id'], axis = 1).drop_duplicates()

## Junction tables for films:

In [45]:
data = categories_dataframes['films'].loc[:, ['character_id', 'film_id', 'vehicle_id', 'starship_id']]

# junction table for people and films
junction_tables_dict['people_films_junction'] = data.explode('character_id').drop(['vehicle_id', 'starship_id'], axis = 1).drop_duplicates()

# junction table for people and vehicles
junction_tables_dict['people_vehicles_junction'] = data.explode('vehicle_id').explode('character_id').drop(['film_id', 'starship_id'], axis = 1).drop_duplicates()

# junction table for people and starships
junction_tables_dict['people_starships_junction'] = data.explode('starship_id').explode('character_id').drop(['film_id', 'vehicle_id'], axis = 1).drop_duplicates()

In [46]:
junction_tables_dict['people_starships_junction'].head()

Unnamed: 0,character_id,starship_id
0,1,2
0,2,2
0,3,2
0,4,2
0,5,2


# Normalization
Next step is to normalize the datasets in order to create the database.

In [47]:
categories_dataframes_normalized = categories_dataframes.copy()

In [48]:
columns_to_drop = {
    'films' : ['character_id', 'planet_id', 'species_id', 'vehicle_id', 'starship_id', 'url'],
    'people' : ['film_id', 'species_id', 'vehicle_id', 'starship_id', 'url'],
    'planets' : ['residents_id', 'film_id', 'url'],
    'species' : ['character_id', 'film_id', 'url'],
    'starships' : ['pilot_id', 'film_id', 'url'],
    'vehicles' : ['pilot_id', 'film_id', 'url'],
}  

### Drop the corresponding columns in order to normalize the tables

In [49]:
for cat in categories_dataframes_normalized.keys():
    categories_dataframes_normalized[cat].drop(columns_to_drop[cat], axis='columns', inplace = True)

## Store the normalized dataframes

In [50]:
data_path = './data'
for cat in categories:
    filename = f'{cat}_dataframe.csv'
    if os.path.exists(f'{data_path}/csv_normalized/{filename}'):
        print(f'File {filename} already exist!')
        pass
    else:
        os.makedirs(f'{data_path}/csv_normalized/', exist_ok=True)
        df = categories_dataframes_normalized[cat]
        df.to_csv(f'{data_path}/csv_normalized/{cat}_dataframe_normalized.csv', index = False)
print(f'Dataframes of each normalized category are stored in {data_path}/csv_normalized/ as csv files!')

Dataframes of each normalized category are stored in ./data/csv_normalized/ as csv files!


## Example of joined people and their vehicles

In [51]:
# df2 = pd.merge(people_vehicles_junction, categories_dataframes['people'], on='people_id', how = 'inner')
# df2.rename(columns={'vehicles_id_x' : 'vehicles_id'}, inplace=True)

# df2 = pd.merge(df2, categories_dataframes['vehicles'], on='vehicles_id')
# #df2.drop(['people_id', 'vehicles_id'], axis = 1)
# df2.head()

In [52]:
# categories_dataframes['people']

# Insert data into the database

## Load database parameters from `.env` file

In [53]:
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

## Create the db connection

In [54]:
connection_string = (
    f'mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)

# --- 4. Create the SQLAlchemy Engine ---
try:
    engine = create_engine(connection_string)
    print("SQLAlchemy Engine created successfully. 🛠️")
except Exception as e:
    print(f"Error creating engine: {e}")

SQLAlchemy Engine created successfully. 🛠️


## Populate the data into the database

The order of tables to be filled must be:
1. planets
2. species
3. starships
4. vehicles
5. films
6. people

In [55]:
categories_sorted = ['planets', 'species', 'vehicles', 'starships', 'films', 'people']

In [56]:
for cat in categories_sorted:
    df = categories_dataframes_normalized[cat]
    try:
        df.to_sql(name=cat, con=engine, if_exists='append', index=False)
        print(f"DataFrame for category '{cat}' inserted successfully into the database. ✅\n\n")
    except Exception as e:
        print(f"\\ Error inserting DataFrame for category '{cat}': \n{e}\n\n")

DataFrame for category 'planets' inserted successfully into the database. ✅


DataFrame for category 'species' inserted successfully into the database. ✅


DataFrame for category 'vehicles' inserted successfully into the database. ✅


DataFrame for category 'starships' inserted successfully into the database. ✅


DataFrame for category 'films' inserted successfully into the database. ✅


DataFrame for category 'people' inserted successfully into the database. ✅




In [57]:
# junction_tables_dict['people_vehicles_junction']

In [58]:
for table, df in junction_tables_dict.items():
    
    try:
        df.to_sql(name=table, con=engine, if_exists='append', index=False)
        print(f"DataFrame for category '{table}' inserted successfully into the database. ✅\n\n")
    except Exception as e:
        print(f"\\ Error inserting DataFrame for category '{table}': \n{e}\n\n")

DataFrame for category 'people_films_junction' inserted successfully into the database. ✅


\ Error inserting DataFrame for category 'people_vehicles_junction': 
(pymysql.err.IntegrityError) (1452, 'Cannot add or update a child row: a foreign key constraint fails (`starwars_db`.`people_vehicles_junction`, CONSTRAINT `people_vehicles_junction_ibfk_2` FOREIGN KEY (`vehicle_id`) REFERENCES `vehicles` (`vehicle_id`))')
[SQL: INSERT INTO people_vehicles_junction (character_id, vehicle_id) VALUES (%(character_id)s, %(vehicle_id)s)]
[parameters: [{'character_id': 1, 'vehicle_id': 4}, {'character_id': 2, 'vehicle_id': 4}, {'character_id': 3, 'vehicle_id': 4}, {'character_id': 4, 'vehicle_id': 4}, {'character_id': 5, 'vehicle_id': 4}, {'character_id': 6, 'vehicle_id': 4}, {'character_id': 7, 'vehicle_id': 4}, {'character_id': 8, 'vehicle_id': 4}  ... displaying 10 of 1299 total bound parameter sets ...  {'character_id': 82, 'vehicle_id': 76}, {'character_id': 83, 'vehicle_id': 76}]]
(Background

In [59]:
df = junction_tables_dict['people_vehicles_junction']
df[(df.character_id == 1) & (df.vehicle_id == 8)]

Unnamed: 0,character_id,vehicle_id
0,1,8
