In [1]:
import requests as rq
import pandas as pd
import os
import numpy as np
import json
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [2]:
def show_cols(dict_):
    for cat in dict_.keys():
        print(f'\n\nCategory: {cat}')
        print(*[ i for i in dict_[cat].columns], sep = '\n')

In [3]:
base_urls = {
    "films": "https://swapi.dev/api/films/",
    "people": "https://swapi.dev/api/people/",
    "planets": "https://swapi.dev/api/planets/",
    "species": "https://swapi.dev/api/species/",
    "starships": "https://swapi.dev/api/starships/",
    "vehicles": "https://swapi.dev/api/vehicles/"
}

In [4]:
categories = list(base_urls.keys())
categories

['films', 'people', 'planets', 'species', 'starships', 'vehicles']

In [5]:
fields = {
    'people' : ["homeworld", "films", "species", "vehicles", "starships"],
    'planets' : ['residents', 'films'],
    'films' : ["characters", "planets", "starships", "vehicles", "species"],
    'species' : ['people', 'films'],
    'vehicles' : ['pilots', 'films'],
    'starships' : ['pilots', 'films']
    }

In [6]:
def get_page_items(url, fields):
        
    # get the content of the url
    response = rq.get(url)

    # success
    if response.status_code == 200:
        content = response.json()
    elif response.status_code == 404:
        print(f'{url} not found!')
        return
    
    items_list = []

    next = content['next']
    items = content['results']

    for item in items:

        for field in fields:
            id_values = []
                           
            if item[field]:  # if the field is not empty
                # parse the links from starships, vehicles and starships
                if field != 'homeworld':  
                    for link in item[field]:
                        # parse the id value in the link    
                        id_values.append(int(link.split('/')[-2]))
                    # add the id values into the corresponding field key
                    # convert list into tuple, as tuples are hashable
                    # each character belongs to only 1 species
                    if field != 'species':
                        item[field] = tuple(id_values)
                    else:
                        item[field] = id_values[0]
                        
                # parse the homeworld (just a single string value)
                else:
                    # get the homeworld id
                    item['homeworld'] = int(item['homeworld'].split('/')[-2])
            
            # parse species field
            # in case of human characters, the species field is an empty list
            elif field == 'species' and not item[field]:
                item[field] = 1
            
            # field has no values (empty list)
            else:
                item[field] = ()
                  
        # remove created and edited fields
        try:
            del(item['created'])
            del(item['edited'])
        except:
            pass
        
        items_list.append(item)

    return next, items_list

Scrape all the information from the Star Wars API, for all the available categories

In [7]:
filepath = './data/starwars.json'
if not os.path.exists(filepath):
    items = dict.fromkeys(categories)

    for category in categories:
        items_list = []
        url = base_urls[category]
        category_fields = fields[category]
        while url:
            url, page_items = get_page_items(url, category_fields)
            items_list.extend(page_items)
        
        items[category] = items_list

        print(f'{category} successfully scrapped!')

    print('\n\nWhole database fully scrapped!')
    print('\nNow the information will be stored in a json file...')

    # store the information in a json file
    if not os.path.exists(filepath):
        with open(filepath, 'w') as file:
            json.dump(items, file, indent=4)

        # remove the carriage return character
        with open(filepath, 'r') as file:
            content = file.readlines()

        # replace the \\r\\n (the codes are escaped) string with just \\n
            for index, line in enumerate(content):
                content[index] = line.replace('\\r\\n', '\\n')

        # after replacement, store its content
        with open(filepath, 'w') as file:
            file.writelines(content)
        
        print(f'Scrapped content stored at: {filepath}')
# The file already exists and will be read
else:
    print('starwars.json file already exists!')
    print('Information will be read and stored in items dictionary.')
    items = {}
    with open(filepath, 'r') as file:
        items = json.load(file)
    print(f'Scrapped content will be stored at {filepath}')

starwars.json file already exists!
Information will be read and stored in items dictionary.
Scrapped content will be stored at ./data/starwars.json


## Store the dataframes from each category in a dictionary

In [8]:
categories_dataframes = dict.fromkeys(categories)

### Generate the dataframes

In [9]:
for cat in categories:
    df = pd.DataFrame(items[cat])
    df['id'] = df.index + 1

    # rename columns to add '_id' to the "fields"
    rename_dict = {field : f'{field}_id' for field in fields[cat]}
    rename_dict.update({'id' : f'{cat}_id'})
    df.rename(columns = rename_dict, inplace = True)

    # reorder the columns to place id in first place
    all_columns_but_cat_id = [col for col in df.columns if col != f'{cat}_id']
    sorted_columns = [f'{cat}_id'] + all_columns_but_cat_id
    categories_dataframes[cat] = df[sorted_columns]
#df.to_csv('./data/starwars_characters.csv', index = False)

Make some renamings to the column names

In [10]:
col_rename_dict= {
    
    'films': {
        'characters_id': 'character_id',
        'films_id': 'film_id',
        'planets_id': 'planet_id',
        'episode_id': 'episode',
        'starships_id': 'starship_id',
        'vehicles_id': 'vehicle_id',
    },
    
    'people': {
        'people_id': 'character_id',
        'films_id': 'film_id',
        'vehicles_id': 'vehicle_id',
        'starships_id': 'starship_id'
    },

    'planets': {
        'planets_id': 'planet_id',
        'films_id': 'film_id'
    },

    'species': {
        'people_id': 'character_id',
        'films_id': 'film_id'
    },

    'vehicles': {
        'pilots_id': 'pilot_id',
        'vehicles_id': 'vehicle_id',
        'films_id': 'film_id'
    },

    'starships': {
        'pilots_id': 'pilot_id',
        'starships_id': 'starship_id',
        'films_id': 'film_id'
    }
}

In [11]:
for cat in categories:
    categories_dataframes[cat].rename(columns=col_rename_dict[cat], inplace=True)

## Junction tables

(many-to-many relationships in the database)

1. **films_people_junction**: Links films to the characters that appeared in them.

    - character_id: Foreign Key referencing the `people` table.
    - film_id: Foreign Key referencing the films table.

2. **films_planets_junction**: Links films to the planets that appeared in them.

    - planet_id: Foreign Key referencing the `planets` table.
    - film_id: Foreign Key referencing the `films` table.

3. **films_starships_junction**: Links films to the starships that appeared in them.

    - starship_id: Foreign Key referencing the `starships` table.
    - film_id: Foreign Key referencing the `films` table.

4. **films_vehicles_junction**: Links films to the vehicles that appeared in them.

    - vehicle_id: Foreign Key referencing the `vehicles` table.
    - film_id: Foreign Key referencing the `films` table.

5. **films_species_junction**: Links films to the species that appeared in them.

    - species_id: Foreign Key referencing the `species` table.
    - film_id: Foreign Key referencing the `films` table.

6. **people_starships_junction**: Links people (pilots) to the starships they have piloted.

    - character_id: Foreign Key referencing the `people` table.
    - starship_id: Foreign Key referencing the `starships` table.

7. **people_vehicles_junction**: Links people (pilots) to the vehicles they have piloted.

    - character_id: Foreign Key referencing the `people` table.
    - vehicle_id: Foreign Key referencing the vehicles table.

In [12]:
junction_tables = ['people_films', 'people_vehicles', 'people_starships',
                   'films_starships', 'films_vehicles', 'films_species']

junction_tables = [f'{table}_junction_table' for table in junction_tables]
junction_tables

['people_films_junction_table',
 'people_vehicles_junction_table',
 'people_starships_junction_table',
 'films_starships_junction_table',
 'films_vehicles_junction_table',
 'films_species_junction_table']

Junction tables for people:

In [13]:
data = categories_dataframes['films'].loc[:, ['species_id', 'film_id', 'vehicle_id', 'starship_id']]

# junction table for people and films
film_species_junction = data.explode('species_id').drop(['vehicle_id', 'starship_id'], axis = 1)

# junction table for people and vehicles
film_vehicles_junction = data.explode('vehicle_id').drop(['species_id', 'starship_id'], axis = 1)

# junction table for people and starships
film_starships_junction = data.explode('starship_id').drop(['species_id', 'vehicle_id'], axis = 1)

Junction tables for films:

In [14]:
data = categories_dataframes['films'].loc[:, ['character_id', 'film_id', 'vehicle_id', 'starship_id']]

# junction table for people and films
people_film_junction = data.explode('film_id').drop(['vehicle_id', 'starship_id'], axis = 1)

# junction table for people and vehicles
people_vehicles_junction = data.explode('vehicle_id').drop(['film_id', 'starship_id'], axis = 1)

# junction table for people and starships
people_starships_junction = data.explode('starship_id').drop(['film_id', 'vehicle_id'], axis = 1)

## Normalization
Next step is to normalize the datasets in order to create the database.

In [15]:
categories_dataframes_normalized = categories_dataframes.copy()

In [16]:
columns_to_drop = {
    'films' : ['character_id', 'planet_id', 'species_id', 'vehicle_id', 'starship_id', 'url'],
    'people' : ['film_id', 'species_id', 'vehicle_id', 'starship_id', 'url'],
    'planets' : ['residents_id', 'film_id', 'url'],
    'species' : ['character_id', 'film_id', 'url'],
    'starships' : ['pilot_id', 'film_id', 'url'],
    'vehicles' : ['pilot_id', 'film_id', 'url'],
}  

### Drop the corresponding columns in order to normalize the tables

In [17]:
for cat in categories_dataframes_normalized.keys():
    categories_dataframes_normalized[cat].drop(columns_to_drop[cat], axis='columns', inplace = True)

### Store the normalized dataframes

In [18]:
data_path = './data'
for cat in categories:
    filename = f'{cat}_dataframe.csv'
    if os.path.exists(f'{data_path}/csv_normalized/{filename}'):
        print(f'File {filename} already exist!')
        pass
    else:
        os.makedirs(f'{data_path}/csv_normalized/', exist_ok=True)
        df = categories_dataframes_normalized[cat]
        df.to_csv(f'{data_path}/csv_normalized/{cat}_dataframe_normalized.csv', index = False)
print(f'Dataframes of each normalized category are stored in {data_path}/csv_normalized/ as csv files!')

Dataframes of each normalized category are stored in ./data/csv_normalized/ as csv files!


## Example of joined people and their vehicles

In [19]:
# df2 = pd.merge(people_vehicles_junction, categories_dataframes['people'], on='people_id', how = 'inner')
# df2.rename(columns={'vehicles_id_x' : 'vehicles_id'}, inplace=True)

# df2 = pd.merge(df2, categories_dataframes['vehicles'], on='vehicles_id')
# #df2.drop(['people_id', 'vehicles_id'], axis = 1)
# df2.head()

In [20]:
# categories_dataframes['people']

# Insert data into the database

In [21]:
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

In [22]:
connection_string = (
    f'mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)

# --- 4. Create the SQLAlchemy Engine ---
try:
    engine = create_engine(connection_string)
    print("SQLAlchemy Engine created successfully. üõ†Ô∏è")
except Exception as e:
    print(f"Error creating engine: {e}")

SQLAlchemy Engine created successfully. üõ†Ô∏è


Populate the data into the database
The order of tables to be filled must be:
1. planets
2. species
3. starships
4. vehicles
5. films
6. people

In [23]:
categories_sorted = ['planets', 'species', 'vehicles', 'starships', 'films', 'people']

In [25]:
categories_dataframes_normalized['species'].head()

Unnamed: 0,species_id,name,classification,designation,average_height,skin_colors,hair_colors,eye_colors,average_lifespan,homeworld,language
0,1,Human,mammal,sentient,180.0,"caucasian, black, asian, hispanic","blonde, brown, black, red","brown, blue, green, hazel, grey, amber",120,https://swapi.dev/api/planets/9/,Galactic Basic
1,2,Droid,artificial,sentient,,,,,indefinite,,
2,3,Wookie,mammal,sentient,210.0,gray,"black, brown","blue, green, yellow, brown, golden, red",400,https://swapi.dev/api/planets/14/,Shyriiwook
3,4,Rodian,sentient,reptilian,170.0,"green, blue",,black,unknown,https://swapi.dev/api/planets/23/,Galatic Basic
4,5,Hutt,gastropod,sentient,300.0,"green, brown, tan",,"yellow, red",1000,https://swapi.dev/api/planets/24/,Huttese


In [24]:
for cat in categories_sorted:
    df = categories_dataframes_normalized[cat]
    try:
        df.to_sql(name=cat, con=engine, if_exists='append', index=False)
        print(f"DataFrame for category '{cat}' inserted successfully into the database. ‚úÖ")
    except Exception as e:
        print(f"Error inserting DataFrame for category '{cat}': {e}")

DataFrame for category 'planets' inserted successfully into the database. ‚úÖ
Error inserting DataFrame for category 'species': (pymysql.err.OperationalError) (1054, "Unknown column 'homeworld' in 'field list'")
[SQL: INSERT INTO species (species_id, name, classification, designation, average_height, skin_colors, hair_colors, eye_colors, average_lifespan, homeworld, language) VALUES (%(species_id)s, %(name)s, %(classification)s, %(designation)s, %(average_height)s, %(skin_colors)s, %(hair_colors)s, %(eye_colors)s, %(average_lifespan)s, %(homeworld)s, %(language)s)]
[parameters: [{'species_id': 1, 'name': 'Human', 'classification': 'mammal', 'designation': 'sentient', 'average_height': '180', 'skin_colors': 'caucasian, black, asian, hispanic', 'hair_colors': 'blonde, brown, black, red', 'eye_colors': 'brown, blue, green, hazel, grey, amber', 'average_lifespan': '120', 'homeworld': 'https://swapi.dev/api/planets/9/', 'language': 'Galactic Basic'}, {'species_id': 2, 'name': 'Droid', 'clas

In [None]:
people.to_sql(name='people', con=engine, if_exists='append', index=False)

IntegrityError: (pymysql.err.IntegrityError) (1452, 'Cannot add or update a child row: a foreign key constraint fails (`starwars_db`.`people`, CONSTRAINT `people_ibfk_1` FOREIGN KEY (`homeworld_id`) REFERENCES `planets` (`planet_id`))')
[SQL: INSERT INTO people (people_id, name, height, mass, hair_color, skin_color, eye_color, birth_year, gender, homeworld_id, species_id) VALUES (%(people_id)s, %(name)s, %(height)s, %(mass)s, %(hair_color)s, %(skin_color)s, %(eye_color)s, %(birth_year)s, %(gender)s, %(homeworld_id)s, %(species_id)s)]
[parameters: [{'people_id': 1, 'name': 'Luke Skywalker', 'height': '172', 'mass': '77', 'hair_color': 'blond', 'skin_color': 'fair', 'eye_color': 'blue', 'birth_year': '19BBY', 'gender': 'male', 'homeworld_id': 1, 'species_id': 1}, {'people_id': 2, 'name': 'C-3PO', 'height': '167', 'mass': '75', 'hair_color': 'n/a', 'skin_color': 'gold', 'eye_color': 'yellow', 'birth_year': '112BBY', 'gender': 'n/a', 'homeworld_id': 1, 'species_id': 2}, {'people_id': 3, 'name': 'R2-D2', 'height': '96', 'mass': '32', 'hair_color': 'n/a', 'skin_color': 'white, blue', 'eye_color': 'red', 'birth_year': '33BBY', 'gender': 'n/a', 'homeworld_id': 8, 'species_id': 2}, {'people_id': 4, 'name': 'Darth Vader', 'height': '202', 'mass': '136', 'hair_color': 'none', 'skin_color': 'white', 'eye_color': 'yellow', 'birth_year': '41.9BBY', 'gender': 'male', 'homeworld_id': 1, 'species_id': 1}, {'people_id': 5, 'name': 'Leia Organa', 'height': '150', 'mass': '49', 'hair_color': 'brown', 'skin_color': 'light', 'eye_color': 'brown', 'birth_year': '19BBY', 'gender': 'female', 'homeworld_id': 2, 'species_id': 1}, {'people_id': 6, 'name': 'Owen Lars', 'height': '178', 'mass': '120', 'hair_color': 'brown, grey', 'skin_color': 'light', 'eye_color': 'blue', 'birth_year': '52BBY', 'gender': 'male', 'homeworld_id': 1, 'species_id': 1}, {'people_id': 7, 'name': 'Beru Whitesun lars', 'height': '165', 'mass': '75', 'hair_color': 'brown', 'skin_color': 'light', 'eye_color': 'blue', 'birth_year': '47BBY', 'gender': 'female', 'homeworld_id': 1, 'species_id': 1}, {'people_id': 8, 'name': 'R5-D4', 'height': '97', 'mass': '32', 'hair_color': 'n/a', 'skin_color': 'white, red', 'eye_color': 'red', 'birth_year': 'unknown', 'gender': 'n/a', 'homeworld_id': 1, 'species_id': 2}  ... displaying 10 of 82 total bound parameter sets ...  {'people_id': 81, 'name': 'Sly Moore', 'height': '178', 'mass': '48', 'hair_color': 'none', 'skin_color': 'pale', 'eye_color': 'white', 'birth_year': 'unknown', 'gender': 'female', 'homeworld_id': 60, 'species_id': 1}, {'people_id': 82, 'name': 'Tion Medon', 'height': '206', 'mass': '80', 'hair_color': 'none', 'skin_color': 'grey', 'eye_color': 'black', 'birth_year': 'unknown', 'gender': 'male', 'homeworld_id': 12, 'species_id': 37}]]
(Background on this error at: https://sqlalche.me/e/20/gkpj)