In [24]:
import requests as rq
import pandas as pd
import os
import numpy as np
import json

In [7]:
# Map every resource name to the URL of its first listing page.
base_urls = {
    resource: f"https://swapi.dev/api/{resource}/"
    for resource in ("films", "people", "planets", "species", "starships", "vehicles")
}

In [48]:
# Resource names, in the order they are declared in base_urls.
categories = [*base_urls]
categories

['films', 'people', 'planets', 'species', 'starships', 'vehicles']

In [8]:
# For each category, the record fields whose values are links to other resources.
fields = dict(
    people=["homeworld", "films", "species", "vehicles", "starships"],
    planets=["residents", "films"],
    films=["characters", "planets", "starships", "vehicles", "species"],
    species=["people", "films"],
    vehicles=["pilots", "films"],
    starships=["pilots", "films"],
)

In [66]:
def _id_from_url(link):
    """Return the trailing numeric id of a resource URL (with or without a final slash)."""
    return int(link.rstrip('/').rsplit('/', 1)[-1])


def get_page_items(url, fields, timeout=30):
    """Download one listing page and replace its link fields with numeric ids.

    Parameters
    ----------
    url : str
        Address of the page to download.
    fields : list of str
        Names of the record fields that hold links to other resources.
        'homeworld' holds a single link; every other field holds a list of links.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(next_url, items_list)``: the value of the page's 'next' key and the
        list of processed records. When the server answers 404 the result is
        ``(None, [])`` so that callers unpacking the tuple keep working.

    Raises
    ------
    requests.HTTPError
        For any other 4xx/5xx status code.

    Notes
    -----
    The records are modified in place: link fields become tuples of ids
    (a plain int for 'homeworld'), and 'created' / 'edited' are removed.
    """
    response = rq.get(url, timeout=timeout)

    if response.status_code == 404:
        print(f'{url} not found!')
        return None, []

    # Any other 4xx/5xx status is reported explicitly instead of failing later
    # on an undefined variable.
    response.raise_for_status()
    content = response.json()

    next_url = content['next']
    items_list = []

    for item in content['results']:
        for field in fields:
            value = item[field]

            if value:
                if field == 'homeworld':
                    # single link -> single id
                    item[field] = _id_from_url(value)
                else:
                    # list of links -> tuple of ids (tuples are hashable)
                    item[field] = tuple(_id_from_url(link) for link in value)
            elif field == 'species':
                # an empty species list is treated as species 1 (human characters);
                # stored as a one-element tuple, like every other link field
                item[field] = (1,)
            else:
                # field has no values
                item[field] = ()

        # drop the bookkeeping fields independently, whether or not both are present
        item.pop('created', None)
        item.pop('edited', None)

        items_list.append(item)

    return next_url, items_list

Scrape all the information from the Star Wars API, for every available category.

In [74]:
# Scrape every category page by page, unless a cached copy already exists on disk.
if not os.path.exists('./data/starwars.json'):
    items = dict.fromkeys(categories)

    for category in categories:
        items_list = []
        url = base_urls[category]
        category_fields = fields[category]
        while url:
            page = get_page_items(url, category_fields)
            # a None result means no page was returned; stop this category
            # instead of failing on the tuple unpacking below
            if page is None:
                break
            url, page_items = page
            items_list.extend(page_items)

        items[category] = items_list

        print(f'{category} successfully scrapped!')

    print('\n\nWhole database fully scrapped!')

else:
    print('starwars.json file already exists!')
    # Load the cached copy so that `items` is defined on a fresh kernel as well.
    # Values come back as JSON types (tuples are read back as lists).
    with open('./data/starwars.json', 'r') as file:
        items = json.load(file)

starwars.json file already exists!


## The scraped information is stored in the dictionary **items**

Store the information in a JSON file

In [73]:
# Write the scraped data to disk once; an existing file is never overwritten.
if not os.path.exists('./data/starwars.json'):
    # make sure the target folder exists before opening the file for writing
    os.makedirs('./data', exist_ok=True)

    # Serialise in memory first, so the file is only created when the dump succeeds,
    # and replace the \\r\\n (the codes are escaped) string with just \\n in the same pass
    serialized = json.dumps(items, indent=4).replace('\\r\\n', '\\n')

    with open('./data/starwars.json', 'w') as file:
        file.write(serialized)
else:
    print('starwars.json file already exists!')

starwars.json file already exists!


In [31]:
# Read the stored data back from the same path the notebook writes it to.
content = {}
with open('./data/starwars.json', 'r') as file:
    content = json.load(file)

## Store the dataframes from each category in a dictionary

In [75]:
# One empty slot per category; each slot is meant to hold that category's dataframe.
categories_dataframes = {category: None for category in categories}

### Generate the dataframes

In [76]:
# Build one dataframe per category and give it an id column named '<category>_id'.
for cat in categories:
    df = pd.DataFrame(items[cat])

    if 'url' in df.columns:
        # NOTE(review): assumes each record's 'url' ends with its numeric id,
        # like the links parsed while scraping -- confirm against the API.
        # Using it keeps the ids aligned with the ids stored in the link fields,
        # even when the ids are not consecutive.
        ids = [int(link.rstrip('/').rsplit('/', 1)[-1]) for link in df['url']]
    else:
        # fallback: 1-based row position
        ids = df.index + 1

    # f-string, so the column is e.g. 'people_id' rather than the literal '{cat}_id'
    df[f'{cat}_id'] = ids
    categories_dataframes[cat] = df
#df.to_csv('./data/starwars_characters.csv', index = False)

### Process the dataframes:
Fields that hold lists of values are stored as strings, so they need to be converted back into lists.

In [None]:
# def str_to_list(column):
#     values = []
#     # iterate the rows of the dataframe
#     for index, row in column.iterrows():
#         value_string = row[column][1:len(row[column])-1]
#         values.append( [int(i) for i in value_string.split(',')] )
#     # return a pandas Series with the list of values for selected column
#     return pd.Series(values)

In [78]:
# for cat in categories:
#     category_fields = fields[cat] 
#     df = categories_dataframes[cat]
#     print(f'{cat}: {category_fields}')
#     for field in category_fields:
#         print(field)
#         try:
#             print(df.loc[:, field].unique())
#         except Exception as e:
#             print(f'{field} with error: {e}')
#     print('\n\n')

## Junction tables
(many-to-many relationships in the database)

1. people_films: Links people to the films they appeared in.

    - person_id: Foreign Key referencing the people table.
    - film_id: Foreign Key referencing the films table.

2. people_species: Links people to their species.

    - person_id: Foreign Key referencing the people table.
    - species_id: Foreign Key referencing the species table.

3. people_vehicles: Links people to the vehicles they have piloted.

    - person_id: Foreign Key referencing the people table.
    - vehicle_id: Foreign Key referencing the vehicles table.

4. people_starships: Links people to the starships they have piloted.

    - person_id: Foreign Key referencing the people table.
    - starship_id: Foreign Key referencing the starships table.