In [1]:
import requests as rq
import pandas as pd
import os
import numpy as np

In [None]:
# base_url = 'https://swapi.dev/api/people/'

In [2]:
# data = rq.get(f'{base_url}10/').json()
# character_fields = data.keys()
# print(*character_fields, sep='\n')

In [3]:
# rq.get(f'{base_url}10/').json()

In [None]:
def get_characters(url):
        
    # get the content of the url
    response = rq.get(url)

    # success
    if response.status_code == 200:
        content = response.json()
    elif response.status_code == 404:
        print(f'{url} not found!')
        return
    
    characters_list = []

    next = content['next']
    characters = content['results']

    for char in characters:
    
        related_fields = ['homeworld', 'starships', 'species', 'vehicles', 'starships']

        for field in related_fields:
            id_values = []
            try:
                
                if char[field]:  # if the field is not empty
                    # parse the links from starships, vehicles and starships
                    if field != 'homeworld':  
                        for link in char[field]:
                            # parse the id value in the link    
                            id_values.append(int(link.split('/')[-2]))
                        # add the id values into the corresponding field key
                        char[field] = id_values
                    
                    # parse the homeworld (just a single string value)
                    else:
                        # get the homeworld id
                        char['homeworld'] = int(char['homeworld'].split('/')[-2])
                
                # parse species field
                # in case of human characters, the species field is an empty list
                elif field == 'species' and not char[field]:
                    char[field] = 'human'
            
            except:
                print(f"{field} doesn't exist in character url: {base_url}/{id}")
                # if there's a missing field, don't consume the API link for this character
                break
                
        # remove created and edited fields
        try:
            del(char['created'])
            del(char['edited'])
        except:
            pass
        
        characters_list.append(char)

    return next, characters_list

In [3]:
if not os.path.exists('./data/starwars_characters.csv'):
    char_list = []
    url= 'https://swapi.dev/api/people'
    while url:
        url, characters = get_characters(url)
        char_list.extend(characters)
    
    # store the characters in a csv file
    df = pd.DataFrame(char_list, index = range(1,len(char_list)+1))
    df.reset_index(inplace = True)
    df.rename(columns = {'index' : 'id'}, inplace = True)
    df.to_csv('./data/starwars_characters.csv', index = False)

In [4]:
df2 = pd.read_csv('./data/starwars_characters.csv')

In [18]:
df2.dtypes

id              int64
name           object
height        float64
mass          float64
hair_color     object
skin_color     object
eye_color      object
birth_year     object
gender         object
homeworld       int64
films          object
species        object
vehicles       object
starships      object
url            object
dtype: object

In [14]:
df2.sample(3)

Unnamed: 0,id,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,films,species,vehicles,starships,url
52,53,Eeth Koth,171,unknown,black,brown,brown,unknown,male,45,"[4, 6]",[22],[],[],https://swapi.dev/api/people/54/
8,9,Biggs Darklighter,183,84,black,light,brown,24BBY,male,1,[1],human,[],[12],https://swapi.dev/api/people/9/
37,38,Ric Olié,183,unknown,brown,fair,blue,unknown,male,8,[4],human,[],[40],https://swapi.dev/api/people/39/


Clean the mass and height columns

In [17]:
for col in ['mass', 'height']:
    df2[col] = df2[col].replace('unknown', np.nan).str.replace(',','').astype(float)

## Create many-to-many table using `explode()`

In [None]:
char_film_table = df2[['id', 'starships']]
char_film_table.rename(columns= {'id' : 'character_id'}, inplace= True)
char_film_table.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  char_film_table.rename(columns= {'id' : 'character_id'}, inplace= True)


Unnamed: 0,character_id,films
0,1,"[1, 2, 3, 6]"
1,2,"[1, 2, 3, 4, 5, 6]"
2,3,"[1, 2, 3, 4, 5, 6]"
3,4,"[1, 2, 3, 6]"
4,5,"[1, 2, 3, 6]"


In [None]:
starships_list = []
# iterate the rows of the dataframe
for index, row in char_film_table.iterrows():
    starships_string = row['starships'][1:len(row['starships'])-1]
    starships_list.append( [int(i) for i in starships_string.split(',')] )

In [None]:
# assign the series with lists
char_film_table.loc[:, 'starships'] = pd.Series(starships_list)

In [None]:
# explode using the starships column
char_film_table = char_film_table.explode('starships')
char_film_table.head()

Unnamed: 0,character_id,films
0,1,1
0,1,2
0,1,3
0,1,6
1,2,1


Export the character-starships junction table

In [None]:
char_film_table.to_csv('./data/ char_film_junction.csv', index = False)

Starships junction

In [83]:
char_starship_junction = df2[['id', 'starships']].rename(columns={'id' : 'character_id'})
char_starship_junction.head()

Unnamed: 0,character_id,starships
0,1,"[12, 22]"
1,2,[]
2,3,[]
3,4,[13]
4,5,[]


In [84]:
starships_list = []
# iterate the rows of the dataframe
for index, row in char_starship_junction.iterrows():
    starships_string = row['starships'][1:len(row['starships'])-1]
    if starships_string == '':
        starships_list.append([np.nan])
    else:
        starships_list.append( [int(i) for i in starships_string.split(',')] )

In [85]:
# assign the series with lists
char_starship_junction.loc[:, 'starships'] = pd.Series(starships_list)

In [86]:
# explode using the starships column
char_starship_junction = char_starship_junction.explode('starships')
char_starship_junction.head()

Unnamed: 0,character_id,starships
0,1,12.0
0,1,22.0
1,2,
2,3,
3,4,13.0


In [87]:
char_starship_junction.to_csv('./data/ char_starship_junction.csv', index = False)

In [None]:
# a list of dictionaries with the character info
character_list = []
id = 1
while True:
    # the character page with id 17 doesn't exist!
    if id == 17:
        id += 1
        pass
    char = get_character(id)
    if char:
        character_list.append(char)
        id += 1
    else:
        print(f'\nInformation from {id - 1} characters has been extracted.')
        break

https://swapi.dev/api/people/84 not found!
Information from 83 characters has been extracted.


In [None]:
# a list of dictionaries with the character info
character_list = []
id = 1
while True:
    # the character page with id 17 doesn't exist!
    if id == 17:
        id += 1
        pass
    char = get_character(id)
    if char:
        character_list.append(char)
        id += 1
    else:
        print(f'\nInformation from {id - 1} characters has been extracted.')
        break

https://swapi.dev/api/people/84 not found!
Information from 83 characters has been extracted.


In [None]:
# a list of dictionaries with the character info
character_list = []
id = 1
while True:
    # the character page with id 17 doesn't exist!
    if id == 17:
        id += 1
        pass
    char = get_character(id)
    if char:
        character_list.append(char)
        id += 1
    else:
        print(f'\nInformation from {id - 1} characters has been extracted.')
        break

https://swapi.dev/api/people/84 not found!
Information from 83 characters has been extracted.


In [27]:
character_list[:2]

[{'name': 'Luke Skywalker',
  'height': '172',
  'mass': '77',
  'hair_color': 'blond',
  'skin_color': 'fair',
  'eye_color': 'blue',
  'birth_year': '19BBY',
  'gender': 'male',
  'homeworld': 1,
  'films': [1, 2, 3, 6],
  'species': 'human',
  'vehicles': [14, 30],
  'starships': [12, 22],
  'url': 'https://swapi.dev/api/people/1/'},
 {'name': 'C-3PO',
  'height': '167',
  'mass': '75',
  'hair_color': 'n/a',
  'skin_color': 'gold',
  'eye_color': 'yellow',
  'birth_year': '112BBY',
  'gender': 'n/a',
  'homeworld': 1,
  'films': [1, 2, 3, 4, 5, 6],
  'species': [2],
  'vehicles': [],
  'starships': [],
  'url': 'https://swapi.dev/api/people/2/'}]

In [None]:
{
    "name": "unknown", 
    "rotation_period": "0", 
    "orbital_period": "0", 
    "diameter": "0", 
    "climate": "unknown", 
    "gravity": "unknown", 
    "terrain": "unknown", 
    "surface_water": "unknown", 
    "population": "unknown", 
    "residents": [
        "https://swapi.dev/api/people/20/", 
        "https://swapi.dev/api/people/23/", 
        "https://swapi.dev/api/people/29/", 
        "https://swapi.dev/api/people/32/", 
        "https://swapi.dev/api/people/75/"
    ], 
    "starships": [], 
    "created": "2014-12-15T12:25:59.569000Z", 
    "edited": "2014-12-20T20:58:18.466000Z", 
    "url": "https://swapi.dev/api/planets/28/"
}

In [None]:
def get_planet(id):
    base_url = 'https://swapi.dev/api/planets'
    # get the url
    response = rq.get(f'{base_url}/{id}')

    # success
    if response.status_code == 200:
        content = response.json()
    elif response.status_code == 404:
        print(f'{base_url}/{id} not found!')
        return

    if content['name'] == 'unknown':
        return
    
    for field in ['starships', 'residents']:
        # if the field is not empty
        if content[field]:
            id_values = []
            for link in content[field]:
                # parse the id value in the link    
                id_values.append(int(link.split('/')[-2]))
            content[field] = id_values
             
    # remove created and edited fields
    try:
        del(content['created'])
        del(content['edited'])
    except:
        pass
    
    return content

In [36]:
# a list of dictionaries with the character info
planets_list = []
id = 1
while True:
    # the character page with id 17 doesn't exist!
    # if id == 17:
    #     id += 1
    #     pass
    planet = get_planet(id)
    if planet:
        planets_list.append(planet)
        id += 1
    else:
        print(f'\nInformation from {id} planets has been extracted.')
        break


Information from 28 planets has been extracted.


In [38]:
planets_list[:2]

[{'name': 'Tatooine',
  'rotation_period': '23',
  'orbital_period': '304',
  'diameter': '10465',
  'climate': 'arid',
  'gravity': '1 standard',
  'terrain': 'desert',
  'surface_water': '1',
  'population': '200000',
  'residents': [1, 2, 4, 6, 7, 8, 9, 11, 43, 62],
  'films': [1, 3, 4, 5, 6],
  'url': 'https://swapi.dev/api/planets/1/'},
 {'name': 'Alderaan',
  'rotation_period': '24',
  'orbital_period': '364',
  'diameter': '12500',
  'climate': 'temperate',
  'gravity': '1 standard',
  'terrain': 'grasslands, mountains',
  'surface_water': '40',
  'population': '2000000000',
  'residents': [5, 68, 81],
  'films': [1, 6],
  'url': 'https://swapi.dev/api/planets/2/'}]

In [None]:
print(*rq.get('https://swapi.dev/api/starships/1').json().keys(), sep='\n')

title
episode_id
opening_crawl
director
producer
release_date
characters
planets
starships
vehicles
species
created
edited
url


In [None]:
def get_film(id):
    base_url = 'https://swapi.dev/api/starships'
    # get the url
    response = rq.get(f'{base_url}/{id}')

    # success
    if response.status_code == 200:
        content = response.json()
    elif response.status_code == 404:
        print(f'{base_url}/{id} not found!')
        return None
    
    for field in ['characters', 'planets', 'starships', 'vehicles', 'species']:
        # if the field is not empty
        if content[field]:
            id_values = []
            for link in content[field]:
                # parse the id value in the link    
                id_values.append(int(link.split('/')[-2]))
            content[field] = id_values
             
    # remove created and edited fields
    try:
        del(content['created'])
        del(content['edited'])
    except:
        pass
    
    return content

In [None]:
# a list of dictionaries with the character info
starships_list = []
id = 1
while True:
    # the character page with id 17 doesn't exist!
    # if id == 17:
    #     id += 1
    #     pass
    film = get_film(id)
    if film:
        starships_list.append(film)
        id += 1
    else:
        print(f'\nInformation from {id-1} planets has been extracted.')
        break

https://swapi.dev/api/films/7 not found!

Information from 6 planets has been extracted.


In [19]:
for index, value in enumerate(characters_list):
    if 'detail' in value.keys():
        print(f'Details found in character with id: {index + 1}')

Details found in character with id: 17


Remove character with id 17. It's a valid url!

In [27]:
del characters_list[16]

In [28]:
df = pd.DataFrame(characters_list)

In [30]:
df.head()

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,films,species,vehicles,starships,url
0,Luke Skywalker,172,77,blond,fair,blue,19BBY,male,1,"[1, 2, 3, 6]",human,"[14, 30]","[12, 22]",https://swapi.dev/api/people/1/
1,C-3PO,167,75,,gold,yellow,112BBY,,1,"[1, 2, 3, 4, 5, 6]",[2],[],[],https://swapi.dev/api/people/2/
2,R2-D2,96,32,,"white, blue",red,33BBY,,8,"[1, 2, 3, 4, 5, 6]",[2],[],[],https://swapi.dev/api/people/3/
3,Darth Vader,202,136,none,white,yellow,41.9BBY,male,1,"[1, 2, 3, 6]",human,[],[13],https://swapi.dev/api/people/4/
4,Leia Organa,150,49,brown,light,brown,19BBY,female,2,"[1, 2, 3, 6]",human,[30],[],https://swapi.dev/api/people/5/


In [31]:
df.shape

(81, 14)

In [32]:
df.to_csv('sw_characters.csv')