## 0. Understanding Data


In [1]:
pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# import required modules
import sqlite3
import pandas as pd

In [3]:
# create a connection 
conn = sqlite3.connect('starwars.db')

In [4]:
# take a look at the stored tables
tables = ['people', 'films', 'starships', 'vehicles', 'species', 'planets']

for table in tables:
    display(table)
    display(pd.read_sql(f'SELECT * FROM {table} LIMIT 1', conn))

'people'

Unnamed: 0,id,name,birth_year,eye_color,gender,hair_color,height,mass,skin_color
0,1,Luke Skywalker,19BBY,blue,male,blond,172,77,fair


'films'

Unnamed: 0,id,title,episode_id,opening_crawl,director,producer,release_date
0,1,A New Hope,4,It is a period of civil war.\r\nRebel spaceshi...,George Lucas,"Gary Kurtz, Rick McCallum",1977-05-25


'starships'

Unnamed: 0,id,name,model,starship_class,manufacturer,cost_in_credits,length,crew,passengers,max_atmosphering_speed,hyperdrive_rating,MGLT,cargo_capacity,consumables
0,2,CR90 corvette,CR90 corvette,corvette,Corellian Engineering Corporation,3500000,150,30-165,600,950,2.0,60,3000000,1 year


'vehicles'

Unnamed: 0,id,name,model,vehicle_class,manufacturer,length,cost_in_credits,crew,passengers,max_atmosphering_speed,cargo_capacity,consumables
0,4,Sand Crawler,Digger Crawler,wheeled,Corellia Mining Corporation,36.8,150000,46,30,30,50000,2 months


'species'

Unnamed: 0,id,name,average_height,average_lifespan,classification,designation,eye_colors,hair_colors,homeworld,language,skin_colors
0,1,Human,180,120,mammal,sentient,"brown, blue, green, hazel, grey, amber","blonde, brown, black, red",https://swapi.dev/api/planets/9/,Galactic Basic,"caucasian, black, asian, hispanic"


'planets'

Unnamed: 0,id,name,diameter,rotation_period,orbital_period,gravity,population,climate,terrain,surface_water
0,1,Tatooine,10465,23,304,1 standard,200000,arid,desert,1


In [5]:
connection_table = ["people_films", "people_species","people_starships","people_vehicles", "people_planets"]
for item in connection_table:
    display(item)
    display(pd.read_sql(f'SELECT * FROM {item} LIMIT 1', conn))
    

'people_films'

Unnamed: 0,person_id,film_id
0,1,1


'people_species'

Unnamed: 0,person_id,specie_id
0,1,1


'people_starships'

Unnamed: 0,person_id,starship_id
0,1,12


'people_vehicles'

Unnamed: 0,person_id,vehicle_id
0,1,14


'people_planets'

Unnamed: 0,person_id,planet_id
0,1,1


## 1. Required SQL analysis

### 1.1 What’s the average height and mass for characters appear in each film
- Requirement: `1 request with INNER`
- Explanation: By using INNER JOIN, we can return the occurrences of characters in films. Since each character must appear in at least one film, it makes sense to use INNER JOIN here.

In [6]:
pd.read_sql(
    '''

    select 
        f.title as film_title,
        round(avg(p.height),2) as average_height,
        round(avg(p.mass),2) as average_mass
    from 
        people p
    inner join
        people_films pf on p.id = pf.person_id
    inner join
        films f on pf.film_id = f.id
    group by
        f.id,
        f.title

    ''', 
    conn
)

Unnamed: 0,film_title,average_height,average_mass
0,A New Hope,170.33,71.67
1,The Empire Strikes Back,169.25,81.01
2,Return of the Jedi,153.8,56.81
3,The Phantom Menace,169.74,41.65
4,Attack of the Clones,174.95,51.26
5,Revenge of the Sith,177.76,66.98


### 1.2 Who has the highest occurence in terms of being a pilot
- Requirment: `1 request with LEFT`
- Explanation: By using LEFT JOIN, we can return all people, regardless of whether they are pilots of vehicles or starships. This approach ensures that we include everyone in the results, not just those who have piloted vehicles or starships.

In [7]:
pd.read_sql(
    '''
    with people_starships_table as (
        select 
            p.id,
            p.name,
            count(p.name) as starships_occurence,
            group_concat(s.name, ", ") as starship_names
        from 
            people p
        left join 
            people_starships ps on p.id = ps.person_id
        left join
            starships s on ps.starship_id = s.id
        group by
            p.id,
            p.name
    ),

    people_vehicles_table as (
        select 
            p.id,
            p.name,
            count(p.name) as vehicle_occurence,
            group_concat(v.name, ", ") as vehicle_names
        from 
            people p
        left join 
            people_vehicles pv on p.id = pv.person_id
        left join
            vehicles v on pv.vehicle_id = v.id
        group by
            p.id,
            p.name
    )

    select 
        p.name as name,
        (ifnull(ps.starships_occurence, 0) + ifnull(pv.vehicle_occurence, 0)) as total_occurence,
        ps.starships_occurence,
        ps.starship_names,
        pv.vehicle_occurence,
        pv.vehicle_names
    from 
        people p
    left join
        people_starships_table ps on p.id = ps.id
    left join 
        people_vehicles_table pv on p.id = pv.id
    group by
        p.name
    order by
        total_occurence desc
    limit 5


    ''', 
    conn
)

Unnamed: 0,name,total_occurence,starships_occurence,starship_names,vehicle_occurence,vehicle_names
0,Palpatine,10,5,"CR90 corvette, Star Destroyer, Sentinel-class ...",5,"Sand Crawler, T-16 skyhopper"
1,Padmé Amidala,6,3,"Naboo fighter, H-type Nubian yacht, Naboo star...",3,"Sand Crawler, T-16 skyhopper"
2,Owen Lars,6,3,Sentinel-class landing craft,3,T-16 skyhopper
3,Obi-Wan Kenobi,6,5,"Jedi starfighter, Trade Federation cruiser, Na...",1,Tribubble bongo
4,Mace Windu,6,3,Sentinel-class landing craft,3,"Sand Crawler, T-16 skyhopper"


### 1.3 1.3 Who are the pilots that could operate the fastest model starships?
- Requirement: 1 request with GROUP BY
- Explanation: By using GROUP BY, we can categorize the pilots based on the starship. We also categorize starship model speed. This allows us to identify and analyze the pilots capable of operating the starships with the highest speeds within each category or model.

In [8]:
pd.read_sql(
    '''
    select 
        p.name as pilot_name,
        s.model as model,
        cast(avg(s.max_atmosphering_speed) as int) as average_speed
    from 
        people p
    left join 
        people_starships ps on p.id = ps.person_id
    left join
        starships s on ps.starship_id = s.id
    group by
        p.name,
        s.model
    order by
        average_speed desc
    limit 5

    ''', 
    conn
)

Unnamed: 0,pilot_name,model,average_speed
0,Padmé Amidala,H-type Nubian yacht,8000
1,Anakin Skywalker,Eta-2 Actis-class light interceptor,1500
2,Obi-Wan Kenobi,Eta-2 Actis-class light interceptor,1500
3,Arvel Crynyd,RZ-1 A-wing Interceptor,1300
4,Poggle the Lesser,RZ-1 A-wing Interceptor,1300


### 1.4 Find out characters who appear in each movie
- Requirement: `1 request with HAVING`
- Explanation: By grouping the data by characters and vehicles, we could use HAVING to filter the occurrence

In [9]:
pd.read_sql(
    '''
    select 
        p.name,
        count(distinct f.id) as occurence    
    from 
        people p
    join 
        people_films pf
    on 
        p.id = pf.person_id
    join 
        films f
    on 
        pf.film_id = f.id
    group by 
        p.id,
        p.name
    having 
       count(distinct f.id) = (select count(*) from films)
    ''', 
    conn
)

Unnamed: 0,name,occurence
0,C-3PO,6
1,R2-D2,6
2,Obi-Wan Kenobi,6


## 1.5 Find out the oldest person in the Star Wars
- Requirment: `1 request with CTE`

Before implementing queries, we have checked no one is born ABY.

In [10]:
pd.read_sql(
    '''
        select
            count(*) as aby_count
        from
            people 
        where birth_year like '%ABY';
    ''', 
    conn
)

Unnamed: 0,aby_count
0,0


By creating a temporary table, we conver the string `19BBY`, 19 years before the battle of yavin, to 19. And we write another query to return the oldest person's name and age.

In [11]:
pd.read_sql(
    '''
        with cleaned_birthyear as (
            select 
                name,
                cast(replace(birth_year, 'BBY', '') as integer) as birth_year_bby
            from 
                people
            where 
                birth_year != "unknown"
        )

        select 
            name,
            max(birth_year_bby) as birth_year
        from
            cleaned_birthyear
    ''', 
    conn
)

Unnamed: 0,name,birth_year
0,Yoda,896
