# Data Visualization with Modern Data Science

> Assignment 5

Yao-Jen Kuo <yaojenkuo@ntu.edu.tw> from [DATAINPOINT](https://www.datainpoint.com)

In [None]:
import unittest
import json
import requests
import numpy as np
import pandas as pd

## Instructions

- Write down your solution between comments `### BEGIN SOLUTION` and `### END SOLUTION`.
- Run the tests to check whether your solutions are correct:
    - Runtime -> Restart and run all.
- When you are ready to submit, click File -> Download -> Download `.py`.

![](https://i.imgur.com/Y1BcDdx.png)

- Open a new Colab in a private window, upload the script and run tests again before submission to make sure the script is executable in a fresh new Colab.

![](https://i.imgur.com/ojlvbds.png)

- Upload to the Assignment session on NTU COOL.

## In the following exercises, you will need some files to complete the functions. Run the cell below to download those files to your working directory.

In [None]:
# Fetch the data files that the exercises read from the working directory.
file_names = ["teams.json", "all_time_olympic_medals.csv", "movies.csv"]
for file_name in file_names:
    file_url = f"https://raw.githubusercontent.com/datainpoint/asgmts-data-viz-with-modern-ds-2023/main/{file_name}"
    # Bound the wait so a stalled connection cannot hang the notebook forever.
    r = requests.get(file_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as if it were data.
    r.raise_for_status()
    with open(file_name, "wb") as f:
        f.write(r.content)

## 01. Define a function `import_all_time_olympic_medals()` which imports `all_time_olympic_medals.csv` in working directory.

In [None]:
def import_all_time_olympic_medals() -> pd.core.frame.DataFrame:
    """
    Load `all_time_olympic_medals.csv` from the working directory.

    The file is returned as-is: no rows are dropped and no columns are
    renamed, so the trailing "Totals" row is still present.

    Returns:
        A DataFrame holding every row and column of the CSV file.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.

    >>> all_time_olympic_medals = import_all_time_olympic_medals()
    >>> isinstance(all_time_olympic_medals, pd.DataFrame)
    True
    >>> all_time_olympic_medals.shape
    (157, 17)
    """
    ### BEGIN SOLUTION
    return pd.read_csv("all_time_olympic_medals.csv")
    ### END SOLUTION

## 02. Define a function `find_taiwan_from_all_time_olympic_medals()` which retrieves the data of Taiwan given `all_time_olympic_medals.csv` in working directory.

```
          team_name team_ioc  no_summer_games  no_summer_golds  \
131  Chinese Taipei      TPE               15                7   

     no_summer_silvers  no_summer_bronzes  no_summer_totals  no_winter_games  \
131                 11                 18                36               12   

     no_winter_golds  no_winter_silvers  no_winter_bronzes  no_winter_totals  \
131                0                  0                  0                 0   

     no_combined_games  no_combined_golds  no_combined_silvers  \
131                 27                  7                   11   

     no_combined_bronzes no_combined_totals  
131                   18                 36 
```

In [None]:
def find_taiwan_from_all_time_olympic_medals() -> pd.core.frame.DataFrame:
    """
    Retrieve Taiwan's row from `all_time_olympic_medals.csv`.

    The CSV file is read from the working directory on every call.

    Returns:
        A DataFrame with all original columns, restricted to the row(s)
        whose `team_ioc` is "TPE". The original row index is preserved.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.

    >>> taiwan_from_all_time_olympic_medals = find_taiwan_from_all_time_olympic_medals()
    >>> isinstance(taiwan_from_all_time_olympic_medals, pd.DataFrame)
    True
    >>> taiwan_from_all_time_olympic_medals.shape
    (1, 17)
    """
    ### BEGIN SOLUTION
    all_time_olympic_medals = pd.read_csv("all_time_olympic_medals.csv")
    # Taiwan is listed as "Chinese Taipei"; the IOC code TPE identifies it
    # without depending on the spelling of the team name.
    is_taiwan = all_time_olympic_medals["team_ioc"] == "TPE"
    return all_time_olympic_medals[is_taiwan]
    ### END SOLUTION

## 03. Define a function `find_the_king_of_summer_olympics()` which retrieves the data of the country that won the most gold medals in summer Olympics, given `all_time_olympic_medals.csv` in working directory.

Hint: Exclude the data of "Totals" at the last row of `all_time_olympic_medals.csv`.

In [None]:
def find_the_king_of_summer_olympics() -> pd.core.frame.DataFrame:
    """
    Retrieve the team with the most gold medals in the summer Olympics.

    The CSV file `all_time_olympic_medals.csv` is read from the working
    directory on every call. Its last row ("Totals") is an aggregate, not a
    team, and is excluded before searching for the maximum.

    Returns:
        A one-row DataFrame with all original columns for the team whose
        `no_summer_golds` is largest. On a tie the first such team wins.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.
        ValueError: if no team rows remain after dropping the last row.

    >>> the_king_of_summer_olympics = find_the_king_of_summer_olympics()
    >>> isinstance(the_king_of_summer_olympics, pd.DataFrame)
    True
    >>> the_king_of_summer_olympics.shape
    (1, 17)
    """
    ### BEGIN SOLUTION
    all_time_olympic_medals = pd.read_csv("all_time_olympic_medals.csv")
    # The last row holds column totals and would otherwise always win.
    teams_only = all_time_olympic_medals.iloc[:-1]
    top_label = teams_only["no_summer_golds"].idxmax()
    # Selecting with a list keeps the result a DataFrame rather than a Series.
    return teams_only.loc[[top_label]]
    ### END SOLUTION

## 04. Define a function `find_the_king_of_winter_olympics()` which retrieves the data of the country that won the most gold medals in winter Olympics, given `all_time_olympic_medals.csv` in working directory.

Hint: Exclude the data of "Totals" at the last row of `all_time_olympic_medals.csv`.

In [None]:
def find_the_king_of_winter_olympics() -> pd.core.frame.DataFrame:
    """
    Retrieve the team with the most gold medals in the winter Olympics.

    The CSV file `all_time_olympic_medals.csv` is read from the working
    directory on every call. Its last row ("Totals") is an aggregate, not a
    team, and is excluded before searching for the maximum.

    Returns:
        A one-row DataFrame with all original columns for the team whose
        `no_winter_golds` is largest. On a tie the first such team wins.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.
        ValueError: if no team rows remain after dropping the last row.

    >>> the_king_of_winter_olympics = find_the_king_of_winter_olympics()
    >>> isinstance(the_king_of_winter_olympics, pd.DataFrame)
    True
    >>> the_king_of_winter_olympics.shape
    (1, 17)
    """
    ### BEGIN SOLUTION
    all_time_olympic_medals = pd.read_csv("all_time_olympic_medals.csv")
    # The last row holds column totals and would otherwise always win.
    teams_only = all_time_olympic_medals.iloc[:-1]
    top_label = teams_only["no_winter_golds"].idxmax()
    # Selecting with a list keeps the result a DataFrame rather than a Series.
    return teams_only.loc[[top_label]]
    ### END SOLUTION

## 05. Define a function `calculate_summer_winter_medals_ratio()` which calculates the ratio according to the formula below, given `all_time_olympic_medals.csv` in working directory.

\begin{equation}
\text{Ratio} = \frac{\text{Summer Golds} - \text{Winter Golds}}{\text{Total Golds}}
\end{equation}

Hint: Exclude the data of "Totals" at the last row of `all_time_olympic_medals.csv`.

```
                            team_name  no_summer_golds  no_winter_golds  ratio
0                         Afghanistan                0                0    NaN
1                             Algeria                5                0    1.0
2                           Argentina               21                0    1.0
3                             Armenia                2                0    1.0
4                         Australasia                3                0    1.0
..                                ...              ...              ...    ...
151                            Zambia                0                0    NaN
152                          Zimbabwe                3                0    1.0
153      Independent Olympic Athletes                1                0    1.0
154  Independent Olympic Participants                0                0    NaN
155                        Mixed team               11                0    1.0

[156 rows x 4 columns]
```

In [None]:
def calculate_summer_winter_medals_ratio() -> pd.core.frame.DataFrame:
    """
    Compute each team's (summer golds - winter golds) / total golds ratio.

    The CSV file `all_time_olympic_medals.csv` is read from the working
    directory on every call, and its last row ("Totals") is excluded.
    Total golds are taken from the `no_combined_golds` column.

    Returns:
        A DataFrame with the columns `team_name`, `no_summer_golds`,
        `no_winter_golds` and `ratio`, one row per team. Teams that have
        never won a gold medal get a `ratio` of NaN (0 divided by 0).

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.

    >>> summer_winter_medals_ratio = calculate_summer_winter_medals_ratio()
    >>> isinstance(summer_winter_medals_ratio, pd.DataFrame)
    True
    >>> summer_winter_medals_ratio.shape
    (156, 4)
    """
    ### BEGIN SOLUTION
    all_time_olympic_medals = pd.read_csv("all_time_olympic_medals.csv")
    # Drop the aggregate "Totals" row so that only real teams remain.
    teams_only = all_time_olympic_medals.iloc[:-1]
    gold_difference = teams_only["no_summer_golds"] - teams_only["no_winter_golds"]
    ratio = gold_difference / teams_only["no_combined_golds"]
    # assign() returns a new frame, so the slice above is never mutated.
    return teams_only[["team_name", "no_summer_golds", "no_winter_golds"]].assign(ratio=ratio)
    ### END SOLUTION

## 06. Define a function `find_the_largest_summer_winter_medals_ratio()` which retrieves the country with the largest ratio calculated in the previous exercise, given `all_time_olympic_medals.csv` in working directory.

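Hint: Exclude the data of "Totals" at the last row of `all_time_olympic_medals.csv`. Also exclude those countries whose `ratio` equals `1.0` or is missing (`NaN`). Test for missing values with `isna()`/`notna()`, because `NaN` never compares equal to anything, itself included.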

In [None]:
def find_the_largest_summer_winter_medals_ratio() -> pd.core.frame.DataFrame:
    """
    Retrieve the team with the largest summer/winter gold ratio below 1.0.

    The ratio is (summer golds - winter golds) / total golds, computed from
    `all_time_olympic_medals.csv` in the working directory with its last
    row ("Totals") excluded. Teams whose ratio is exactly 1.0 (no winter
    golds) or NaN (no golds at all) are left out of the comparison.

    Returns:
        A one-row DataFrame with the columns `team_name`,
        `no_summer_golds`, `no_winter_golds` and `ratio`. On a tie the
        first such team wins.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.
        ValueError: if no team has a ratio other than 1.0 or NaN.

    >>> the_largest_summer_winter_medals_ratio = find_the_largest_summer_winter_medals_ratio()
    >>> isinstance(the_largest_summer_winter_medals_ratio, pd.DataFrame)
    True
    >>> the_largest_summer_winter_medals_ratio.shape
    (1, 4)
    """
    ### BEGIN SOLUTION
    all_time_olympic_medals = pd.read_csv("all_time_olympic_medals.csv")
    # Drop the aggregate "Totals" row so that only real teams remain.
    teams_only = all_time_olympic_medals.iloc[:-1]
    gold_difference = teams_only["no_summer_golds"] - teams_only["no_winter_golds"]
    ratios = teams_only[["team_name", "no_summer_golds", "no_winter_golds"]].assign(
        ratio=gold_difference / teams_only["no_combined_golds"]
    )
    # NaN compares unequal to everything, so it needs its own notna() test.
    is_eligible = ratios["ratio"].notna() & (ratios["ratio"] != 1.0)
    eligible = ratios[is_eligible]
    # Selecting with a list keeps the result a DataFrame rather than a Series.
    return eligible.loc[[eligible["ratio"].idxmax()]]
    ### END SOLUTION

## 07. Define a function `create_nba_teams()` which creates a DataFrame of NBA franchise teams in standard league given nested key `["league"]["standard"]` of `teams.json` in working directory.

```
                  fullName tricode confName    divName           city
0            Atlanta Hawks     ATL     East  Southeast        Atlanta
1           Boston Celtics     BOS     East   Atlantic         Boston
2            Brooklyn Nets     BKN     East   Atlantic       Brooklyn
3        Charlotte Hornets     CHA     East  Southeast      Charlotte
4            Chicago Bulls     CHI     East    Central        Chicago
5      Cleveland Cavaliers     CLE     East    Central      Cleveland
6         Dallas Mavericks     DAL     West  Southwest         Dallas
7           Denver Nuggets     DEN     West  Northwest         Denver
8          Detroit Pistons     DET     East    Central        Detroit
9    Golden State Warriors     GSW     West    Pacific   Golden State
10         Houston Rockets     HOU     West  Southwest        Houston
11          Indiana Pacers     IND     East    Central        Indiana
12             LA Clippers     LAC     West    Pacific             LA
13      Los Angeles Lakers     LAL     West    Pacific    Los Angeles
14       Memphis Grizzlies     MEM     West  Southwest        Memphis
15              Miami Heat     MIA     East  Southeast          Miami
16         Milwaukee Bucks     MIL     East    Central      Milwaukee
17  Minnesota Timberwolves     MIN     West  Northwest      Minnesota
18    New Orleans Pelicans     NOP     West  Southwest    New Orleans
19         New York Knicks     NYK     East   Atlantic       New York
20   Oklahoma City Thunder     OKC     West  Northwest  Oklahoma City
21           Orlando Magic     ORL     East  Southeast        Orlando
22      Philadelphia 76ers     PHI     East   Atlantic   Philadelphia
23            Phoenix Suns     PHX     West    Pacific        Phoenix
24  Portland Trail Blazers     POR     West  Northwest       Portland
25        Sacramento Kings     SAC     West    Pacific     Sacramento
26       San Antonio Spurs     SAS     West  Southwest    San Antonio
27         Toronto Raptors     TOR     East   Atlantic        Toronto
28               Utah Jazz     UTA     West  Northwest           Utah
29      Washington Wizards     WAS     East  Southeast     Washington
```

In [None]:
def create_nba_teams() -> pd.core.frame.DataFrame:
    """
    Create a DataFrame of the NBA franchise teams in the standard league.

    Team records are read from the nested key `["league"]["standard"]` of
    `teams.json` in the working directory.

    Returns:
        A DataFrame with the columns `fullName`, `tricode`, `confName`,
        `divName` and `city`, one row per franchise team, indexed from 0.

    Raises:
        FileNotFoundError: if `teams.json` is not in the working directory.
        KeyError: if the `["league"]["standard"]` key is missing.

    >>> nba_teams = create_nba_teams()
    >>> isinstance(nba_teams, pd.DataFrame)
    True
    >>> nba_teams.shape
    (30, 5)
    """
    ### BEGIN SOLUTION
    with open("teams.json", encoding="utf-8") as f:
        teams = json.load(f)
    # NOTE(review): assumes each team record carries a boolean
    # `isNBAFranchise` flag, as in the NBA data feed; confirm against
    # teams.json. Records without the flag are treated as non-franchise.
    franchise_teams = [team for team in teams["league"]["standard"] if team.get("isNBAFranchise", False)]
    # Passing `columns` both selects and orders the fields of each record.
    return pd.DataFrame(franchise_teams, columns=["fullName", "tricode", "confName", "divName", "city"])
    ### END SOLUTION

## 08. Define a function `subset_nba_teams()` which subsets teams whose `tricode` is not the first 3 letters of their city in upper-cased form.

```
   tricode           city
2      BKN       Brooklyn
9      GSW   Golden State
12     LAC             LA
13     LAL    Los Angeles
18     NOP    New Orleans
19     NYK       New York
20     OKC  Oklahoma City
23     PHX        Phoenix
26     SAS    San Antonio
```

In [None]:
def subset_nba_teams() -> pd.core.frame.DataFrame:
    """
    Subset the NBA teams whose tricode is not derived from their city.

    A team is kept when its `tricode` differs from the first 3 letters of
    its `city` in upper case (e.g. "BKN" versus "BRO" for Brooklyn). Team
    records are read from the nested key `["league"]["standard"]` of
    `teams.json` in the working directory.

    Returns:
        A DataFrame with the columns `tricode` and `city`. Row labels are
        the teams' positions among the franchise teams, counted from 0.

    Raises:
        FileNotFoundError: if `teams.json` is not in the working directory.
        KeyError: if the `["league"]["standard"]` key is missing.

    >>> nba_teams_subset = subset_nba_teams()
    >>> isinstance(nba_teams_subset, pd.DataFrame)
    True
    >>> nba_teams_subset.shape
    (9, 2)
    """
    ### BEGIN SOLUTION
    with open("teams.json", encoding="utf-8") as f:
        teams = json.load(f)
    # NOTE(review): assumes each team record carries a boolean
    # `isNBAFranchise` flag, as in the NBA data feed; confirm against
    # teams.json. Records without the flag are treated as non-franchise.
    franchise_teams = [team for team in teams["league"]["standard"] if team.get("isNBAFranchise", False)]
    nba_teams = pd.DataFrame(franchise_teams, columns=["tricode", "city"])
    city_prefix = nba_teams["city"].str[:3].str.upper()
    return nba_teams[nba_teams["tricode"] != city_prefix]
    ### END SOLUTION

## 09. Define a function `create_movies()` which creates a DataFrame of IMDb's top 250 rated movies of all time given `movies.csv` in working directory.

```
      id                     title  release_year  rating  runtime  director_id
0      1  The Shawshank Redemption          1994     9.2      142           43
1      2             The Godfather          1972     9.2      175           41
2      3           The Dark Knight          2008     9.0      152           20
3      4     The Godfather Part II          1974     9.0      202           41
4      5              12 Angry Men          1957     9.0       96          130
..   ...                       ...           ...     ...      ...          ...
245  246            The Iron Giant          1999     8.0       86           12
246  247                  The Help          2011     8.0      146          138
247  248                   Aladdin          1992     8.0       90          123
248  249               Dersu Uzala          1975     8.0      142            3
249  250        Dances with Wolves          1990     8.0      181           80

[250 rows x 6 columns]
```

In [None]:
def create_movies() -> pd.core.frame.DataFrame:
    """
    Create a DataFrame of IMDb's top rated movies from `movies.csv`.

    The CSV file is read from the working directory and returned as-is.

    Returns:
        A DataFrame holding every row and column of the CSV file.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.

    >>> movies = create_movies()
    >>> isinstance(movies, pd.DataFrame)
    True
    >>> movies.shape
    (250, 6)
    """
    ### BEGIN SOLUTION
    return pd.read_csv("movies.csv")
    ### END SOLUTION

## 10. Define a function named `find_starwars_episodes()` which finds out "Star Wars" episodes given `movies.csv` in working directory.

```
    id                                           title  release_year  rating  \
14  15  Star Wars: Episode V - The Empire Strikes Back          1980     8.7   
27  28              Star Wars: Episode IV - A New Hope          1977     8.5   
88  89      Star Wars: Episode VI - Return of the Jedi          1983     8.3   

    runtime  director_id  
14      124           60  
27      121           47  
88      131          114
```

In [None]:
def find_starwars_episodes() -> pd.core.frame.DataFrame:
    """
    Find the "Star Wars" episodes listed in `movies.csv`.

    The CSV file is read from the working directory on every call. A movie
    matches when its `title` contains the literal text "Star Wars"; the
    match is case-sensitive.

    Returns:
        A DataFrame with all original columns, restricted to the matching
        movies. The original row index is preserved.

    Raises:
        FileNotFoundError: if the CSV file is not in the working directory.

    >>> starwars_episodes = find_starwars_episodes()
    >>> isinstance(starwars_episodes, pd.DataFrame)
    True
    >>> starwars_episodes.shape
    (3, 6)
    """
    ### BEGIN SOLUTION
    movies = pd.read_csv("movies.csv")
    # regex=False matches the text literally; na=False treats a missing
    # title as "no match" so the mask stays purely boolean.
    is_starwars = movies["title"].str.contains("Star Wars", regex=False, na=False)
    return movies[is_starwars]
    ### END SOLUTION

## Running tests

The assignment session is finished. Click Runtime -> Restart and run all to run the following tests.

In [None]:
class TestAssignmentFive(unittest.TestCase):
    """Check the return type, shape and key values of each exercise function."""

    # Exercises 01-06 read all_time_olympic_medals.csv.
    def test_01_import_all_time_olympic_medals(self):
        medals = import_all_time_olympic_medals()
        self.assertIsInstance(medals, pd.DataFrame)
        self.assertEqual(medals.shape, (157, 17))

    def test_02_find_taiwan_from_all_time_olympic_medals(self):
        taiwan = find_taiwan_from_all_time_olympic_medals()
        self.assertIsInstance(taiwan, pd.DataFrame)
        self.assertEqual(taiwan.shape, (1, 17))
        self.assertEqual(taiwan["team_ioc"].iloc[0], "TPE")

    def test_03_find_the_king_of_summer_olympics(self):
        summer_king = find_the_king_of_summer_olympics()
        self.assertIsInstance(summer_king, pd.DataFrame)
        self.assertEqual(summer_king.shape, (1, 17))
        self.assertEqual(summer_king["team_ioc"].iloc[0], "USA")

    def test_04_find_the_king_of_winter_olympics(self):
        winter_king = find_the_king_of_winter_olympics()
        self.assertIsInstance(winter_king, pd.DataFrame)
        self.assertEqual(winter_king.shape, (1, 17))
        self.assertEqual(winter_king["team_name"].iloc[0], "Norway")

    def test_05_calculate_summer_winter_medals_ratio(self):
        ratios = calculate_summer_winter_medals_ratio()
        self.assertIsInstance(ratios, pd.DataFrame)
        self.assertEqual(ratios.shape, (156, 4))

    def test_06_find_the_largest_summer_winter_medals_ratio(self):
        largest_ratio = find_the_largest_summer_winter_medals_ratio()
        self.assertIsInstance(largest_ratio, pd.DataFrame)
        self.assertEqual(largest_ratio.shape, (1, 4))
        self.assertEqual(largest_ratio["team_name"].iloc[0], "Hungary")

    # Exercises 07-08 read teams.json.
    def test_07_create_nba_teams(self):
        teams = create_nba_teams()
        self.assertIsInstance(teams, pd.DataFrame)
        self.assertEqual(teams.shape, (30, 5))
        # 30 distinct teams spread over 2 conferences and 6 divisions.
        self.assertEqual(teams["tricode"].nunique(), 30)
        self.assertEqual(teams["confName"].nunique(), 2)
        self.assertEqual(teams["divName"].nunique(), 6)

    def test_08_subset_nba_teams(self):
        teams_subset = subset_nba_teams()
        self.assertIsInstance(teams_subset, pd.DataFrame)
        self.assertEqual(teams_subset.shape, (9, 2))

    # Exercises 09-10 read movies.csv.
    def test_09_create_movies(self):
        all_movies = create_movies()
        self.assertIsInstance(all_movies, pd.DataFrame)
        self.assertEqual(all_movies.shape, (250, 6))

    def test_10_find_starwars_episodes(self):
        episodes = find_starwars_episodes()
        self.assertIsInstance(episodes, pd.DataFrame)
        self.assertEqual(episodes.shape, (3, 6))

# Gather every test method of the assignment and run it with per-test output.
suite = unittest.TestLoader().loadTestsFromTestCase(TestAssignmentFive)
runner = unittest.TextTestRunner(verbosity=2)
test_results = runner.run(suite)

# A test counts as a success when it neither failed an assertion nor raised.
number_of_test_runs = test_results.testsRun
number_of_failures, number_of_errors = len(test_results.failures), len(test_results.errors)
number_of_successes = number_of_test_runs - number_of_failures - number_of_errors

In [None]:
# Report how many of the exercises passed their tests.
summary = "You've got {} successes among {} questions.".format(
    number_of_successes,
    number_of_test_runs,
)
print(summary)