# Data Visualization with Modern Data Science

> Assignment 6

Yao-Jen Kuo <yaojenkuo@ntu.edu.tw> from [DATAINPOINT](https://www.datainpoint.com)

In [None]:
import unittest
import numpy as np
import pandas as pd

## Instructions

- Write down your solution between comments `### BEGIN SOLUTION` and `### END SOLUTION`.
- Run the tests to check whether your solutions are correct:
    - Runtime -> Restart and run all.
- When you are ready to submit, click File -> Download -> Download `.py`.

![](https://i.imgur.com/Y1BcDdx.png)

- Open a new Colab in a private window, upload the script and run tests again before submission to make sure the script is executable in a fresh new Colab.

![](https://i.imgur.com/ojlvbds.png)

- Upload to the Assignment session on NTU COOL.

In [None]:
# Download the data files used by this assignment into the working directory.
# Only the standard library is used, so this cell also runs in a fresh runtime
# without relying on a `requests` import that the notebook never performs.
import shutil
import urllib.request

file_names = ["UID_ISO_FIPS_LookUp_Table.csv", "02-28-2023.csv", "imdb_top250.csv"]
for file_name in file_names:
    file_url = f"https://raw.githubusercontent.com/datainpoint/data-viz-with-modern-ds-2023/main/{file_name}"
    # urlopen raises urllib.error.HTTPError for a 4xx/5xx response, so an error
    # page is never saved to disk as if it were a CSV file. The timeout keeps
    # the cell from hanging indefinitely on a stalled connection.
    with urllib.request.urlopen(file_url, timeout=30) as response, open(file_name, "wb") as f:
        shutil.copyfileobj(response, f)

## 01. Define a Python function `import_lookup_table_and_daily_report()` that imports `UID_ISO_FIPS_LookUp_Table.csv` and `02-28-2023.csv` from the working directory.

In [None]:
def import_lookup_table_and_daily_report() -> tuple:
    """
    Import the lookup table and the daily report from the working directory.

    Intended to read `UID_ISO_FIPS_LookUp_Table.csv` and `02-28-2023.csv` and
    return them as a `(lookup_table, daily_report)` tuple of DataFrames.
    The solution body is left blank for the student, so the function returns
    None until it is filled in.

    Expected behavior once solved:

    >>> lookup_table, daily_report = import_lookup_table_and_daily_report()
    >>> type(lookup_table)
    pandas.core.frame.DataFrame
    >>> type(daily_report)
    pandas.core.frame.DataFrame
    >>> lookup_table.shape
    (4321, 12)
    >>> daily_report.shape
    (4016, 14)
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 02. Define a Python function `inner_join_lookup_table_and_daily_report()` that inner joins `UID_ISO_FIPS_LookUp_Table.csv` and `02-28-2023.csv` from the working directory on `Combined_Key`, then selects the specified columns from the joined result.

In [None]:
def inner_join_lookup_table_and_daily_report() -> pd.core.frame.DataFrame:
    r"""
    Inner join the lookup table and the daily report on `Combined_Key`.

    Intended to return a DataFrame restricted to the nine columns shown in
    the example below. The solution body is left blank for the student, so
    the function returns None until it is filled in.

    The docstring is raw so that the trailing backslash in the printed
    example (the marker for wrapped columns) is kept as written.

    Expected behavior once solved:

    >>> lookup_table_and_daily_report = inner_join_lookup_table_and_daily_report()
    >>> type(lookup_table_and_daily_report)
    pandas.core.frame.DataFrame
    >>> lookup_table_and_daily_report.shape
    (4014, 9)
    >>> print(lookup_table_and_daily_report)
              Admin2 Province_State Country_Region             Combined_Key   
    0            NaN            NaN    Afghanistan              Afghanistan  \
    1            NaN            NaN        Albania                  Albania   
    2            NaN            NaN     Antarctica               Antarctica   
    3            NaN            NaN        Algeria                  Algeria   
    4            NaN            NaN        Andorra                  Andorra   
    ...          ...            ...            ...                      ...   
    4009  Sweetwater        Wyoming             US  Sweetwater, Wyoming, US   
    4010       Teton        Wyoming             US       Teton, Wyoming, US   
    4011       Uinta        Wyoming             US       Uinta, Wyoming, US   
    4012    Washakie        Wyoming             US    Washakie, Wyoming, US   
    4013      Weston        Wyoming             US      Weston, Wyoming, US   

          Confirmed  Deaths  Population  Incident_Rate  Case_Fatality_Ratio  
    0        209322    7896  38928341.0     537.711073             3.772179  
    1        334391    3598   2877800.0   11619.674752             1.075986  
    2            11       0         NaN            NaN             0.000000  
    3        271441    6881  43851043.0     619.006941             2.534989  
    4         47866     165     77265.0   61950.430337             0.344712  
    ...         ...     ...         ...            ...                  ...  
    4009      12499     139     42343.0   29518.456415             1.112089  
    4010      12130      16     23464.0   51696.215479             0.131904  
    4011       6401      43     20226.0   31647.384555             0.671770  
    4012       2750      50      7805.0   35233.824471             1.818182  
    4013       1905      23      6927.0   27501.082720             1.207349  

    [4014 rows x 9 columns]
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 03. Define a Python function `create_combined_keys()` that recreates the `Combined_Key` column from the output of `inner_join_lookup_table_and_daily_report()`, but concatenates `Country_Region`, `Province_State`, and `Admin2` in that order with `-` as the separator.

In [None]:
def create_combined_keys() -> pd.core.series.Series:
    """
    Build an alternative combined key for every row of the joined data.

    Intended to return a Series in which `Country_Region`, `Province_State`
    and `Admin2` are joined with `-`; as the example shows, rows without a
    province or county keep only the country name. The solution body is left
    blank for the student, so the function returns None until it is filled in.

    Expected behavior once solved:

    >>> combined_keys = create_combined_keys()
    >>> type(combined_keys)
    pandas.core.series.Series
    >>> combined_keys.shape
    (4014,)
    >>> combined_keys
    0                 Afghanistan
    1                     Albania
    2                  Antarctica
    3                     Algeria
    4                     Andorra
                    ...          
    4009    US-Wyoming-Sweetwater
    4010         US-Wyoming-Teton
    4011         US-Wyoming-Uinta
    4012      US-Wyoming-Washakie
    4013        US-Wyoming-Weston
    Name: Country_Region, Length: 4014, dtype: object
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 04. Define a Python function `calculate_incident_rate_by_country_region()` that replicates the `Incident_Rate` column from the output of `inner_join_lookup_table_and_daily_report()`, but calculated at the country level.

\begin{equation}
\text{Incident Rate} = \frac{\text{Confirmed}}{\text{Population}} \times 100000
\end{equation}

PS. Exclude data with 0 population.

In [None]:
def calculate_incident_rate_by_country_region() -> pd.core.series.Series:
    """
    Calculate the incident rate for each country or region.

    Intended to return a Series indexed by `Country_Region` holding
    Confirmed / Population * 100000, with zero-population data excluded.
    The solution body is left blank for the student, so the function returns
    None until it is filled in.

    Expected behavior once solved:

    >>> incident_rate_by_country_region = calculate_incident_rate_by_country_region()
    >>> type(incident_rate_by_country_region)
    pandas.core.series.Series
    >>> incident_rate_by_country_region.shape
    (196,)
    >>> incident_rate_by_country_region
    Country_Region
    Afghanistan             537.711073
    Albania               11619.674752
    Algeria                 619.006941
    Andorra               61950.430337
    Angola                  320.252363
                              ...     
    Vietnam               11842.084243
    West Bank and Gaza    13784.956961
    Yemen                    40.048994
    Zambia                 1865.822568
    Zimbabwe               1775.700035
    Length: 196, dtype: float64
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 05. Define a Python function `find_countries_incident_rate()` that retrieves the incident rates of the countries in a given `country_list` from the output of `calculate_incident_rate_by_country_region()`.

```python
country_list = ["US", "United Kingdom", "France", "Germany", "Canada", "Korea, South", "Japan", "Singapore", "Australia", "Taiwan*", "New Zealand"]
```

In [None]:
def find_countries_incident_rate() -> pd.core.series.Series:
    """
    Look up the incident rate of the eleven countries in `country_list`.

    Intended to return a Series indexed by `Country_Region`; in the example
    below the values appear in ascending order of incident rate. The solution
    body is left blank for the student, so the function returns None until it
    is filled in.

    Expected behavior once solved:

    >>> countries_incident_rate = find_countries_incident_rate()
    >>> type(countries_incident_rate)
    pandas.core.series.Series
    >>> countries_incident_rate.shape
    (11,)
    >>> countries_incident_rate
    Country_Region
    Canada            12034.670299
    Japan             26335.922781
    US                31128.127922
    United Kingdom    36640.306538
    Singapore         38055.837068
    Taiwan*           41865.185358
    Australia         44644.520556
    Germany           45900.900452
    New Zealand       45951.308542
    France            58459.281558
    Korea, South      59540.664028
    dtype: float64
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 06. Define a Python function `calculate_case_fatality_ratio_by_country_region()` that replicates the `Case_Fatality_Ratio` column from the output of `inner_join_lookup_table_and_daily_report()`, but calculated at the country level.

\begin{equation}
\text{Case Fatality Ratio} = \frac{\text{Deaths}}{\text{Confirmed}} \times 100
\end{equation}

In [None]:
def calculate_case_fatality_ratio_by_country_region() -> pd.core.series.Series:
    """
    Calculate the case fatality ratio for each country or region.

    Intended to return a Series indexed by `Country_Region` holding
    Deaths / Confirmed * 100. The solution body is left blank for the
    student, so the function returns None until it is filled in.

    Expected behavior once solved:

    >>> case_fatality_ratio_by_country_region = calculate_case_fatality_ratio_by_country_region()
    >>> type(case_fatality_ratio_by_country_region)
    pandas.core.series.Series
    >>> case_fatality_ratio_by_country_region.shape
    (201,)
    >>> case_fatality_ratio_by_country_region
    Country_Region
    Afghanistan              3.772179
    Albania                  1.075986
    Algeria                  2.534989
    Andorra                  0.344712
    Angola                   1.836492
                              ...    
    West Bank and Gaza       0.811686
    Winter Olympics 2022     0.000000
    Yemen                   18.074508
    Zambia                   1.182757
    Zimbabwe                 2.145718
    Length: 201, dtype: float64
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 07. Define a Python function named `unify_countries_ir_and_cfr()` that combines the incident rate and the case fatality ratio for the countries in a given `country_list`.

```python
country_list = ["US", "United Kingdom", "France", "Germany", "Canada", "Korea, South", "Japan", "Singapore", "Australia", "Taiwan*", "New Zealand"]
```

In [None]:
def unify_countries_ir_and_cfr() -> pd.core.frame.DataFrame:
    """
    Combine incident rate and case fatality ratio for the listed countries.

    Intended to return a DataFrame with the columns `Country_Region`,
    `Incident_Rate` and `Case_Fatality_Ratio`; in the example below the rows
    appear in ascending order of case fatality ratio with a 0-based integer
    index. The solution body is left blank for the student, so the function
    returns None until it is filled in.

    Expected behavior once solved:

    >>> countries_ir_and_cfr = unify_countries_ir_and_cfr()
    >>> type(countries_ir_and_cfr)
    pandas.core.frame.DataFrame
    >>> countries_ir_and_cfr.shape
    (11, 3)
    >>> print(countries_ir_and_cfr)
        Country_Region  Incident_Rate  Case_Fatality_Ratio
    0        Singapore   38055.837068             0.077345
    1     Korea, South   59540.664028             0.111341
    2      New Zealand   45951.308542             0.114355
    3        Australia   44644.520556             0.170442
    4          Taiwan*   41865.185358             0.177235
    5            Japan   26335.922781             0.217879
    6           France   58459.281558             0.416811
    7          Germany   45900.900452             0.440374
    8   United Kingdom   36640.306538             0.897117
    9               US   31128.127922             1.082642
    10          Canada   12034.670299             1.116822
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 08. Define a Python function `melt_countries_ir_cfr()` that transforms the output of `unify_countries_ir_and_cfr()` from wide format to long format.

In [None]:
def melt_countries_ir_cfr() -> pd.core.frame.DataFrame:
    """
    Reshape the unified incident-rate / fatality-ratio table to long format.

    Intended to return a DataFrame with the columns `Country_Region`,
    `Variable` and `Value`, holding one row per country and measure (22 rows
    for the 11 countries). The solution body is left blank for the student,
    so the function returns None until it is filled in.

    Expected behavior once solved:

    >>> countries_ir_cfr_long = melt_countries_ir_cfr()
    >>> type(countries_ir_cfr_long)
    pandas.core.frame.DataFrame
    >>> countries_ir_cfr_long.shape
    (22, 3)
    >>> print(countries_ir_cfr_long)
        Country_Region             Variable         Value
    0        Singapore  Case_Fatality_Ratio      0.077345
    1     Korea, South  Case_Fatality_Ratio      0.111341
    2      New Zealand  Case_Fatality_Ratio      0.114355
    3        Australia  Case_Fatality_Ratio      0.170442
    4          Taiwan*  Case_Fatality_Ratio      0.177235
    5            Japan  Case_Fatality_Ratio      0.217879
    6           France  Case_Fatality_Ratio      0.416811
    7          Germany  Case_Fatality_Ratio      0.440374
    8   United Kingdom  Case_Fatality_Ratio      0.897117
    9               US  Case_Fatality_Ratio      1.082642
    10          Canada  Case_Fatality_Ratio      1.116822
    11          Canada        Incident_Rate  12034.670299
    12           Japan        Incident_Rate  26335.922781
    13              US        Incident_Rate  31128.127922
    14  United Kingdom        Incident_Rate  36640.306538
    15       Singapore        Incident_Rate  38055.837068
    16         Taiwan*        Incident_Rate  41865.185358
    17       Australia        Incident_Rate  44644.520556
    18         Germany        Incident_Rate  45900.900452
    19     New Zealand        Incident_Rate  45951.308542
    20          France        Incident_Rate  58459.281558
    21    Korea, South        Incident_Rate  59540.664028
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 09. Define a Python function named `import_imdb_top250()` that imports `imdb_top250.csv` from the working directory.

In [None]:
def import_imdb_top250() -> pd.DataFrame:
    r"""
    Import `imdb_top250.csv` from the working directory as-is.

    Intended to return the raw, untidied DataFrame with the five columns
    shown in the example below. The solution body is left blank for the
    student, so the function returns None until it is filled in.

    The docstring is raw so that the trailing backslash in the printed
    example (the marker for wrapped columns) is kept as written.

    Expected behavior once solved:

    >>> imdb_top250 = import_imdb_top250()
    >>> type(imdb_top250)
    pandas.core.frame.DataFrame
    >>> imdb_top250.shape
    (250, 5)
    >>> print(imdb_top250)
         Unnamed: 0                          Rank & Title  IMDb Rating   
    0           NaN  1.  The Shawshank Redemption  (1994)          9.2  \
    1           NaN             2.  The Godfather  (1972)          9.2   
    2           NaN           3.  The Dark Knight  (2008)          9.0   
    3           NaN     4.  The Godfather Part II  (1974)          9.0   
    4           NaN              5.  12 Angry Men  (1957)          9.0   
    ..          ...                                   ...          ...   
    245         NaN           246.  Life of Brian  (1979)          8.0   
    246         NaN          247.  The Iron Giant  (1999)          8.0   
    247         NaN                248.  The Help  (2011)          8.0   
    248         NaN                 249.  Aladdin  (1992)          8.0   
    249         NaN      250.  Dances with Wolves  (1990)          8.0   

                                Your Rating  Unnamed: 4  
    0    12345678910 NOT YET RELEASED  Seen         NaN  
    1    12345678910 NOT YET RELEASED  Seen         NaN  
    2    12345678910 NOT YET RELEASED  Seen         NaN  
    3    12345678910 NOT YET RELEASED  Seen         NaN  
    4    12345678910 NOT YET RELEASED  Seen         NaN  
    ..                                  ...         ...  
    245  12345678910 NOT YET RELEASED  Seen         NaN  
    246  12345678910 NOT YET RELEASED  Seen         NaN  
    247  12345678910 NOT YET RELEASED  Seen         NaN  
    248  12345678910 NOT YET RELEASED  Seen         NaN  
    249  12345678910 NOT YET RELEASED  Seen         NaN  

    [250 rows x 5 columns]
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## 10. Define a Python function named `tidy_imdb_top250()` that is able to tidy the dataframe obtained from the previous question into a desired format.

In [None]:
def tidy_imdb_top250() -> pd.DataFrame:
    """
    Tidy the raw IMDb top-250 table into one clean row per movie.

    Intended to return a DataFrame with the columns `rank`, `title`,
    `release_year` and `rating`, as shown in the example below. The solution
    body is left blank for the student, so the function returns None until it
    is filled in.

    Expected behavior once solved:

    >>> tidied_imdb_top250 = tidy_imdb_top250()
    >>> type(tidied_imdb_top250)
    pandas.core.frame.DataFrame
    >>> tidied_imdb_top250.shape
    (250, 4)
    >>> print(tidied_imdb_top250)
         rank                     title release_year  rating
    0       1  The Shawshank Redemption         1994     9.2
    1       2             The Godfather         1972     9.2
    2       3           The Dark Knight         2008     9.0
    3       4     The Godfather Part II         1974     9.0
    4       5              12 Angry Men         1957     9.0
    ..    ...                       ...          ...     ...
    245   246             Life of Brian         1979     8.0
    246   247            The Iron Giant         1999     8.0
    247   248                  The Help         2011     8.0
    248   249                   Aladdin         1992     8.0
    249   250        Dances with Wolves         1990     8.0

    [250 rows x 4 columns]
    """
    ### BEGIN SOLUTION
    
    ### END SOLUTION

## Running tests

The assignment session is finished; click Runtime -> Restart and run all to run the following tests.

In [None]:
class TestAssignmentSix(unittest.TestCase):
    """Checks for the ten assignment functions defined earlier in this notebook."""

    # Countries that questions 05, 07 and 08 are expected to report on.
    _COUNTRIES = (
        "US", "United Kingdom", "France", "Germany", "Canada", "Korea, South",
        "Japan", "Singapore", "Australia", "Taiwan*", "New Zealand",
    )
    # Index labels spot-checked in the country-level series of questions 04 and 06.
    _SAMPLE_INDEX_LABELS = ("Afghanistan", "Albania", "Yemen", "Zambia", "Zimbabwe")

    def test_01_import_lookup_table_and_daily_report(self):
        lookup, report = import_lookup_table_and_daily_report()
        # Check type first, then shape, for each frame in turn.
        for frame, expected_shape in ((lookup, (4321, 12)), (report, (4016, 14))):
            self.assertIsInstance(frame, pd.DataFrame)
            self.assertEqual(frame.shape, expected_shape)

    def test_02_inner_join_lookup_table_and_daily_report(self):
        joined = inner_join_lookup_table_and_daily_report()
        self.assertIsInstance(joined, pd.DataFrame)
        # Only the number of columns is checked, not the number of rows.
        column_count = joined.shape[1]
        self.assertEqual(column_count, 9)

    def test_03_create_combined_keys(self):
        keys = create_combined_keys()
        self.assertIsInstance(keys, pd.Series)
        expected_keys = (
            "Afghanistan",
            "Albania",
            "US-Wyoming-Uinta",
            "US-Wyoming-Washakie",
            "US-Wyoming-Weston",
        )
        for expected_key in expected_keys:
            self.assertIn(expected_key, keys.values)

    def test_04_calculate_incident_rate_by_country_region(self):
        incident_rates = calculate_incident_rate_by_country_region()
        self.assertIsInstance(incident_rates, pd.Series)
        for label in self._SAMPLE_INDEX_LABELS:
            self.assertIn(label, incident_rates.index)

    def test_05_find_countries_incident_rate(self):
        selected_rates = find_countries_incident_rate()
        self.assertIsInstance(selected_rates, pd.Series)
        self.assertEqual(selected_rates.shape, (11,))
        for country in self._COUNTRIES:
            self.assertIn(country, selected_rates.index)

    def test_06_calculate_case_fatality_ratio_by_country_region(self):
        fatality_ratios = calculate_case_fatality_ratio_by_country_region()
        self.assertIsInstance(fatality_ratios, pd.Series)
        for label in self._SAMPLE_INDEX_LABELS:
            self.assertIn(label, fatality_ratios.index)

    def test_07_unify_countries_ir_and_cfr(self):
        unified = unify_countries_ir_and_cfr()
        self.assertIsInstance(unified, pd.DataFrame)
        self.assertEqual(unified.shape, (11, 3))
        for country in self._COUNTRIES:
            self.assertIn(country, unified["Country_Region"].values)

    def test_08_melt_countries_ir_cfr(self):
        melted = melt_countries_ir_cfr()
        self.assertIsInstance(melted, pd.DataFrame)
        self.assertEqual(melted.shape, (22, 3))
        for country in self._COUNTRIES:
            self.assertIn(country, melted["Country_Region"].unique())
        # The second column is addressed by position, so its name is not checked.
        for measure in ("Case_Fatality_Ratio", "Incident_Rate"):
            self.assertIn(measure, melted.iloc[:, 1].unique())

    def test_09_import_imdb_top250(self):
        raw_movies = import_imdb_top250()
        self.assertIsInstance(raw_movies, pd.DataFrame)
        self.assertEqual(raw_movies.shape, (250, 5))

    def test_10_tidy_imdb_top250(self):
        movies = tidy_imdb_top250()
        self.assertIsInstance(movies, pd.DataFrame)
        self.assertEqual(movies.shape, (250, 4))
        # Titles are read from the second column by position.
        sample_titles = (
            "The Shawshank Redemption",
            "The Godfather",
            "The Dark Knight",
            "Top Gun: Maverick",
            "Dances with Wolves",
        )
        for title in sample_titles:
            self.assertIn(title, movies.iloc[:, 1].values)
        # Every value in the first column must be distinct.
        self.assertEqual(movies.iloc[:, 0].nunique(), 250)
        
# Run every test of TestAssignmentSix with per-test (verbose) reporting.
runner = unittest.TextTestRunner(verbosity=2)
suite = unittest.TestLoader().loadTestsFromTestCase(TestAssignmentSix)
test_results = runner.run(suite)
# Tally the outcome: a test that neither failed nor errored counts as a success.
number_of_test_runs = test_results.testsRun
number_of_errors = len(test_results.errors)
number_of_failures = len(test_results.failures)
number_of_successes = number_of_test_runs - number_of_failures - number_of_errors

In [None]:
# Report how many questions passed out of the number of tests that ran.
summary_template = "You've got {} successes among {} questions."
print(summary_template.format(number_of_successes, number_of_test_runs))