In [2]:
import json
import os
from pathlib import Path
from typing import Dict, List, Union

import pandas as pd
import requests

In [3]:
# Remote location handed to get_data() / process_data().
URL: str = "https://bites-data.s3.us-east-2.amazonaws.com/MonthlySales.csv"
# Aggregation names (the same three are passed to .agg() in the yearly summary).
STATS: List[str] = ["sum", "mean", "max"]
# Local cache file: inside $TMP when that variable is set, otherwise in /tmp.
TMP: Path = Path(os.environ.get("TMP", "/tmp"), "MonthlySales.csv")

In [4]:
def get_data(url: str, timeout: float = 10.0) -> Dict[str, str]:
    """Return the JSON document found at ``url``, using a local file cache.

    When the cache file ``TMP`` already exists, its contents are parsed and
    returned without any network access (regardless of ``url``). Otherwise
    the URL is downloaded, parsed as JSON and stored in ``TMP`` so that
    later calls can skip the download.

    Args:
        url (str): The URL where the data is located.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        Dict[str, str]: The dictionary extracted from the data

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        json.JSONDecodeError: If the cached file or the response body is
            not valid JSON.
    """
    if TMP.exists():
        return json.loads(TMP.read_text())

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = json.loads(response.text)
    # Serialise before touching the file so a serialisation failure cannot
    # leave an empty cache behind that later calls would try to parse.
    TMP.write_text(json.dumps(data))
    return data

In [5]:
def process_data(url: str) -> pd.DataFrame:
    """Load the CSV referenced by the metadata found at ``url``.

    The metadata dictionary is obtained through ``get_data`` and the CSV
    location is read from its ``"download_url"`` entry.

    Args:
        url (str): The URL where the data is located.

    Returns:
        pd.DataFrame: Pandas DataFrame generated from the processed data

    Raises:
        KeyError: If the metadata has no ``"download_url"`` entry.
    """
    data = get_data(url)
    # get_data() returns a dict, which read_csv cannot parse; the CSV itself
    # lives at the location stored under "download_url".
    return pd.read_csv(data["download_url"])

In [6]:
# Location of the CSV, taken from the "download_url" entry of the metadata.
csv_file = get_data(URL)["download_url"]

In [7]:
# Load the sales table from the location resolved in the previous cell.
df = pd.read_csv(filepath_or_buffer=csv_file)

In [8]:
# Show the column labels of the loaded frame.
df.columns

Index(['month', 'sales'], dtype='object')

In [9]:
# Parse the date strings held in 'month' and keep only the calendar year.
df['year'] = pd.to_datetime(df['month']).dt.year

In [10]:
# Copy (not rename) the original YYYY-MM-DD values into a new 'date' column;
# 'month' itself stays in the frame.
df['date'] = df['month']

In [11]:
# Overwrite 'month' (previously YYYY-MM-DD) with the month number parsed
# from the 'date' column.
df['month'] = pd.to_datetime(df['date']).dt.month

In [12]:
# Preview the first five rows after adding the derived columns.
df.head(n=5)

Unnamed: 0,month,sales,year,date
0,1,14236.9,2013,2013-01-01
1,2,4519.89,2013,2013-02-01
2,3,55691.01,2013,2013-03-01
3,4,28295.35,2013,2013-04-01
4,5,23648.29,2013,2013-05-01


In [13]:
# Per-year sales summary; the aggregation names come from the STATS constant
# instead of a duplicated literal list.
df.groupby('year')['sales'].agg(STATS)

Unnamed: 0_level_0,sum,mean,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,484247.51,40353.959167,81777.35
2014,470532.51,39211.0425,75972.56
2015,608473.83,50706.1525,97237.42
2016,733947.03,61162.2525,118447.83


In [14]:
# Plain-text rendering of the per-year summary; aggregations come from STATS
# instead of a duplicated literal list.
print(df.groupby('year')['sales'].agg(STATS))

            sum          mean        max
year                                    
2013  484247.51  40353.959167   81777.35
2014  470532.51  39211.042500   75972.56
2015  608473.83  50706.152500   97237.42
2016  733947.03  61162.252500  118447.83


In [15]:
# Monthly sales totals for 2013. 'year' holds integers, so compare against
# the integer literal rather than the string "2013".
df.query('year == 2013').groupby('month')['sales'].sum()

month
1     14236.90
2      4519.89
3     55691.01
4     28295.35
5     23648.29
6     34595.13
7     33946.39
8     27909.47
9     81777.35
10    31453.39
11    78628.72
12    69545.62
Name: sales, dtype: float64

In [21]:
# Sanity check: is the integer 2013 among the values of the 'year' column?
2013 in df['year'].tolist()

True

In [24]:
# Plain-text rendering of the 2013 monthly totals. 'year' holds integers, so
# compare against the integer literal rather than the string "2013".
print(df.query('year == 2013').groupby('month')['sales'].sum())

month
1     14236.90
2      4519.89
3     55691.01
4     28295.35
5     23648.29
6     34595.13
7     33946.39
8     27909.47
9     81777.35
10    31453.39
11    78628.72
12    69545.62
Name: sales, dtype: float64


In [30]:
# 2013 monthly totals as a plain list, ordered by month number (the month
# labels themselves are dropped). 'year' holds integers, so the query uses
# the integer literal rather than the string "2013".
yr_2013 = df.query('year == 2013').groupby('month')['sales'].sum().tolist()

In [34]:
# Pair every total with its 1-based position in the list. The position only
# equals the month number when no month is missing from the data.
[(position, total) for position, total in enumerate(yr_2013, start=1)]

[(1, 14236.9),
 (2, 4519.89),
 (3, 55691.01),
 (4, 28295.35),
 (5, 23648.29),
 (6, 34595.13),
 (7, 33946.39),
 (8, 27909.47),
 (9, 81777.35),
 (10, 31453.39),
 (11, 78628.72),
 (12, 69545.62)]