# Data Extraction

In [2]:
import requests
import json
import numpy as np
from dotenv import load_dotenv
import os
from Crypto.Hash import MD5
import time

**WARNING**: The data extraction process takes about 25 minutes. If you have already run it once, you can skip the extraction cell and instead run the cell that follows it, which loads the API packets that the previous extraction saved to disk.

In [4]:
class DataExtractor:
    """
    Downloads the records of the Marvel API (developer.marvel.com) resources, packet by packet,
    and stores the received packets on disk as JSON.
    """

    # NOTE(review): plain HTTP endpoint; consider HTTPS if the gateway supports it — confirm.
    __url:str = "http://gateway.marvel.com/v1/public/"
    __endpoints:list[str] = [
        "series", 
        "stories", 
        "comics",
        "characters", 
        "events", 
        "creators",
    ]
    __priv_key:str = None
    __pub_key:str = None

    # Number of instances requested per packet
    __limit_per_packet:int = 20
    # Extraction of an endpoint stops once this offset is reached
    __max_offset:int = 1000
    # Number of consecutive failed attempts tolerated for one packet before giving up on the endpoint
    __max_retries:int = 5
    # Seconds to wait before retrying a failed request
    __retry_delay:float = 1.0
    # Folder where the packets of each endpoint are saved
    __output_dir:str = "./data/json"

    def __init__(self):  
        """
        Extracts data from the Marvel API (developer.marvel.com).
        In order to create an object of this class, there must exist a .env file with the API key and private key, 
        necessary to access the API, under the names "PUB_KEY" and "PRIV_KEY".

        Raises:
            ValueError: If "PUB_KEY" or "PRIV_KEY" is not defined.
        """

        load_dotenv()
        self.__priv_key = os.getenv("PRIV_KEY")
        self.__pub_key = os.getenv("PUB_KEY")

        # Fails early with a clear message instead of failing later on every single request
        if not self.__priv_key or not self.__pub_key:
            raise ValueError('Both "PUB_KEY" and "PRIV_KEY" must be defined (e.g. in a .env file).')

    def __format_base_request_url(self, endpoint:str) -> str:
        """
        Formats the URL to be used in a request, using the timestamp, public and private keys and the desired endpoint.

        Args:
            endpoint (str): The endpoint of the request URL.

        Returns:
            str: The formatted request URL.

        Raises:
            ValueError: If the provided endpoint is not valid.
        """

        # Verifies the endpoint exists in the API
        if endpoint not in self.__endpoints:
            raise ValueError(f"{endpoint} is not a valid endpoint.")

        timestamp = time.time()
        timestamp_str = f"ts={timestamp}"

        apikey_str = "apikey=" + self.__pub_key

        # Calculates the MD5 hash of timestamp + private key + public key, needed for the URL
        md5_message = str(timestamp) + self.__priv_key + self.__pub_key
        hash_str = "hash=" + MD5.new(str.encode(md5_message)).hexdigest()

        request_url = self.__url + endpoint + "?" + timestamp_str + "&" + apikey_str + "&" + hash_str
        return request_url
    
    def __make_request(self, request_url:str) -> dict:
        """
        Makes a request to the API.

        Args:
            request_url (str): The request URL.

        Returns:
            dict: The whole decoded JSON body of the response (the packet), metadata included.

        Raises:
            requests.HTTPError: If the received status code is not 200.
        """

        # Makes the request
        response = requests.get(request_url, timeout=180)

        if response.status_code != 200:
            raise requests.HTTPError(f"Received status code {response.status_code} for {response.url}")
        
        # The packet is returned as received; metadata is not filtered here
        return response.json()

    def extract(self) -> dict:
        """
        Extracts the data from all API endpoints and saves the packets of each endpoint to a JSON file.

        A failed request is retried (after a short pause) without advancing the offset. After too many 
        consecutive failures for the same packet, the endpoint is abandoned and only the packets received 
        so far are kept, so that a persistent error cannot block the extraction forever.
        
        Returns:
            The API data, where the key-value pairs are the name of the resource and the list of
            packets (python dictionaries) received for it. The list is shorter than expected, 
            possibly empty, if the endpoint had to be abandoned.
        """

        api_data = {}

        # Makes sure the destination folder exists before the first file is written
        os.makedirs(self.__output_dir, exist_ok=True)

        # For each endpoint, tries to extract the data and add it to the api_data dictionary
        for endpoint in self.__endpoints:

            print(f"Extracting endpoint {endpoint}...")

            endpoint_data = []
            offset = 0
            failures = 0
            more_packets = True
    
            # While still receiving the maximum limit instances per packet, keep requesting the next ones
            while more_packets and offset < self.__max_offset:
                # Tries a request, if it fails the offset is not increased and the request is retried
                try: 
                    base_url = self.__format_base_request_url(endpoint)
                    received_data = self.__make_request(base_url + f"&limit={self.__limit_per_packet}&offset={offset}")

                    # Read inside the try block so that a malformed packet counts as a failed attempt
                    packet_size = len(received_data["data"]["results"])

                except Exception as e:
                    failures += 1
                    print(f"Error extracting endpoint {endpoint}: {e}.")

                    if failures >= self.__max_retries:
                        print(f"Giving up on endpoint {endpoint} at offset {offset} after {failures} consecutive failed attempts.")
                        break

                    time.sleep(self.__retry_delay)
                    continue

                failures = 0
                print(f"Received package with offset {offset}.")

                # Adds packet to the endpoint_data list
                endpoint_data.append(received_data) 

                offset += self.__limit_per_packet

                # A packet that is not full means there is nothing left to request
                more_packets = packet_size == self.__limit_per_packet

            # When all instances from endpoint have been extracted, adds the data to the api_data dictionary
            api_data[endpoint] = endpoint_data
            with open(f"{self.__output_dir}/{endpoint}.json", "w") as f:
                json.dump(endpoint_data, f)
        
        return api_data
    
# Builds the extractor (its constructor loads the .env file and reads PUB_KEY / PRIV_KEY) and downloads every endpoint.
# This is the slow step of the notebook; extract() also writes the packets of each endpoint to ./data/json/.
de = DataExtractor()
data_api = de.extract()

Extracting endpoint series...
Received package with offset 0.
Received package with offset 20.
Received package with offset 40.
Received package with offset 60.
Received package with offset 80.
Received package with offset 100.
Received package with offset 120.
Received package with offset 140.
Received package with offset 160.
Received package with offset 180.
Received package with offset 200.
Received package with offset 220.
Received package with offset 240.
Received package with offset 260.
Received package with offset 280.
Received package with offset 300.
Received package with offset 320.
Received package with offset 340.
Received package with offset 360.
Received package with offset 380.
Received package with offset 400.
Received package with offset 420.
Received package with offset 440.
Received package with offset 460.
Received package with offset 480.
Received package with offset 500.
Received package with offset 520.
Received package with offset 540.
Received package with of

In [25]:
# Reloads the packets saved to disk by a previous extraction, one JSON file per resource
JSON_DIR = "./data/json/"

# Only regular *.json files are loaded; anything else in the folder (hidden files, sub-folders) is ignored
json_files = [
    f for f in os.listdir(JSON_DIR)
    if f.endswith(".json") and os.path.isfile(os.path.join(JSON_DIR, f))
]

data_api = {}
for file_name in json_files:
    # The resource name is the file name without its extension (e.g. "comics.json" -> "comics")
    resource_name = os.path.splitext(file_name)[0]

    # The handle gets its own name so it does not shadow the loop variable
    with open(os.path.join(JSON_DIR, file_name), "r") as packets_file:
        data_api[resource_name] = json.load(packets_file)

# Data Transformation

In [33]:
import pandas as pd

In [80]:
class DataTransformator:
    """
    Converts the raw packets returned by the Marvel API into cleaned Pandas' DataFrames: 
    one per entity and one per many to many relationship.
    """
    
    __data_raw:dict[str, list]
    __data_json:dict[str, list]
    __entity_DFs:dict[str, pd.DataFrame]
    __relationship_DFs:dict[str, pd.DataFrame]

    # Pairs (entity, related entity): the first entity's DataFrame has a column, named after
    # the second one, holding the list of IDs it is related to
    __many_to_many_relationships:list[tuple[str, str]] = [ 
        ("comics", "characters"),
        ("stories", "characters"),
        ("comics", "creators"),
        ("stories", "creators"),
        ("events", "comics"),
    ]

    # Columns, per entity, that hold timestamps as strings
    __timestamp_columns:dict[str, list[str]] = {
        "characters": ["modified"],
        "comics": ["modified"],
        "creators": ["modified"],
        "series": ["modified"],
        "stories": ["modified"],
        "events": ["modified", "start", "end"],
    }

    # Columns, per entity, that hold a collection of resources (converted to a list of IDs)
    __resource_list_columns:dict[str, list[str]] = {
        "stories": ["events", "creators", "series", "comics", "characters"],
        "series": ["comics", "events", "stories", "creators", "characters"],
        "events": ["comics", "series", "stories", "creators", "characters"],
        "creators": ["comics", "series", "stories", "events"],
        "comics": ["stories", "events", "creators", "characters"],
        "characters": ["comics", "series", "stories", "events"],
    }

    # Columns, per entity, that hold a single resource (converted to its ID)
    __single_resource_columns:dict[str, list[str]] = {
        "series": ["next", "previous"],
        "events": ["next", "previous"],
        "comics": ["series"],
    }

    # Columns, per entity, that were deemed irrelevant for the analysis in this notebook
    __irrelevant_columns:dict[str, list[str]] = {
        "events": ["description", "thumbnail", "resourceURI", "urls", "comics", "series", 
                   "stories", "characters", "creators"],
        "stories": ["description", "thumbnail", "resourceURI", "type", "originalIssue", 
                    "creators", "characters", "series", "events"],
        "series": ["description", "thumbnail", "resourceURI", "urls", "comics", "stories", 
                   "creators", "characters", "events"],
        "creators": ["thumbnail", "resourceURI", "urls", "firstName", "middleName", "lastName", 
                     "suffix", "comics", "series", "stories", "events"],
        "characters": ["description", "thumbnail", "resourceURI", "urls", "comics", "series", 
                       "stories", "events"],
        "comics": ["digitalId", "variantDescription", "description", "isbn", "resourceURI", "urls", "upc", 
                   "diamondCode", "ean", "issn", "format", "variants", "textObjects", "collections", 
                   "collectedIssues", "thumbnail", "images", "dates", "prices", "creators", "characters",
                   "stories", "events"],
    }

    # Folder where the transformed DataFrames are saved
    __output_dir:str = "data/dataframes"

    def __init__(self, data_raw:dict[str,list]):
        """
        Converts the data returned by the Marvel API into Pandas' DataFrames, cleaning the data.

        Args:
            data_raw: Raw packets received from the API, identified by the resource name as the dictionary key.
        """

        self.__data_raw = data_raw
        self.__data_json = {}
        self.__entity_DFs = {}
        self.__relationship_DFs = {}

    # STATIC METHODS
    # These are used with pd.DataFrame.apply() in the other class methods

    @staticmethod
    def __convert_timestamp(timestamp:str) -> pd.Timestamp:
        """
        Converts a string in Pandas' Timestamp.
        
        Args:
            timestamp: The timestamp in string type.

        Returns:
            pd.Timestamp: The timestamp in pd.Timestamp format, or None if the value was missing 
            (None/NaN), an empty string or a string starting with "-".
        """

        if isinstance(timestamp, str):
            # Empty strings and strings starting with "-" are treated as invalid dates
            if not timestamp or timestamp.startswith("-"):
                return None
        elif pd.isna(timestamp):
            return None

        return pd.to_datetime(timestamp)
        
    @staticmethod
    def __get_resources_ids(original_dict:dict) -> list[int]:
        """
        Get resources' IDs from a collection of resources.

        Args:
            original_dict: Collection of resources returned by the API.
        
        Returns:
            list[int]: List of IDs of all the resources on the collection.
        """

        # The ID is the last segment of the resource URI
        return [int(item["resourceURI"].split("/")[-1]) for item in original_dict["items"]]
    
    @staticmethod
    def __get_resource_id(original_dict:dict) -> int:
        """
        Get resource's ID from a resource.

        Args:
            original_dict: Resource returned by the API, or a missing value (None/NaN).
        
        Returns:
            int: ID of the resource, or None if there is no resource.
        """

        # Missing resources may show up as None or as NaN, neither of which can be indexed
        if not isinstance(original_dict, dict):
            return None

        # Converted to int to agree with the IDs returned by __get_resources_ids
        return int(original_dict["resourceURI"].split("/")[-1])
    
    @staticmethod
    def __get_sale_date(dates:list[dict]) -> pd.Timestamp:
        """
        Selects the sale date from the list of dates.

        Args:
            dates: List of dates returned from the API.
        
        Returns:
            pd.Timestamp: Sale date, or None if it is absent or invalid.
        """

        for item in dates:
            if item["type"] == "onsaleDate":
                # Same validation as the FOC date, so an invalid date does not abort the transformation
                return DataTransformator.__convert_timestamp(item["date"])

        return None
            
    @staticmethod
    def __get_foc_date(dates:list[dict]) -> pd.Timestamp:
        """
        Selects the FOC date from the list of dates.

        Args:
            dates: List of dates returned from the API.
        
        Returns:
            pd.Timestamp: FOC date, or None if it is absent or invalid.
        """

        for item in dates:
            if item["type"] == "focDate":
                return DataTransformator.__convert_timestamp(item["date"])

        return None
            
    @staticmethod
    def __get_print_price(prices:list[dict]) -> float:
        """
        Selects the print price from the list of prices.

        Args:
            prices: List of prices returned from the API.
        
        Returns:
            float: Print price, or None if it is absent.
        """

        return next((price["price"] for price in prices if price["type"] == "printPrice"), None)
            
    @staticmethod
    def __get_digital_price(prices:list[dict]) -> float:
        """
        Selects the digital price from the list of prices.

        Args:
            prices: List of prices returned from the API.
        
        Returns:
            float: Digital price, or None if it is absent.
        """

        return next((price["price"] for price in prices if price["type"] == "digitalPurchasePrice"), None)
            
    # PRIVATE CLASS METHODS
    # These are used by the public method "transform".

    def __filter_metadata_out(self):
        """
        Removes metadata from the packets returned by the API, stored in self.__data_raw, and stores the result in self.__data_json.
        """
        
        # For each resource, concatenates the results of all its packets into a single list
        for resource, packets in self.__data_raw.items():
            resource_data = [] 
            for packet in packets:
                resource_data.extend(packet["data"]["results"])

            # When all resource's packets are in resource_data, it is added to the __data_json dictionary
            self.__data_json[resource] = resource_data
    
    def __convert_to_dataframes(self):
        """
        Converts the data returned by the API in JSON, stored in self.__data_json, into DataFrame objects, 
        indexed by the "id" field, and stores them into self.__entity_DFs.
        """

        self.__entity_DFs = {
            entity: pd.DataFrame(records).set_index("id")
            for entity, records in self.__data_json.items()
        }

    def __create_new_columns(self):
        """
        Separates columns of type list[dict] with multiple information into multiple columns, each with one information.
        """

        comics = self.__entity_DFs["comics"]

        comics["saleDate"] = comics["dates"].apply(self.__get_sale_date)
        comics["focDate"] = comics["dates"].apply(self.__get_foc_date)

        comics["printPrice"] = comics["prices"].apply(self.__get_print_price)
        comics["digitalPrice"] = comics["prices"].apply(self.__get_digital_price)

    def __convert_datatypes(self):
        """
        Converts timestamp columns from string to pd.Timestamp.
        """

        for entity, columns in self.__timestamp_columns.items():
            entity_df = self.__entity_DFs[entity]
            for column in columns:
                entity_df[column] = entity_df[column].apply(self.__convert_timestamp)

    def __identify_entities_by_id(self):
        """
        Operates on columns whose cells store entities returned from the API, either a collection 
        of resources or a single resource. Extracts only the ID of each entity, making the cells 
        of type list[int] for collections and int (or None) for single resources.
        """

        for entity, columns in self.__resource_list_columns.items():
            entity_df = self.__entity_DFs[entity]
            for column in columns:
                entity_df[column] = entity_df[column].apply(self.__get_resources_ids)

        for entity, columns in self.__single_resource_columns.items():
            entity_df = self.__entity_DFs[entity]
            for column in columns:
                entity_df[column] = entity_df[column].apply(self.__get_resource_id)

    def __drop_irrelevant_columns(self):
        """
        Drops columns that were deemed irrelevant for the analysis in this notebook.
        """

        for entity, columns in self.__irrelevant_columns.items():
            self.__entity_DFs[entity] = self.__entity_DFs[entity].drop(columns=columns)
    
    def __create_relationship_dataframes(self):
        """
        Create dataframes to store many to many relationships, using columns that store lists of foreign keys.
        Must run after the IDs were extracted and before the foreign key columns are dropped.
        """

        self.__relationship_DFs = {}

        # For all the identified many to many relationships
        for entity1, entity2 in self.__many_to_many_relationships:

            relationship_df = (
                # Selects only the column with the list of foreign keys and explodes that column
                self.__entity_DFs[entity1][[entity2]]
                .explode(entity2)
                # Drop instances of entity1 that had no foreign keys (empty list)
                .dropna()
                # Move IDs of entity1 from dataframe index to column
                .reset_index()
            )

            relationship_df.columns = [entity1, entity2]

            # Exploding leaves the foreign keys in an object column; they are all integer IDs at this point
            relationship_df[entity2] = relationship_df[entity2].astype("int64")

            self.__relationship_DFs[f"{entity1}_{entity2}"] = relationship_df
            
    
    def transform(self, data_raw:dict[str, list] = None) -> dict[str, pd.DataFrame]:
        """
        Cleans the data from the raw packages received from the Marvel API to Pandas' DataFrames. Uses preferentially the data 
        passed as argument, if it's None uses the data passed when creating the class or when this method was last called.
        The generated DataFrames are also saved to disk as parquet files.

        Args:
            data_raw: Raw packets received from the API, identified by the resource name as the dictionary key.

        Returns:
            dict[str, pd.DataFrame]: Generated DataFrames, identified by the name of the entity or the relationship.
        """

        # If the data is passed as an argument, use it
        if data_raw is not None:
            self.__data_raw = data_raw

        # Apply transformations
        print("Removing metadata...")
        self.__filter_metadata_out()
        print("Converting to dataFrames,,,")
        self.__convert_to_dataframes()
        print("Creating new columns,,,")
        self.__create_new_columns()
        print("Convering datatypes,,,")
        self.__convert_datatypes()
        print("Extracting IDs...")
        self.__identify_entities_by_id()
        # The relationships need the foreign key columns, so they are created before those columns are dropped
        print("Creating many to many relationships dataframes...")
        self.__create_relationship_dataframes()
        print("Dropping irrelevant columns...")
        self.__drop_irrelevant_columns()

        # Save transformed data to disk
        print("Saving transformed data to disk...")
        os.makedirs(self.__output_dir, exist_ok=True)
        for entity, df in self.__entity_DFs.items():
            df.to_parquet(f"{self.__output_dir}/entity_{entity}.parquet")
        for relationship, df in self.__relationship_DFs.items():
            df.to_parquet(f"{self.__output_dir}/relationship_{relationship}.parquet")

        return self.__entity_DFs | self.__relationship_DFs
    
# Runs the whole cleaning pipeline on the raw API packets held in data_api.
# transform() prints its progress, writes every DataFrame to data/dataframes/ as parquet and returns
# a single dictionary with the entity DataFrames and the relationship DataFrames.
dt = DataTransformator(data_raw=data_api)
data_DFs = dt.transform()

Removing metadata...
Converting to dataFrames,,,
Creating new columns,,,
Convering datatypes,,,
Extracting IDs...
Creating many to many relationships dataframes...
Dropping irrelevant columns...
Saving transformed data to disk...


## Entities DataFrames

In [89]:
data_DFs["characters"].head(5)

Unnamed: 0_level_0,name,modified
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1011334,3-D Man,2014-04-29 14:18:17-04:00
1017100,A-Bomb (HAS),2013-09-18 15:54:04-04:00
1009144,A.I.M.,2013-10-17 14:41:30-04:00
1010699,Aaron Stack,1969-12-31 19:00:00-05:00
1009146,Abomination (Emil Blonsky),2014-06-27 19:39:07-04:00


In [90]:
data_DFs["creators"].head(5)

Unnamed: 0_level_0,fullName,modified
id,Unnamed: 1_level_1,Unnamed: 2_level_1
13970,#O,2019-12-11 17:10:07-05:00
13971,#X,2019-12-11 17:21:29-05:00
6606,A.R.K.,2007-01-02 00:00:00-05:00
1168,All Thumbs Creative,2018-07-24 11:50:20-04:00
7470,ALSJOERDSMA,2007-01-02 00:00:00-05:00


In [91]:
data_DFs["events"].head(5)

Unnamed: 0_level_0,title,modified,start,end,next,previous
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
116,Acts of Vengeance!,2013-06-28 16:31:24-04:00,1989-12-10,2008-01-04,240.0,233.0
227,Age of Apocalypse,2014-06-13 11:42:39-04:00,1995-03-01,1996-06-01,239.0,219.0
314,Age of Ultron,2014-03-25 15:39:52-04:00,2013-03-06,2013-06-19,315.0,311.0
303,Age of X,2013-06-28 18:34:27-04:00,2011-01-26,2011-05-01,302.0,296.0
329,All-New All-Different Marvel,2015-10-01 17:37:36-04:00,NaT,NaT,,


In [92]:
data_DFs["series"].head(5)

Unnamed: 0_level_0,title,startYear,endYear,rating,type,modified,next,previous
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
31445,Fantastic Four by Dan Slott Vol. 1 (2021),2021,2021,,collection,2020-07-29 09:04:18-04:00,,
26024,Superior Spider-Man Vol. 2: Otto-matic (2019),2019,2019,,collection,2019-12-13 16:23:45-05:00,,
18454,100th Anniversary Special (2014),2014,2014,Rated T,limited,2019-10-01 18:42:55-04:00,,
13379,15 Love (2011),2011,2011,,,2015-09-22 17:11:46-04:00,,
13380,15-Love GN-TPB (2013 - Present),2013,2099,,,2011-10-14 16:22:04-04:00,,


In [93]:
data_DFs["stories"].head(5)

Unnamed: 0_level_0,title,modified,comics
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,"Investigating the murder of a teenage girl, Ca...",1969-12-31 19:00:00-05:00,[941]
8,"In the wake of September 11th, the world watch...",1969-12-31 19:00:00-05:00,[942]
9,Ordinary New York City cop Frankie &QUOT;Gunz&...,1969-12-31 19:00:00-05:00,[943]
10,"In this thought-provoking anthology, a world-c...",2014-01-27 00:00:00-05:00,[944]
11,Interior #11,1969-12-31 19:00:00-05:00,[945]


In [94]:
data_DFs["comics"].head(5)

Unnamed: 0_level_0,title,issueNumber,modified,pageCount,series,saleDate,focDate,printPrice,digitalPrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
82967,Marvel Previews (2017),0.0,2019-11-07 08:46:15-05:00,112,23665,2099-10-30 00:00:00-05:00,2019-10-07 00:00:00-04:00,0.0,
82965,Marvel Previews (2017),0.0,2019-08-21 17:11:27-04:00,152,23665,2099-08-28 00:00:00-05:00,2019-08-05 00:00:00-04:00,0.0,
82970,Marvel Previews (2017),0.0,2020-02-07 09:35:32-05:00,112,23665,2099-01-29 00:00:00-05:00,2020-01-06 00:00:00-05:00,0.0,0.0
15094,Silver Surfer (1987),0.0,,0,2288,2029-12-31 00:00:00-05:00,,0.0,
1886,Official Handbook of the Marvel Universe (2004...,12.0,,0,787,2029-12-31 00:00:00-05:00,,3.99,


## Relationship DataFrames

In [95]:
data_DFs["comics_characters"].head(5)

Unnamed: 0,comics,characters
0,1886,1009156
1,1886,1009197
2,1886,1009243
3,1886,1009313
4,1886,1009349


In [96]:
data_DFs["comics_creators"].head(5)

Unnamed: 0,comics,creators
0,82967,10021
1,82970,10021
2,1886,907
3,1886,887
4,1886,902


In [97]:
data_DFs["events_comics"].head(5)

Unnamed: 0,events,comics
0,116,12744
1,116,12746
2,116,7188
3,116,7189
4,116,7190


In [98]:
data_DFs["stories_characters"].head(5)

Unnamed: 0,stories,characters
0,236,1010892
1,354,1009288
2,459,1009175
3,459,1009478
4,459,1009327


In [99]:
data_DFs["stories_creators"].head(5)

Unnamed: 0,stories,creators
0,10,14458
1,10,13567
2,10,5187
3,10,365
4,10,4282


# Data Analysis

In [100]:
import altair