In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
# Load the raw dinosaur CSV into `df`; the cleaning cells below all start from this frame.
df = pd.read_csv('~/datahacks/datasets/dino.csv')

In [3]:
# Remove two columns that none of the visible cells read.
# `df` is reassigned from itself, so without errors='ignore' a second run of
# this cell would raise KeyError because the columns are already gone.
df = df.drop(columns=['named_by', 'link'], errors='ignore')

In [4]:
def extract_duration(row):
    """Return the length of the numeric range found in a period description.

    Searches ``row`` for the first ``<digits>-<digits>`` range (for example
    ``"74-70"``) and returns the absolute difference between the two numbers.

    Parameters:
    - row: Period text for a single record. Non-string values, such as the
      float NaN pandas uses for a missing cell, are treated as "no range".

    Returns:
    - The absolute difference between the two numbers of the range, or None
      when ``row`` is not a string or contains no such range.
    """
    # re.search raises TypeError on non-string input (e.g. NaN for a missing
    # cell), so report "no duration" instead of crashing the whole .apply().
    if not isinstance(row, str):
        return None

    match = re.search(r'(\d+)-(\d+)', row)
    if match is None:
        return None

    start_year, end_year = (int(part) for part in match.groups())
    # abs() because the range may be written larger-number-first.
    return np.abs(end_year - start_year)

In [5]:
# Period label: the first two whitespace-separated words of the period text
# (e.g. "Late Cretaceous").
pattern = r'^(\w+\s\w+)'
df['time_period'] = df['period'].str.extract(pattern, expand=False)

# Time span in millions of years, parsed from the "<start>-<end>" range.
df['time_span'] = df['period'].apply(extract_duration)

# Keep only the rows for which a time span could be parsed.
jurassic_cleaned = df.loc[df['time_span'].notna()]


In [6]:
# Lookup from the values found in `lived_in` to the continent they belong to.
# Used with Series.map, so any country missing here becomes NaN.
countries_continents = {
    'Argentina': 'South America',
    'Australia': 'Australia',
    'Brazil': 'South America',
    'Canada': 'North America',
    'China': 'Asia',
    'Egypt': 'Africa',
    'France': 'Europe',
    'Germany': 'Europe',
    'India': 'Asia',
    'Japan': 'Asia',
    'Kazakhstan': 'Asia',
    'Lesotho': 'Africa',
    'Madagascar': 'Africa',
    'Malawi': 'Africa',
    'Mongolia': 'Asia',
    'Morocco': 'Africa',
    'Niger': 'Africa',
    'North Africa': 'Africa',
    'Romania': 'Europe',
    'Russia': 'Asia/Europe',
    'South Africa': 'Africa',
    'Spain': 'Europe',
    'Tanzania': 'Africa',
    'Tunisia': 'Africa',
    'USA': 'North America',
    'United Kingdom': 'Europe',
    'Uruguay': 'South America',
    'Uzbekistan': 'Asia',
}

In [7]:
# The raw `period` text is no longer needed once `time_period` and `time_span`
# have been derived from it. `jurassic_cleaned` is reassigned from itself, so
# errors='ignore' keeps this cell re-runnable (otherwise the second run raises
# KeyError because `period` is already gone).
jurassic_cleaned = jurassic_cleaned.drop(columns=['period'], errors='ignore')
# Tag each row with its continent; a `lived_in` value that is not a key of
# `countries_continents` becomes NaN.
jurassic_cleaned['continent'] = jurassic_cleaned['lived_in'].map(countries_continents)
jurassic_cleaned

Unnamed: 0,name,diet,lived_in,type,length,taxonomy,species,time_period,time_span,continent
0,aardonyx,herbivorous,South Africa,sauropod,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,celestae,Early Jurassic,10.0,Africa
1,abelisaurus,carnivorous,Argentina,large theropod,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,comahuensis,Late Cretaceous,4.0,South America
2,achelousaurus,herbivorous,USA,ceratopsian,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,horneri,Late Cretaceous,13.0,North America
3,achillobator,carnivorous,Mongolia,large theropod,5.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,giganteus,Late Cretaceous,15.0,Asia
4,acrocanthosaurus,carnivorous,USA,large theropod,12.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,atokensis,Early Cretaceous,10.0,North America
...,...,...,...,...,...,...,...,...,...,...
303,yinlong,herbivorous,China,ceratopsian,1.2m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,downsi,Mid Jurassic,5.0,Asia
304,yuanmousaurus,herbivorous,China,sauropod,17.0m,Dinosauria Saurischia Sauropodomorpha Sauropod...,jiangyiensis,Mid Jurassic,21.0,Asia
305,yunnanosaurus,omnivorous,China,sauropod,7.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,huangi,Early Jurassic,15.0,Asia
307,zephyrosaurus,herbivorous,USA,euornithopod,1.8m,Dinosauria Ornithischia Genasauria Cerapoda Or...,schaffi,Early Cretaceous,10.0,North America


In [8]:
# Rows labelled exactly 'Late Triassic'; the last line shows (rows, columns).
late_t_df = jurassic_cleaned.loc[jurassic_cleaned['time_period'].eq('Late Triassic')]
late_t_df.shape

(13, 10)

In [9]:
# Rows labelled exactly 'Early Jurassic'; the last line shows (rows, columns).
early_j_df = jurassic_cleaned.loc[jurassic_cleaned['time_period'].eq('Early Jurassic')]
early_j_df.shape

(15, 10)

In [10]:
# Rows labelled exactly 'Mid Jurassic'; the last line shows (rows, columns).
mid_j_df = jurassic_cleaned.loc[jurassic_cleaned['time_period'].eq('Mid Jurassic')]
mid_j_df.shape

(23, 10)

In [11]:
# Rows labelled exactly 'Late Jurassic'; the last line shows (rows, columns).
late_j_df = jurassic_cleaned.loc[jurassic_cleaned['time_period'].eq('Late Jurassic')]
late_j_df.shape

(42, 10)

In [12]:
# Rows labelled exactly 'Early Cretaceous'; the last line shows (rows, columns).
early_c_df = jurassic_cleaned.loc[jurassic_cleaned['time_period'].eq('Early Cretaceous')]
early_c_df.shape

(58, 10)

In [13]:
# Rows labelled exactly 'Late Cretaceous'; the last line shows (rows, columns).
late_c_df = jurassic_cleaned.loc[jurassic_cleaned['time_period'].eq('Late Cretaceous')]
late_c_df.shape

(129, 10)

In [14]:
def separate_dataframes_by_time_period_and_continent(dataframe):
    """
    Separate a dataframe by time period and continent.

    One entry is produced for every pairing of a distinct ``time_period``
    value with a distinct ``continent`` value. Pairings that match no rows are
    kept and map to an empty DataFrame.

    Parameters:
    - dataframe: The input DataFrame containing the data. It must have
      ``time_period`` and ``continent`` columns.

    Returns:
    - A dictionary keyed by ``"<time_period>, <continent>"`` whose values are
      the rows of ``dataframe`` matching that pairing. A missing value in
      either column forms its own bucket, labelled with the string form of
      the missing value (``"nan"`` for NaN).
    """

    def _rows_matching(column, value):
        # NaN never compares equal to itself, so `column == NaN` is all False
        # and rows with a missing value would silently be lost; use isna().
        return column.isna() if pd.isna(value) else column == value

    period_column = dataframe['time_period']
    continent_column = dataframe['continent']

    # Distinct continents do not depend on the period, so compute them once.
    continents = continent_column.unique()

    separated_dataframes = {}
    for time_period in period_column.unique():
        period_mask = _rows_matching(period_column, time_period)
        for continent in continents:
            combined_mask = period_mask & _rows_matching(continent_column, continent)
            separated_dataframes[f"{time_period}, {continent}"] = dataframe[combined_mask]

    return separated_dataframes


# Build the "<time_period>, <continent>" -> DataFrame lookup for the cleaned dataset.
separated_dataframes = separate_dataframes_by_time_period_and_continent(jurassic_cleaned)



In [15]:
# Show every "<time_period>, <continent>" entry with its DataFrame (empty pairings included).
separated_dataframes

{'Early Jurassic, Africa':               name         diet      lived_in      type length  \
 0         aardonyx  herbivorous  South Africa  sauropod   8.0m   
 155  lesothosaurus  herbivorous       Lesotho      1.0m    NaN   
 
                                               taxonomy       species  \
 0    Dinosauria Saurischia Sauropodomorpha Prosauro...      celestae   
 155                            Dinosauria Ornithischia  diagnosticus   
 
         time_period  time_span continent  
 0    Early Jurassic       10.0    Africa  
 155  Early Jurassic       13.0    Africa  ,
 'Early Jurassic, South America': Empty DataFrame
 Columns: [name, diet, lived_in, type, length, taxonomy, species, time_period, time_span, continent]
 Index: [],
 'Early Jurassic, North America':                name         diet lived_in               type length  \
 16       ammosaurus  herbivorous      USA           sauropod   5.0m   
 248  scutellosaurus  herbivorous      USA  armoured dinosaur   1.2m   
 
   