In [None]:
# Star import: later cells use pd, np, glob, os, literal_eval, plt, the matplotlib
# locators, colors_dict and the plotting helpers without importing them here, so they
# are presumably all re-exported by yugiquery — TODO confirm
from yugiquery import *
# Emit the notebook header for the 'Timeline' report (helper from the star import)
header('Timeline')

---

Table of Contents
=================

*   [1  Data preparation](#Data-preparation)
    *   [1.1  Load data](#Load-data)
    *   [1.2  Format data](#Format-data)
    *   [1.3  Merge data](#Merge-data)
    *   [1.4  Save data](#Save-data)
*   [2  Data visualization](#Data-visualization)
    *   [2.1  First releases](#First-releases)
        *   [2.1.1  By region](#By-region)
    *   [2.2  Last releases](#Last-releases)
        *   [2.2.1  By region](#By-region)
    *   [2.3  All Releases](#All-Releases)
        *   [2.3.1  By card type](#By-card-type)
        *   [2.3.2  By primary type](#By-primary-type)
        *   [2.3.3  By secondary type](#By-secondary-type)
        *   [2.3.4  By attribute](#By-attribute)
        *   [2.3.5  By monster type](#By-monster-type)
        *   [2.3.6  By Level/Rank](#By-Level/Rank)
        *   [2.3.7  By ATK](#By-ATK)
        *   [2.3.8  By DEF](#By-DEF)
        *   [2.3.9  By pendulum scale](#By-pendulum-scale)
        *   [2.3.10  By link](#By-link)
*   [3  Debug](#Debug)
    *   [3.1  Merge failed](#Merge-failed)
*   [4  Epilogue](#Epilogue)
    *   [4.1  HTML export](#HTML-export)
<!-- *   [4.2  Git](#Git) -->

# Data preparation

## Load data

In [None]:
# Important dates table, keyed by name and ordered by type, then begin/end
dates_df = (
    pd.read_csv(
        '../Assets/dates.csv',
        index_col='name',
        parse_dates=['begin', 'end'],
        dayfirst=True,
    )
    .sort_values(['type', 'begin', 'end'])
)
# Rows of type 'series' are used as plot backgrounds further down
anime_df = dates_df.loc[dates_df['type'] == 'series']
# NOTE(review): the first two 'rules' rows are skipped; the reason is not visible here — confirm
rules_df = dates_df.loc[dates_df['type'] == 'rules'].iloc[2:]

In [None]:
# Candidate data files, ordered newest-first by os.path.getctime
all_cards_files = glob.glob('../Data/All_cards_*.csv')
all_cards_files.sort(key=os.path.getctime, reverse=True)
set_lists_files = glob.glob('../Data/All_sets_*.csv')
set_lists_files.sort(key=os.path.getctime, reverse=True)

# Cards: read the newest file when at least one exists
if all_cards_files:
    all_cards_df = pd.read_csv(all_cards_files[0], dtype=object)
    # Tuple columns come back from the CSV as text; parse the non-null entries
    for tuple_col in ('Effect type', 'Link Arrows', 'Archseries', 'Artwork', 'Errata'):
        all_cards_df[tuple_col] = all_cards_df[tuple_col].dropna().apply(literal_eval)
    print('Cards file loaded')
else:
    all_cards_df = None
    print('No cards files')

# Set lists: same procedure
if set_lists_files:
    set_lists_df = pd.read_csv(set_lists_files[0], dtype=object)
    for tuple_col in ('Rarity', 'Cover card'):
        set_lists_df[tuple_col] = set_lists_df[tuple_col].dropna().apply(literal_eval)
    print('Sets file loaded')
else:
    set_lists_df = None
    print('No set lists files')

## Format data

In [None]:
# Both inputs are required; stop the notebook early when either one is missing
if all_cards_df is None or set_lists_df is None:
    raise SystemExit("Not enough files to proceed. Aborting!")

for frame in (all_cards_df, set_lists_df):
    # Lower-cased name is the merge key used in the next section
    frame['index'] = frame['Name'].str.lower()
for frame in (all_cards_df, set_lists_df):
    frame['Modification date'] = pd.to_datetime(frame['Modification date'])
set_lists_df['Release'] = pd.to_datetime(set_lists_df['Release'])

## Merge data

In [None]:
# Outer-join cards and set lists on the lower-cased name; `_merge` records each row's origin
full_df = (
    all_cards_df
    .merge(set_lists_df, how='outer', on='index', indicator=True)
    .convert_dtypes()
)
# Keep the most recent modification date of the two sources
full_df['Modification date'] = full_df[['Modification date_x', 'Modification date_y']].max(axis=1)
# Prefer the card-side name, falling back to the set-list name
full_df['Name'] = full_df['Name_x'].fillna(full_df['Name_y'])
full_df = full_df.drop(
    columns=['index', 'Name_x', 'Name_y', 'Modification date_x', 'Modification date_y']
)
# Move the last column ('Name', just added) to the front
full_df = full_df[[full_df.columns[-1], *full_df.columns[:-1]]]

## Save data

In [None]:
# NOTE(review): `timestamp` is not assigned anywhere in the visible notebook; presumably
# it is supplied by the yugiquery star import — confirm, otherwise this cell raises NameError
combined_path = f'../Data/Combined_data_{int(timestamp.timestamp())}.csv'
# The merge indicator is only needed for debugging, so it is left out of the saved file
full_df.drop(columns='_merge').to_csv(combined_path, index=False)
print('Data saved')

# Data visualization

In [None]:
# Allow up to 40 columns in rendered frames
pd.options.display.max_columns = 40
full_df

## First releases

Note: only the first release of each individual card name is counted.

In [None]:
# Earliest release date per card name, plus the row label where it occurs
first_release = (
    full_df.dropna(subset=['Release'])
    .groupby('Name')['Release']
    .agg(['min', 'idxmin'])
)
first_release

In [None]:
# Number of card names whose first release falls on each date
first_release_count = (
    first_release['min']
    .sort_values()
    .value_counts(sort=False)
    .to_frame(name='All cards')
    .rename_axis('First Release')
)
rate_plot(
    first_release_count,
    bg=anime_df,
    vlines=rules_df['begin'],
)


### By region

In [None]:
# Earliest release date per (region, card name), plus the row label where it occurs
first_release_region = (
    full_df.dropna(subset=['Release'])
    .groupby(['Region', 'Name'])['Release']
    .agg(['min', 'idxmin'])
)
first_release_region

In [None]:
# First releases per date, one column per region (dates without releases become 0)
first_release_region_count = (
    first_release_region['min']
    .sort_values()
    .groupby(['Region'])
    .value_counts(sort=False)
    .unstack(0)
    .fillna(0)
    .round(0)
    .rename_axis('Release')
)
rate_subplots(
    first_release_region_count,
    title='First Release',
    bg=anime_df,
    vlines=rules_df['begin'],
)

## Last releases

Note: only the last release of each individual card name is counted.

In [None]:
# Latest release date per card name, plus the row label where it occurs
last_release = (
    full_df.dropna(subset=['Release'])
    .groupby('Name')['Release']
    .agg(['max', 'idxmax'])
)
last_release

In [None]:
# Number of card names whose last release falls on each date
last_release_count = (
    last_release['max']
    .sort_values()
    .value_counts(sort=False)
    .to_frame(name='All cards')
    .rename_axis('Last Release')
)
rate_plot(
    last_release_count,
    bg=anime_df,
    vlines=rules_df['begin'],
)

### By region

In [None]:
# Latest release date per (region, card name), plus the row label where it occurs
last_release_region = (
    full_df.dropna(subset=['Release'])
    .groupby(['Region', 'Name'])['Release']
    .agg(['max', 'idxmax'])
)
last_release_region

In [None]:
# Last releases per date, one column per region (dates without releases become 0)
last_release_region_count = (
    last_release_region['max']
    .sort_values()
    .groupby(['Region'])
    .value_counts(sort=False)
    .unstack(0)
    .fillna(0)
    .round(0)
    .rename_axis('Release')
)
rate_subplots(
    last_release_region_count,
    title='Last Release',
    bg=anime_df,
    vlines=rules_df['begin'],
)

## All Releases

Note: all releases are counted, including reprints.

### By card type

In [None]:
# All releases, includes reprints - Double check
# Unique card names per release date, one column per card type
release_card_type = (
    full_df.groupby(['Card type', 'Release'])['Name']
    .nunique()
    .unstack(0)
    .sort_index()
    .fillna(0)
    .astype(int)
)
# Yearly totals, shown with card types as rows
release_card_type.groupby(release_card_type.index.strftime('%Y')).sum().transpose()

In [None]:
# card_type_colors = [colors_dict[col] for col in release_card_type.columns]
# rate_subplots(release_card_type, colors=card_type_colors, bg=anime_df, vlines = rules_df['begin'])

In [None]:
# One colour per plotted column, looked up in the shared colors_dict
card_type_colors = list(map(colors_dict.__getitem__, release_card_type.columns))
rate_plot(
    release_card_type,
    colors=card_type_colors,
    bg=anime_df,
    vlines=rules_df['begin'],
)

### By primary type

In [None]:
# All releases, includes reprints - Double check
# TODO: sort properly
# Unique card names per release date, one column per primary type
release_primary_type = (
    full_df.groupby(['Primary type', 'Release'])['Name']
    .nunique()
    .unstack(0)
    .sort_index()
    .fillna(0)
    .astype(int)
)
# Yearly totals, shown with primary types as rows
release_primary_type.groupby(release_primary_type.index.strftime('%Y')).sum().transpose()

In [None]:
# primary_type_colors = [colors_dict[col] for col in release_primary_type.columns]
# rate_subplots(release_primary_type, colors=primary_type_colors, bg=anime_df, vlines=rules_df['begin'])

In [None]:
# One colour per plotted column, looked up in the shared colors_dict
primary_type_colors = list(map(colors_dict.__getitem__, release_primary_type.columns))
rate_plot(
    release_primary_type,
    colors=primary_type_colors,
    bg=anime_df,
    vlines=rules_df['begin'],
)

### By secondary type

In [None]:
# All releases, includes reprints - Double check
# TODO: sort properly
# Unique card names per release date, one column per secondary type
release_secondary_type = (
    full_df.groupby(['Secondary type', 'Release'])['Name']
    .nunique()
    .unstack(0)
    .sort_index()
    .fillna(0)
    .astype(int)
)
# Yearly totals, shown with secondary types as rows
release_secondary_type.groupby(release_secondary_type.index.strftime('%Y')).sum().transpose()

In [None]:
# rate_subplots(release_secondary_type, bg=anime_df, vlines = rules_df['begin'])

In [None]:
# Secondary-type releases, with series periods as background and rule changes as vertical lines
rate_plot(
    release_secondary_type,
    bg=anime_df,
    vlines=rules_df['begin'],
)

### By attribute

In [None]:
# All releases, includes reprints - Double check
# TODO: sort properly
# Unique card names per release date, one column per attribute
release_attribute = (
    full_df.groupby(['Attribute', 'Release'])['Name']
    .nunique()
    .unstack(0)
    .sort_index()
    .fillna(0)
    .astype(int)
)
# Yearly totals, shown with attributes as rows
release_attribute.groupby(release_attribute.index.strftime('%Y')).sum().transpose()

In [None]:
# attribute_colors = [colors_dict[col] for col in release_attribute.columns]
# rate_subplots(release_attribute, colors=attribute_colors, bg=anime_df, vlines = rules_df['begin'])

In [None]:
# One colour per plotted column, looked up in the shared colors_dict
attribute_colors = list(map(colors_dict.__getitem__, release_attribute.columns))
# Unlike the other rate plots in this notebook, this one passes cumsum=True
rate_plot(
    release_attribute,
    colors=attribute_colors,
    bg=anime_df,
    vlines=rules_df['begin'],
    cumsum=True,
)

### By monster type

In [None]:
# All releases, includes reprints - Double check
# TODO: sort properly
# Unique card names per release date, one column per monster type
release_monster_type = (
    full_df.groupby(['Monster type', 'Release'])['Name']
    .nunique()
    .unstack(0)
    .sort_index()
    .fillna(0)
    .astype(int)
)
# Yearly totals, shown with monster types as rows
release_monster_type.groupby(release_monster_type.index.strftime('%Y')).sum().transpose()

In [None]:
# One subplot per monster type, with series periods as background and rule changes as vertical lines
rate_subplots(
    release_monster_type,
    bg=anime_df,
    vlines=rules_df['begin'],
)

### By Level/Rank

In [None]:
# Testing
def boxplot(df):
    """Draw one box plot per year for the first column of ``df``.

    The frame's index must support ``.strftime`` (e.g. a ``DatetimeIndex``);
    rows are grouped by the ``'%Y'`` year of their index. A dashed red line
    connects the yearly means on top of the boxes.

    The caller's frame is not modified: the function works on a copy of the
    first column, so it can be called repeatedly on the same input and is not
    affected by any additional columns the frame may carry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the numeric values to plot.
    """
    col = df.columns[0]
    # Copy only the plotted column so the caller's frame stays untouched
    data = df[[col]].copy()
    data['Year'] = data.index.strftime('%Y')
    # Drops missing values as well as rows whose index produced no year
    data = data.dropna()

    ax = data.boxplot(by='Year', figsize=(16,10))

    mean = data.groupby('Year').mean()

    # Box positions are 1-based, hence the shifted range for the mean line
    ax.plot(list(range(1,len(mean.index)+1)), mean.values, c='r', ls='--', alpha=0.5)

    # One tick per integer when there are few; otherwise let matplotlib choose
    ticks = np.arange(0,data[col].max()+1,1)
    if len(ticks)>15:
        ax.yaxis.set_major_locator(MaxNLocator(11, integer=True))
        ax.yaxis.set_minor_locator(AutoMinorLocator())
    else:
        ax.yaxis.set_major_locator(FixedLocator(ticks))

    # ax.set_ylim([0,5000])
    plt.tight_layout()
    plt.show()

In [None]:
# Numeric Level/Rank values indexed by release date; non-numeric entries are coerced to NaN and dropped
level_box = (
    pd.to_numeric(full_df.set_index('Release')['Level/Rank'], errors='coerce')
    .to_frame()
    .sort_index()
    .dropna()
)
boxplot(level_box)

### By ATK

In [None]:
# Numeric ATK values indexed by release date; non-numeric entries are coerced to NaN and dropped
atk_box = (
    pd.to_numeric(full_df.set_index('Release')['ATK'], errors='coerce')
    .to_frame()
    .sort_index()
    .dropna()
)
boxplot(atk_box)

### By DEF

In [None]:
# Numeric DEF values indexed by release date; non-numeric entries are coerced to NaN and dropped
def_box = (
    pd.to_numeric(full_df.set_index('Release')['DEF'], errors='coerce')
    .to_frame()
    .sort_index()
    .dropna()
)
boxplot(def_box)

### By pendulum scale

In [None]:
# Numeric Pendulum Scale values indexed by release date; non-numeric entries are coerced to NaN and dropped
pendulum_box = (
    pd.to_numeric(full_df.set_index('Release')['Pendulum Scale'], errors='coerce')
    .to_frame()
    .sort_index()
    .dropna()
)
boxplot(pendulum_box)

### By link

In [None]:
# Numeric Link values indexed by release date; non-numeric entries are coerced to NaN and dropped
link_box = (
    pd.to_numeric(full_df.set_index('Release')['Link'], errors='coerce')
    .to_frame()
    .sort_index()
    .dropna()
)
boxplot(link_box)

# Debug

## Merge failed

In [None]:
# How many rows matched on both sides versus only one side of the merge
full_df.loc[:, '_merge'].value_counts()

In [None]:
# Rows present only in the set lists (no matching card).
# A boolean mask selects them directly; the previous where(...).dropna(how='all')
# blanked every other row to NA first and then dropped the blanked rows.
full_df.loc[full_df['_merge'] == 'right_only']

In [None]:
# Rows present only in the card list (no matching set-list entry).
# A boolean mask selects them directly; the previous where(...).dropna(how='all')
# blanked every other row to NA first and then dropped the blanked rows.
full_df.loc[full_df['_merge'] == 'left_only']

# Epilogue

In [None]:
# Emit the notebook footer (helper presumably provided by the yugiquery star import)
footer()

## HTML export

In [None]:
# Save the notebook before the HTML export in the next cell
# (helper presumably provided by the yugiquery star import).
# May need to sleep for a few seconds after saving
save_notebook()

In [None]:
# Export Timeline.ipynb to HTML in the parent directory; cells tagged 'exclude',
# code inputs and input/output prompts are left out of the rendered page
! jupyter nbconvert Timeline.ipynb --output-dir='../' --to=HTML --TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_cell_tags='exclude' --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True

## Git

In [None]:
# Stage the timeline files in the parent directory and the combined data CSVs
! git add "../*[Tt]imeline*" "../Data/Combined_data*"

In [None]:
# Commit with a message of the form 'Timeline update-DDMMYYYY' built from today's date
! git commit -m {"'Timeline update-" + pd.Timestamp.now().strftime("%d%m%Y")+"'"}