# **archive_deck_list**
---

<br><br><br><br>


## **Variable Inputs**

---


In [1]:
### ====================================
# Input locations (Windows paths, written as raw strings).
# Paper decks are read from a single flat directory; digital decks use a glob
# pattern that is expanded with glob.glob, yielding one entry per match.
input_dir_paper_decks = r'C:\git\mtg-proj\data\input\helvault_csv'
input_dir_digital_decks = r'C:\git\mtg-proj\data\input\moxfield_txt\*'
input_deck_prefix = "deck-"  # USER INPUT - only paper files whose name starts with this prefix are processed

### ====================================
# Output locations: one sub-directory per deck is created beneath each of these.
output_dir_paper = r'C:\git\mtg-proj\data\output\deck_lists\paper'
output_dir_digital = r'C:\git\mtg-proj\data\output\deck_lists\digital'

<br><br><br><br>



## **Objectives**

---


-   Take deck lists in the .txt format from a specified directory and convert them to .csv files with 2 fields: count and name.
    -   In the Scryfall name format for MDFCs
    -   update paper decklist (active and archive)
        -   with logic for comparing formatted deck lists (as pandas dfs)

<br>

-   Inputs:
    -   Decklists from moxfield.com
        -   in the MTGO format of
            -   1 Black Ritual
            -   4 Forest
            -   etc.

<br>

-   Outputs:
    -   deck lists in .csv format
        -   2 fields: count and name


<br><br>


### **Visualizing ETL pipeline (example)**


In [2]:
# # graphviz data pipeline example...
# import graphviz

# # Name of file and comment as arguments
# graphviz_file_name  = 'Example Pipeline'
# graphviz_comment    = 'Example Pipeline'
# pipeline_graph = graphviz.Digraph(graphviz_file_name, comment = graphviz_comment)

# # Defining alias and display name of each node
# pipeline_graph.node('1', '1 - Input')
# pipeline_graph.node('2', '2 - Cleaning')
# pipeline_graph.node('3', '3 - Formatting')
# pipeline_graph.node('4', '4 - Processing')
# pipeline_graph.node('5', '5 - Analysis')
# pipeline_graph.node('6', '6 - Outputs')

# # Defining connections between nodes
# pipeline_graph.edges(['12', '23', '34', '45'])
# pipeline_graph.edge('1', '6', constraint='false')

# # save file to path
# #pipeline_graph.render(directory='graphviz-output')

# # print/render the graph
# pipeline_graph

In [3]:
#stop

<br><br><br><br>

## **Imports / Environment Setup**

---


### **Imports and Settings**


In [4]:
# imports
# import project_path
import pandas as pd
import numpy as np
from fp_data_toolbox import eda, notifier
from datetime import date
import time

# Wall-clock time (seconds since the epoch) captured when this setup cell runs.
# NOTE(review): `ts` is not read by any cell visible in this notebook - confirm it is still needed.
ts = time.time()
# `notifier` comes from the project-local fp_data_toolbox package; its behaviour is not visible here.
notifier.setup() # Enable for windows toast notifications on Jupyter cell complete

In [5]:
#stop

<br><br><br><br>

## **Variables**

---


In [6]:
# Run date as an ISO 'YYYY-MM-DD' string; it is embedded in archive file names.
# (alternative kept for reference: curr_dt = eda.get_curr_dt())
curr_dt = date.today().isoformat()

# File extensions used when matching input files.
csv_suffix, txt_suffix = ".csv", ".txt"


<br><br><br><br>

## **Ingestion/Preprocessing**

---

### **Imports**

In [7]:
import os
import glob
from pathlib import Path

<br><br><br><br>

### **Paper deck list**

In [9]:
### Paper deck lists
### For every '<input_deck_prefix><deck name>.csv' file in the input directory:
###   1. reduce it to the two columns count / name,
###   2. compare it with the active deck list already on disk (if any),
###   3. when it is new or different, write the active copy plus a dated archive copy.


def _comparable_paper_deck(deck):
    """Return `deck` reduced to count/name in a canonical row order.

    Counts are coerced to float64 and names to str (missing names become '')
    so that a freshly ingested frame and one re-read from disk have matching
    dtypes and compare equal with ``DataFrame.equals`` exactly when they list
    the same cards with the same quantities.
    """
    comparable = deck.reindex(['count', 'name'], axis=1)
    comparable['count'] = pd.to_numeric(comparable['count'], errors='coerce').astype('float64')
    names = comparable['name'].astype(object)
    comparable['name'] = names.where(names.notna(), '').astype(str)
    return comparable.sort_values(by=['name', 'count'], ignore_index=True)


### ====================================
directory = input_dir_paper_decks
for filename in os.scandir(directory):
    if not filename.is_file():
        continue

    ### [x] pull just the file name; i.e.: 'deck-Anowon-v1.00.csv'
    in_file_nm = filename.name  # DirEntry.name avoids hand-splitting the path on '\\'
    if not (in_file_nm.endswith(csv_suffix) and in_file_nm.startswith(input_deck_prefix)):
        continue
    filename_str = in_file_nm[:-len(csv_suffix)]

    print('input.csv detected: '+filename.path)

    # Slice the prefix off instead of splitting on it, so deck names that
    # themselves contain the prefix text are kept intact.
    deck_nm = filename_str[len(input_deck_prefix):]

    ### [x] naming convention matches the one setup in C:\git\mtg-proj\helvault_csv_inputs
    output_dir_paper_stg = output_dir_paper+'\\'+deck_nm
    output_path_paper = output_dir_paper_stg+'\\'+'deck-'+deck_nm+'.csv'
    arch_dir_paper = output_dir_paper+'\\'+deck_nm+'\\'+'.archive'+'\\'
    arch_path_paper = arch_dir_paper+'deck-'+deck_nm+'.'+curr_dt+'.csv'

    ### ingest .csv as df here
    df_stg = pd.read_csv(filename.path)

    ### ====================================
    ### [x] Data cleaning operations: sort, keep quantity/name, rename quantity -> count
    df = (
        df_stg
        .sort_values(by=['name', 'quantity'], ignore_index=True)
        .reindex(['quantity', 'name'], axis=1)
        .rename(columns={"quantity": "count"})
    )
    ### [x] df is just count and name after this

    ### ====================================
    ### [x] check whether current file out already exists
    if Path(output_path_paper).is_file():
        # The active file is written below WITH a header row, so it must be
        # read back with one; header=None would turn the header into a card row.
        df_existing = pd.read_csv(output_path_paper)

        # Compare values, not index labels: `0 in some_series` tests the index
        # and is True for any non-empty frame, which made every deck look
        # unchanged. Canonicalised frames are equal only if the card names
        # and quantities all match.
        df_equals_bool = _comparable_paper_deck(df).equals(_comparable_paper_deck(df_existing))

        if df_equals_bool: ### [x] if the two dfs equal eachother, escape operation
            print("input == active; escaping this iteration")
            continue
        print("input <> active; writing new active and archive versions")

        ### TODO [ ] return the differences between input and active as two
        ###          separate dfs (cards in / cards out) and write them out
        # changes_dir = ''
        # changes_in_path = changes_dir+'\\'+''
        # changes_out_path = changes_dir+'\\'+''
    else:
        print(f'File **{output_path_paper}** does not exist')

    ### Create target directory & all intermediate directories if they don't exist.
    ### Done for both branches so the archive write cannot fail on a missing folder.
    for needed_dir in (output_dir_paper_stg, arch_dir_paper):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)
            print("Directory " , needed_dir ,  " Created ")

    ### outputs
        ### [x] main deck list record (output_path_paper)
        ### [x] archive version (arch_path_paper)
        ### [ ] in / out lists of differences between DFs
    df.to_csv(output_path_paper,index=False)
    df.to_csv(arch_path_paper,index=False)
    ### TODO [ ] moxfield output formatting (.txt with just name and count), e.g.
    # df.to_csv(r'c:\data\pandas.txt', header=None, index=None, sep=' ', mode='a')

.gitkeep
bndr-main-Commanders
bndr-main-Mana Base
bndr-main-Unique & Misc
box-Bulk Legit Staples
box-Bulk Legit
box-Bulk Proxy
deck-Anowon
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Anowon.csv
input == active; escaping this iteration
deck-Beginner Yangling
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Beginner Yangling.csv
input == active; escaping this iteration
deck-Breena
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Breena.csv
input == active; escaping this iteration
deck-Galea
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Galea.csv
input == active; escaping this iteration
deck-Nghathrod
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Nghathrod.csv
input == active; escaping this iteration
deck-Obuun
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Obuun.csv
input == active; escaping this iteration
deck-Prosper
input.csv detected: C:\git\mtg-proj\helvault_csv_inputs\deck-Prosper.csv
input == acti

In [None]:
#stop

### **Digital deck list ingestion**

In [10]:
### Digital deck lists
### For every .txt deck list found in the folders matched by the input glob:
###   1. split each 'N Card Name' line into count / name,
###   2. compare it with the active deck list already on disk (if any),
###   3. when it is new or different, write the active copy plus a dated archive copy.
### The deck name is taken from the folder that contains the .txt file.

### [ ] update the below with new fields to be created on the output


def _comparable_digital_deck(deck):
    """Return `deck` reduced to count/name in a canonical row order.

    Counts parsed from the .txt are strings while counts re-read from the
    active .csv are integers, so both are coerced to float64 (and names to
    str, missing names becoming '') before sorting. Two canonical frames are
    equal under ``DataFrame.equals`` exactly when they list the same cards
    with the same quantities.
    """
    comparable = deck.reindex(['count', 'name'], axis=1)
    comparable['count'] = pd.to_numeric(comparable['count'], errors='coerce').astype('float64')
    names = comparable['name'].astype(object)
    comparable['name'] = names.where(names.notna(), '').astype(str)
    return comparable.sort_values(by=['name', 'count'], ignore_index=True)


directory = input_dir_digital_decks
### ====================================
### looping + iterating on 'directory' for all deck list .txt files
listing = glob.glob(directory)
for foldername in listing:
    # the glob can also match plain files; os.scandir would raise on those
    if not os.path.isdir(foldername):
        continue
    deck_nm = Path(foldername).name

    for filename in os.scandir(foldername):
        if not (filename.is_file() and filename.path.endswith(txt_suffix)):
            continue
        print('txt search hit')
        # ====================================
        output_dir_df=output_dir_digital+'\\'+deck_nm
        arch_dir_df=output_dir_df+'\\'+'.archive'

        ### [ ] change the below naming convention to match the one setup in C:\git\mtg-proj\helvault_csv_inputs
        output_path_df=output_dir_df+'\\'+'deck-'+deck_nm+'.csv'
        arch_path_df=arch_dir_df+'\\'+'deck-'+deck_nm+'-'+curr_dt+'.csv'

        ### read the text file one line per row; dtype=str keeps every line
        ### as text so the .str accessor below always applies
        df_stg = pd.read_csv(filename.path, sep='\t', header=None, names=['count_nm'], dtype=str)

        ### Data cleaning: split on the FIRST space only -> [count, name].
        ### reindex guarantees both columns exist even if no line has a space.
        split_df = (
            df_stg['count_nm']
            .str.split(" ", expand=True, n=1)
            .reindex(columns=[0, 1])
        )
        df = split_df.rename(columns={0: "count", 1: "name"})

        ### ====================================
        if Path(output_path_df).is_file():
            print(f'File {output_path_df} exists')

            # The active file is written below WITH a header row, so it must
            # be read back with one; header=None would turn the header into
            # a card row.
            df_existing = pd.read_csv(output_path_df)

            # Compare values, not index labels: `0 in some_series` tests the
            # index and is True for any non-empty frame, which made every
            # deck look unchanged.
            df_equals_bool = _comparable_digital_deck(df).equals(_comparable_digital_deck(df_existing))

            if df_equals_bool: ### [x] if the two dfs equal eachother, escape operation
                print("input == active; escaping this iteration")
                continue
            print("input <> active; writing new active and archive versions")

            ### TODO [ ] return input and output differences in separate dfs
        else:
            print(f'File {output_path_df} does not exist')

        ### Create target directory & all intermediate directories if they don't exist.
        ### Done for both branches so the archive write cannot fail on a missing folder.
        for needed_dir in (output_dir_df, arch_dir_df):
            if not os.path.exists(needed_dir):
                os.makedirs(needed_dir)
                print("Directory " , needed_dir ,  " Created ")

        ### outputs
            ### [x] main deck list record (output_path_df)
            ### [x] archive version (arch_path_df)
            ### [ ] in / out lists of differences between DFs
        df.to_csv(output_path_df,index=False)
        df.to_csv(arch_path_df,index=False)

txt search hit
File C:\git\mtg-proj\deck_lists\digital\Pako\deck-Pako.csv exists
input == active; escaping this iteration


In [None]:
#stop

<br><br><br><br>