## Let's first fix card name issues. 
danielbrooks20@gmail.com  

In [1]:
import pandas as pd

import re
from typing import Dict

In [2]:
# Read the raw draft CSVs into memory as plain strings.

# Each split has an original (input) path and a cleaned (output) path.
old_train_path, new_train_path = "Data/original_train.csv", "Data/train.csv"
old_test_path, new_test_path = "Data/original_test.csv", "Data/test.csv"

with open(old_train_path, "r") as train_file:
    train_text = train_file.read()

with open(old_test_path, "r") as test_file:
    test_text = test_file.read()

In [3]:
# Card names come from the m19 rating TSV in our repo.
# Map every name that contains a comma to the same name with commas removed.
df_set = pd.read_csv("Data/m19_rating.tsv", sep="\t")
bad_to_good_name = {
    card_name: card_name.replace(",", "")
    for card_name in df_set["Name"]
    if "," in card_name
}

In [4]:
def fix_text(text: str, bad_to_good_name: Dict = bad_to_good_name) -> str:
    """Normalize card names inside a block of raw draft text.

    Two fixes are applied, in this order:

    1. Numeric suffixes on basic land names are stripped,
       e.g. ``Forest_3`` -> ``Forest`` (any number of digits).
    2. Every key of ``bad_to_good_name`` found in the text is replaced by
       its value.

    Args:
        text: The text to clean.
        bad_to_good_name: Mapping from a name as it appears in ``text`` to
            the name it should be rewritten to. Keys are matched literally,
            not as regular expressions. Defaults to the module-level mapping
            that existed when this function was defined.

    Returns:
        The cleaned text.
    """
    # Fix land names.
    land_names = ["Forest", "Island", "Mountain", "Plains", "Swamp"]
    for name in land_names:
        # Raw string so ``\d`` is a regex digit class rather than an invalid
        # string escape; ``+`` so multi-digit suffixes are removed in full.
        text = re.sub(rf"{re.escape(name)}_\d+", name, text)

    # Fix names with commas.
    # Plain substring replacement: card names are literal text, so regex
    # metacharacters or backslashes in them must not be interpreted.
    for bad_name, good_name in bad_to_good_name.items():
        text = text.replace(bad_name, good_name)

    return text

In [5]:
# Run the same name fixes over both splits.
train_text, test_text = fix_text(train_text), fix_text(test_text)

In [6]:
# Write each cleaned split to its new path (train first, then test).
for out_path, fixed_text in ((new_train_path, train_text), (new_test_path, test_text)):
    with open(out_path, "w") as out_file:
        out_file.write(fixed_text)

## Create card_dict files
Create the card_dict files needed by the data-loading code, built from the new, fixed dataset.

I believe `card_dict` has the form `{card_name (str): card_number (int)}` (or the reverse, with keys and values swapped).

In [7]:
import pickle

In [8]:
# `preprocess` is a local file, not an installed package.
import preprocess # Local file.
# Build the card dictionary from the cleaned training CSV written earlier.
# NOTE(review): presumably maps card name <-> card number; confirm against preprocess.create_dict.
card_dict = preprocess.create_dict("Data/train.csv")

In [9]:
# Persist card_dict to disk as a pickle.
with open('Data/card_dict.pt', 'wb') as pickle_out:
    pickle.dump(
        card_dict,
        pickle_out,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [10]:
# Read the pickled card dict back from disk.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('Data/card_dict.pt', 'rb') as pickle_in:
    b = pickle.load(pickle_in)
# Uncomment to inspect the loaded object:
# print(b)