In [1]:
# Setup - copy-pasted from 2A

# Python ≥3.9 is required
import sys
assert sys.version_info >= (3, 9)

# Scikit-Learn ≥1.0 is required
import sklearn
assert sklearn.__version__ >= "1.0"

# Common imports
import numpy as np
import pandas as pd

# To plot pretty figures
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Precision options
np.set_printoptions(precision=2)
pd.options.display.float_format = '{:.3f}'.format

# Statistical analysis and testing
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Ignore useless warnings (see SciPy issue #5998 and seaborn/matplotlib bug)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings(action="ignore", message="The figure layout has changed to tight")

import json
import datetime

In [2]:
f=open("default-cards-20240806090619.json")
raw_data = json.load(f)
# raw_data[0]

In [3]:
todays_date = datetime.datetime(year=2024,month=8,day=6)

Features the program gets NOTHING out of:
    "object", 
    "oracle_id", 
    "arena_id", 
    "multiverse_ids", 
    "tcgplayer_id", 
    "cardmarket_id", 
    "uri", 
    "scryfall_uri", 
    "oracle_text",
    "image_status",
    "image_uris",
    "set_name",
    "set_uri",
    "set_search_uri",
    "scryfall_set_uri",
    "rulings_uri",
    "prints_search_uri",
    "preview",
    "related_uris",
    "purchase_uris",
    "flavor_text",
    "usd",
    "usd_foil",
    "usd_etched",
    "eur",
    "eur_foil",
    "highres_image"
\+ maybe some more

Ordinal features I want to give the program:
Rarity

Categorical features I want to give the program:
Colors





There are some features that ML won't make head or tail of (eg. various URIs, oracle_text)

Some features that I deliberately don't want to give to the program (eg. card prices in paper)

Some features I'm interested in from a data analysis perspective, but will very likely be useless for determining price (eg. )

In [4]:
df = pd.DataFrame.from_dict(raw_data)

In [5]:
# binary_formats = ["legacy","vintage","penny","commander"]
# ban_ternary_formats = ["future","standard","pioneer","modern","pauper","paupercommander"] #just ignore restricted 
# restrict



Legalities:

Some formats are simple binaries. These are:
- vintage - legal or restricted
- legacy - legal or banned
- penny - legal or not_legal
- commander - legal or banned

(((((Some formats are *effectively* binaries. These are:
- duel - legal or banned (ignore restricted, it's just "is this creature legendary))))))

The rest are more complicated. 

(not_legal, legal, banned) formats:
- future
- standard
- pioneer
- modern
- pauper

(not_legal, legal, restricted) formats:


These need to have two variables:
Binary:
- printed_into_{format} -> = not({format}="not_legal")

Then "ternary" - 0, 1 or NaN (use a mean imputer when doing ML)
- banned_in_{format} -> (banned/restricted)=1, legal=0, not_legal=NaN

4 categories:
paupercommander -> (not_legal, legal, restricted, banned)
Just ignore restricted (it's just the same as "is an uncommon creature")

I think I need to take a step back and ignore legality for a bit 

In [6]:
# df.legalities.iloc[0]

In [7]:
# def engineer_legality(format: pd.Series) -> pd.DataFrame:
    
#     engineered_data = data.copy()

#     legalities_df = pd.DataFrame(engineered_data["legalities"].tolist())
    
    

#     engineered_data.

#     return engineered_data

# Description of features
### Describe every feature in detail

DATA CLEANING

Decide to only care about MTGO prices etc

"object" is never anything except "card" - useless
Remove all Un-set cards I think

Target variable: "prices" -> "tix" 
(MTGO economy uses tix as a currency - 1 tix = 1 US dollar)
Remove all prices except for tix ()

Variables to keep:
MTGO ID 
(Name? I want it for readability but it's useless for the algorithm)


Data cleaning:
Remove anything where "mtgo" not in "games" (ie. Alchemy cards)
Remove Un-set cards (I don't wanna deal with CMC 0.5 or 1,000,000)
Potentially remove P9 as outliers?




Problems for data representation: 
Unpack legalities and prices
(Possibly don't give it legality in "penny" though - dead giveaway!)
Keywords: One hot encoding again but that's gonna get hella annoying - consider just teaching it the most common ones?
Prices: Just keep "tix" (if model is very very shit, consider letting it have other prices lol)

Categorical things to one-hot encode, in order of probable utility to the algorithm:
Format legality - Legal, Not Legal, Banned, Restricted (maybe remove "penny" if I'm feeling mean, but start by not doing that)
Rarity - common, uncommon, rare, mythic, special, bonus (note: bonus is literally just the power 9!)
(Don't bother teaching it colours I think)
Keywords


Problems to solve for the algorithm:

I'd like it to be able to understand keywords - ie. if a card has flying, I want the algorithm to take that into account. ATM keywords are stored in a list - hard for ML to understand!


In [8]:
# Order of operations:
# 1: expand important dict features to columns
# 2: remove outlier rows
# 3: remove junk columns
# 4. onehot-encode things I want (eg. keywords)

In [9]:
def expand_dict_features(data: pd.DataFrame) -> pd.DataFrame:
    engineered_data = data.copy()
    legalities_df = pd.DataFrame(engineered_data["legalities"].tolist())
    prices_df = pd.DataFrame(engineered_data["prices"].tolist())
    engineered_data = engineered_data.drop(columns=["legalities","prices"])
    engineered_data = engineered_data.join(legalities_df)
    engineered_data = engineered_data.join(prices_df)
    return engineered_data

In [10]:
def drop_junk_features(data: pd.DataFrame) -> pd.DataFrame:
    junk_columns = [
    # These provide no info - the entire dataset has "card", "en", and null respectively
    "object",
    "lang",
    "attraction_lights",


    # Others that need to gtfo:
    "games",


    # Various IDs
    "oracle_id", 
    "arena_id", 
    "multiverse_ids", 
    "tcgplayer_id", 
    "cardmarket_id",
    "artist_ids",
    "illustration_id", 
    "tcgplayer_etched_id",
    "mtgo_foil_id",


    # Various URIs 
    "uri", 
    "scryfall_uri", 
    "image_status",
    "image_uris",
    "set_name",
    "set_uri",
    "set_search_uri",
    "scryfall_set_uri",
    "rulings_uri",
    "prints_search_uri",
    "related_uris",
    "purchase_uris",
    "card_back_id",


    # Random text-based stuff the algo won't understand
    "oracle_text",
    "flavor_text",
    

    # Misc other stuff irrelevant to data analysis
    "preview",
    "highres_image", 


    # Stuff with <1000 datapoints
    "card_faces",
    "loyalty",          # Tempted to keep this one, but I'll be consistent for now
    "printed_name",
    "flavor_name",
    "color_indicator",
    "printed_type_line",
    "printed_text",
    "variation_of",
    "life_modifier",
    "hand_modifier",
    "content_warning",


    # Misc other stuff I don't care about enough to want to bother with (at least for now)
    #"lang",
    #"artist",
    #"frame",
    #"border_color",
    #"promo",
    #"promo_types",
    #"finishes",
    #"foil",
    #"nonfoil",
    #"set_type",
    #"oversized",
    #"collector_number",
    #"full_art",
    #"textless",
    #"booster",
    #"story_spotlight",
    
    # Maybe come back to these at some point
    #"colors",
    #"color_identity",


    # Prices other than tix (I don't want the program to have these)
    "usd",
    "usd_foil",
    "usd_etched",
    "eur",
    "eur_foil",
    

    # Arena-specific format legalities (irrelevant to MTGO)
    "brawl",
    "historic",
    "standardbrawl",
    "alchemy",
    "explorer", 
    "gladiator",
    "timeless",

    # Other format legalities that no one gives a shit about
    #"future",
    #"paupercommander",
    #"duel",
    #"oldschool",
    #"premodern",
    #"predh",

    # 
]
    engineered_data = data.copy()
    engineered_data = engineered_data.drop(columns=junk_columns)
    return engineered_data


All variables I want to one-hot encode:

- Types
- Keywords
- Set code
- Legalities (one for each format GUHHHH)
- Border color

All variables needing imputation:

- Power
- Toughness
- Mana cost (for transform cards lol)

In [11]:
df2 = expand_dict_features(df)
df3 = drop_junk_features(df2)

In [12]:
#df3[df3.vintage == "not_legal"].name

In [13]:
#df3.mtgo_id

In [14]:
#df3.keywords

In [15]:
#df4.to_clipboard()

In [16]:
#df4[df4.produced_mana.notnull()]

In [17]:
df.head(1)

Unnamed: 0,object,id,oracle_id,multiverse_ids,mtgo_id,arena_id,tcgplayer_id,name,lang,released_at,...,tcgplayer_etched_id,flavor_name,attraction_lights,color_indicator,printed_type_line,printed_text,variation_of,life_modifier,hand_modifier,content_warning
0,card,0000419b-0bba-4488-8f7a-6194544ce91e,b34bb2dc-c1af-4d77-b0b3-a0fb342a5fc6,[668564],129825.0,91829.0,558404.0,Forest,en,2024-08-02,...,,,,,,,,,,


In [18]:
def clean_dataset(data: pd.DataFrame) -> pd.DataFrame:
    # I want to only keep cards that are relevant to actual play of the game, on MTGO
    # Therefore a) remove any card not on MTGO (eg. alchemy-only cards, Fallout cards etc)
    # ... and also need to remove cards which are on mtgo but inexplicably don't have a tix value stored! 

    # b) remove anything "banned" in Vintage (ie. ante, conspiracy, dexterity, subgame cards)
    # ...actually those cards aren't on MTGO anyway! -> unnecessary
    
    engineered_data = data.copy()
    #engineered_data = engineered_data.dropna()
    engineered_data = engineered_data[engineered_data.tix.notna()]
    engineered_data = engineered_data[engineered_data.vintage != "not_legal"]
    engineered_data = engineered_data[engineered_data.vintage != "banned"]
    #engineered_data = engineered_data[engineered_data.colors.notna()] # Temporary fix, removes all DFCs, replace and impute them at some point
    return engineered_data

In [19]:
df4 = clean_dataset(df3)
#df4.info()

In [20]:
def engineer_legalities(data: pd.DataFrame):
    formats = ['standard', 'future', 'pioneer', 'modern', 'legacy', 'pauper', 'vintage', 'penny', 
           'commander', 'oathbreaker', 'paupercommander', 'duel', 'oldschool', 'premodern', 'predh',]
    engineered_data = data.copy()
    text_columns = engineered_data.loc[:,formats]
    bin_columns = (text_columns=="legal").astype("int")
    return engineered_data.drop(columns=formats).join(bin_columns)

In [21]:
df5 = engineer_legalities(df4)
df5

Unnamed: 0,id,mtgo_id,name,released_at,layout,mana_cost,cmc,type_line,colors,color_identity,...,pauper,vintage,penny,commander,oathbreaker,paupercommander,duel,oldschool,premodern,predh
0,0000419b-0bba-4488-8f7a-6194544ce91e,129825.000,Forest,2024-08-02,normal,,0.000,Basic Land — Forest,[],[G],...,1,1,1,1,1,1,1,0,1,1
1,0000579f-7b35-4ed3-b44c-db2a538066fe,25527.000,Fury Sliver,2006-10-06,normal,{5}{R},6.000,Creature — Sliver,[R],[R],...,0,1,0,1,1,0,1,0,0,1
2,00006596-1166-4a79-8443-ca9f82e6db4e,34586.000,Kor Outfitter,2009-10-02,normal,{W}{W},2.000,Creature — Kor Soldier,[W],[W],...,1,1,1,1,1,1,1,0,0,1
4,0000cd57-91fe-411f-b798-646e965eec37,65170.000,Siren Lookout,2017-09-29,normal,{2}{U},3.000,Creature — Siren Pirate,[U],[U],...,1,1,1,1,1,1,1,0,0,0
7,0001e77a-7fff-49d2-a55c-42f6fdf6db08,116428.000,Obyra's Attendants // Desperate Parry,2023-09-08,adventure,{4}{U} // {1}{U},5.000,Creature — Faerie Wizard // Instant — Adventure,[U],[U],...,1,1,0,1,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97263,fffaa737-ce46-4f35-aa8c-bd9bb77ed9f6,57710.000,Angel's Tomb,2015-07-17,normal,{3},3.000,Artifact,[],[],...,0,1,1,1,1,0,1,0,0,0
97266,fffc85fb-1a40-4f83-a36e-cec0b7be658a,127441.000,Aggressive Biomancy,2024-06-14,normal,{X}{X}{G}{U},2.000,Sorcery,"[G, U]","[G, U]",...,0,1,0,1,1,0,1,0,0,0
97268,fffce2f7-b619-4483-a75e-916343194641,19591.000,Horned Troll,2003-07-28,normal,{2}{G},3.000,Creature — Troll,[G],[G],...,1,1,1,1,1,1,1,0,1,1
97269,fffdf7f3-a230-417a-883a-069aabcbcca7,117284.000,Faerie Bladecrafter,2023-09-08,normal,{2}{B},3.000,Creature — Faerie Rogue,[B],[B],...,0,1,0,1,1,0,1,0,0,0


In [22]:
from datetime import date
def card_age(data: pd.DataFrame) -> pd.DataFrame:
    # This is bugged

    engineered_data = data.copy()
    release_date_obj = engineered_data.released_at
    release_date = pd.to_datetime(release_date_obj)
    days_ago = (todays_date - release_date[:])
    int_days_ago = days_ago.map(datetime.datetime.date)
    #return engineered_data.drop(released_at).join(release_date)
    return int_days_ago

In [23]:
def abnormal_layout(data: pd.DataFrame) -> pd.DataFrame:
    # This is bugged

    engineered_data = data.copy()
    layout = engineered_data.layout
    is_abnormal = (layout[:]!="normal").astype("int")
    return engineered_data.drop(layout).join(is_abnormal)


In [24]:
def ordinate_rarities(data: pd.DataFrame) -> pd.DataFrame:
    ordinate_rarities_dict = {
        "common": 1, 
        "uncommon": 2,
        "rare": 3, 
        "mythic": 4, 
        "special": 5, 
        "bonus": 5, 
    }
    engineered_data = data.copy()
    ordinate_rarity = [ordinate_rarities_dict[x] for x in engineered_data["rarity"]]
    engineered_data.drop(columns="rarity").join(ordinate_rarity)
    return engineered_data
    #ordinate_rarity = ordinate_rarities_dict[tuple(engineered_data["rarity"][:30])[1]]
    #print(ordinate_rarity)

In [25]:
df6 = ordinate_rarities(df5)
#df6 = abnormal_layout(df5)
#df7 = card_age(df5)
#df7

AttributeError: 'int' object has no attribute 'index'

In [None]:
df5.head(1)

Unnamed: 0,id,mtgo_id,name,released_at,layout,mana_cost,cmc,type_line,colors,color_identity,...,pauper,vintage,penny,commander,oathbreaker,paupercommander,duel,oldschool,premodern,predh
0,0000419b-0bba-4488-8f7a-6194544ce91e,129825.0,Forest,2024-08-02,normal,,0.0,Basic Land — Forest,[],[G],...,1,1,1,1,1,1,1,0,1,1


In [None]:
df5.dtypes

id                   object
mtgo_id             float64
name                 object
released_at          object
layout               object
mana_cost            object
cmc                 float64
type_line            object
colors               object
color_identity       object
keywords             object
produced_mana        object
reserved               bool
foil                   bool
nonfoil                bool
finishes             object
oversized              bool
promo                  bool
reprint                bool
variation              bool
set_id               object
set                  object
set_type             object
collector_number     object
digital                bool
rarity               object
artist               object
border_color         object
frame                object
full_art               bool
textless               bool
booster                bool
story_spotlight        bool
power                object
toughness            object
edhrec_rank         

In [None]:
#df4[df4.vintage == "not_legal"]

In [None]:
#from sklearn.preprocessing import OneHotEncoder
def expand_typeline(data: pd.DataFrame) -> pd.DataFrame:
   engineered_data = data.copy()
   relevant_types = ["Legendary","Basic","Creature","Artifact","Enchantment","Instant","Sorcery","Land","Planeswalker"]
   type_lines = engineered_data.type_line
   no_subtypes = type_lines.str.split(" — ")[:][0]
   #[:].split("—")[0]
   print(no_subtypes)
   #type_lists = type_lines.str.split(" ").astype(set)
   #for type in relevant_types:
   #   is_this_type = 
      

   #print(type_lists)

   #print(engineered_data.info())
   pass

In [None]:
expand_typeline(df5)

['Basic Land', 'Forest']


In [None]:
#df5["type_line"].str.split(" ")

In [None]:
#df5=expand_typeline(df4)
#df5.info()

In [None]:
#set(list(df4.reserved))

In [None]:
df5[np.logical_not(df5.colors.notnull())]  # 304 of these
# df4[np.logical_or((df4.layout=="modal_dfc"),(df4.layout=="transform"))]   also 304 of these!

# They do still have tix values but don't have various datapoints eg. mana_cost, 

#Of these, first one is 139 - archangel avacyn / avacyn the purifier

Unnamed: 0,id,mtgo_id,name,released_at,layout,mana_cost,cmc,type_line,colors,color_identity,...,vintage,penny,commander,oathbreaker,paupercommander,duel,oldschool,premodern,predh,tix
70,00255899-aaaf-46c6-8037-bd0e3c06250c,110392.000,Invasion of Tolvada // The Broken Sky,2023-04-21,transform,,5.000,Battle — Siege // Enchantment,,"[B, W]",...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.02
102,003b8c93-54d2-4f23-961e-a52d63d0a54b,97897.000,The Restoration of Eiganjo // Architect of Res...,2022-03-15,transform,,3.000,Enchantment — Saga // Enchantment Creature — F...,,[W],...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.59
118,004524bf-b249-4dac-9c10-44d57143feb9,118200.000,"Growing Rites of Itlimoc // Itlimoc, Cradle of...",2023-11-17,transform,,3.000,Legendary Enchantment // Legendary Land,,[G],...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.02
145,00538777-8230-42bf-a6ea-adbe430d6b18,90228.000,"Shaile, Dean of Radiance // Embrose, Dean of S...",2021-05-01,modal_dfc,,2.000,Legendary Creature — Bird Cleric // Legendary ...,,"[B, W]",...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.03
524,014027c4-7f9d-4096-b308-ea4be574c0d4,83025.000,Skyclave Cleric // Skyclave Basilica,2020-09-25,modal_dfc,,2.000,Creature — Kor Cleric // Land,,[W],...,legal,not_legal,legal,legal,restricted,legal,not_legal,not_legal,not_legal,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96748,fe9f85e8-1e4c-4478-8344-71fae357b694,59780.000,Breakneck Rider // Neck Breaker,2016-04-08,transform,,3.000,Creature — Human Scout Werewolf // Creature — ...,,[R],...,legal,not_legal,legal,legal,restricted,legal,not_legal,not_legal,not_legal,0.03
96896,ff0063da-ab6b-499d-8e87-8b34d46f0372,58056.000,"Nissa, Vastwood Seer // Nissa, Sage Animist",2015-07-17,transform,,3.000,Legendary Creature — Elf Scout // Legendary Pl...,,[G],...,legal,not_legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.02
96993,ff47f351-8e1a-4b39-9598-fe4230b14c15,95582.000,Distracting Geist // Clever Distraction,2022-01-28,transform,,3.000,Creature — Spirit // Enchantment — Aura,,[W],...,legal,not_legal,legal,legal,restricted,legal,not_legal,not_legal,not_legal,0.04
97050,ff66ee02-6b68-405e-a5c0-f59bb8020865,94214.000,"Dorothea, Vengeful Victim // Dorothea's Retrib...",2021-11-19,transform,,2.000,Legendary Creature — Spirit // Enchantment — Aura,,"[U, W]",...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.02


In [None]:
set(list(df5.rarity))

{'bonus', 'common', 'mythic', 'rare', 'special', 'uncommon'}

In [None]:
df5[df5.rarity=="mythic"]

Unnamed: 0,id,mtgo_id,name,released_at,layout,mana_cost,cmc,type_line,colors,color_identity,...,vintage,penny,commander,oathbreaker,paupercommander,duel,oldschool,premodern,predh,tix
46,0013a9c4-77a1-418d-85c2-bd68b65cd3d4,90761.000,"Dakkon, Shadow Slayer",2021-06-18,normal,{W}{U}{B},3.000,Legendary Planeswalker — Dakkon,"[B, U, W]","[B, U, W]",...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.02
63,0020a124-ba76-4d40-84e9-9803268d9f16,59563.000,World Breaker,2016-01-22,normal,{6}{G},7.000,Creature — Eldrazi,[],[G],...,legal,not_legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,2.05
98,0038ea4d-d0a6-44a4-bee6-24c03313d2bc,63381.000,Sphinx's Revelation,2017-03-17,normal,{X}{W}{U}{U},3.000,Instant,"[U, W]","[U, W]",...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.09
130,004bbc84-b57b-4393-889e-dff95e8f334c,125841.000,Birthing Ritual,2024-06-14,normal,{1}{G},2.000,Enchantment,[G],[G],...,legal,not_legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,1.98
177,0063654b-6e4e-4958-a63c-24cf933e4a40,121396.000,"Delney, Streetwise Lookout",2024-02-09,normal,{2}{W},3.000,Legendary Creature — Human Scout,[W],[W],...,legal,not_legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97181,ffc0109c-f939-4424-820e-d6e60cacd794,126221.000,Necrodominance,2024-06-14,normal,{B}{B}{B},3.000,Legendary Enchantment,[B],[B],...,legal,not_legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,2.43
97189,ffc70b2d-5a3a-49ea-97db-175a62248302,129675.000,"Glarb, Calamity's Augur",2024-08-02,normal,{B}{G}{U},3.000,Legendary Creature — Frog Wizard Noble,"[B, G, U]","[B, G, U]",...,legal,not_legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,1.95
97212,ffd68fe1-5cfc-44cf-8dfe-3488278cdcef,120923.000,Master of Cruelties,2024-01-12,normal,{3}{B}{R},5.000,Creature — Demon,"[B, R]","[B, R]",...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.12
97220,ffde74fb-cc20-4d82-bf6a-b18081047b0a,113603.000,"Mikaeus, the Unhallowed",2023-08-04,normal,{3}{B}{B}{B},6.000,Legendary Creature — Zombie Cleric,[B],[B],...,legal,legal,legal,legal,not_legal,legal,not_legal,not_legal,not_legal,0.36


In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(categories='auto')
#train_data_cat = train_data[["zipcode_group"]]
#train_data_cat_1hot = cat_encoder.fit_transform(train_data_cat)
#train_data_cat_1hot

In [None]:
# Need to expand the dict features BEFORE using the transformer, because need to drop the "tix" column before transforming

In [None]:
# from sklearn.base import BaseEstimator, TransformerMixin

# class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):

#     def transform(self, data, labels=None) -> pd.DataFrame:
#         # let's make a copy of the original dataset
#         engineered_data = data.copy()

#         engineered_data = expand_dict_features(engineered_data) # type: ignore
#         engineered_data = drop_junk_features(engineered_data)
#         engineered_data = clean_dataset(engineered_data)
        


#         return engineered_data


In [None]:
#fe_trf = FeatureEngineeringTransformer()
#fe_trf.transform(df)

In [None]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46956 entries, 0 to 97270
Data columns (total 58 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                46956 non-null  object 
 1   mtgo_id           46956 non-null  float64
 2   name              46956 non-null  object 
 3   released_at       46956 non-null  object 
 4   layout            46956 non-null  object 
 5   mana_cost         46315 non-null  object 
 6   cmc               46950 non-null  float64
 7   type_line         46950 non-null  object 
 8   colors            46315 non-null  object 
 9   color_identity    46956 non-null  object 
 10  keywords          46956 non-null  object 
 11  produced_mana     6956 non-null   object 
 12  reserved          46956 non-null  bool   
 13  foil              46956 non-null  bool   
 14  nonfoil           46956 non-null  bool   
 15  finishes          46956 non-null  object 
 16  oversized         46956 non-null  bool   
 17

In [None]:
df["name"].iloc[0]

'Forest'

In [None]:
#df.penny_rank

In [None]:
# Data exploration

# Can't make StratifiedShuffleSplit work for now -> will just use df.sample

#df4.sample(3000, random_state=42)




# from sklearn.model_selection import StratifiedShuffleSplit
# splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in splitter.split(df4, df4.tix):
#     train_set = df4.loc[train_index]
#     test_set = df4.loc[test_index]

In [None]:
#df4.to_clipboard()

# Take a sample, then put it on clipboard