In [268]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [269]:
import cards_scraping
import data_cleaning

In [270]:
# Build the raw card table via the project's scraping helpers, then run the
# project's cleaning step, which yields the card frame and the image links.
data_raw = cards_scraping.get_staples(cards_scraping.make_it_dataframe())
data, image_links = data_cleaning.wholesome_cleaning(data_raw)

# Turn the literal string 'null' into a real missing value. Reassigning
# (instead of inplace=True) keeps the cell idempotent on re-run.
data = data.replace('null', np.nan)


In [271]:
def adapt_dtypes(df):
    '''
    Return a copy of ``df`` with the card columns cast to their working dtypes.

    - 'attribute', 'frameType', 'type' -> category
    - 'staple', 'is_pendulum'          -> bool
    - 'tcg_date'                       -> datetime (unparseable values become NaT)

    The input frame is left untouched, so calling this function repeatedly
    (or from inside other helpers) never changes the caller's data.

    Raises KeyError if one of the columns listed above is missing.
    '''
    # DataFrame.astype returns a new frame, so ``df`` itself is not modified.
    typed = df.astype({
        'attribute': 'category',
        'frameType': 'category',
        'type': 'category',
        'staple': bool,
        'is_pendulum': bool,
    })
    typed['tcg_date'] = pd.to_datetime(typed['tcg_date'], errors='coerce')

    return typed

# Cast the cleaned card table to its working dtypes for the cells below.
data_new = adapt_dtypes(data)

In [302]:
def monster_data(df):
    '''
    Return the rows of ``df`` whose 'frameType' is neither 'spell' nor 'trap',
    with the index renumbered from 0.
    '''
    is_spell_or_trap = df['frameType'].isin(['spell', 'trap'])
    monsters = df.loc[~is_spell_or_trap]

    return monsters.reset_index(drop=True)

In [303]:
def spell_trap_data(df):
    '''
    Return the rows of ``df`` whose 'frameType' is 'spell' or 'trap',
    with the index renumbered from 0.
    '''
    is_spell_or_trap = df['frameType'].isin(['spell', 'trap'])
    spells_and_traps = df.loc[is_spell_or_trap]

    return spells_and_traps.reset_index(drop=True)

In [304]:
# Split the typed card table into monsters and spells/traps, then preview
# the first three spell/trap rows.
data_monster = monster_data(data_new)
data_spell_trap = spell_trap_data(data_new)
data_spell_trap.head(3)

Unnamed: 0,id,name,type,desc,race,card_price,up_votes,down_votes,views,atk,def,level,attribute,archetype,linkval,scale,tcg_date,frameType,staple,is_pendulum
0,34541863,"""A"" Cell Breeding Device",Spell Card,"During each of your Standby Phases, put 1 A-Co...",Continuous,0.1,118,110,312103,-1.0,-1.0,-1.0,,Alien,0.0,-1.0,2007-05-16,spell,False,False
1,64163367,"""A"" Cell Incubator",Spell Card,Each time an A-Counter(s) is removed from play...,Continuous,0.13,25,20,219767,-1.0,-1.0,-1.0,,Alien,0.0,-1.0,2007-11-14,spell,False,False
2,91231901,"""A"" Cell Recombination Device",Spell Card,Target 1 face-up monster on the field; send 1 ...,Quick-Play,0.16,19,15,228104,-1.0,-1.0,-1.0,,Alien,0.0,-1.0,2016-11-03,spell,False,False


In [305]:
# Preview the first three monster rows.
data_monster.head(3)

Unnamed: 0,id,name,type,desc,race,card_price,up_votes,down_votes,views,atk,def,level,attribute,archetype,linkval,scale,tcg_date,frameType,staple,is_pendulum
0,86988864,3-Hump Lacooda,Effect Monster,"If there are 3 face-up ""3-Hump Lacooda"" cards ...",Beast,0.11,23,10,194656,500.0,1500.0,3.0,EARTH,,0.0,-1.0,2004-06-01,effect,False,False
1,83994646,4-Starred Ladybug of Doom,Flip Effect Monster,FLIP: Destroy all Level 4 monsters your oppone...,Insect,0.07,19,9,427432,800.0,1200.0,3.0,WIND,,0.0,-1.0,2002-10-20,effect,False,False
2,23771716,7 Colored Fish,Normal Monster,A rare rainbow fish that has never been caught...,Fish,0.08,26,6,408513,1800.0,800.0,4.0,WATER,,0.0,-1.0,2002-06-26,normal,False,False


In [282]:
def knn_model_monster(df):
    '''
    Fit a cosine-distance nearest-neighbour model over monster cards.

    The feature matrix consists of the standard-scaled stats ('atk', 'def',
    'level', 'linkval', 'scale'; missing values filled with 0) followed by
    the one-hot encoded 'race', 'attribute' and 'frameType' columns.

    Returns a tuple ``(knn, features)``: the fitted NearestNeighbors model,
    configured to return as many neighbours as ``df`` has rows, and the
    feature matrix it was fitted on.
    '''
    stat_columns = ['atk', 'def', 'level', 'linkval', 'scale']
    label_columns = ['race', 'attribute', 'frameType']

    # One-hot encode the labels; fill missing stats with 0 before scaling.
    one_hot = pd.get_dummies(df[label_columns])
    stats_scaled = StandardScaler().fit_transform(df[stat_columns].fillna(0))

    # Column layout: scaled stats first, one-hot columns after.
    feature_matrix = np.concatenate((stats_scaled, one_hot.values), axis=1)

    # n_neighbors=len(df) makes a query return every card, ordered by distance.
    model = NearestNeighbors(n_neighbors=len(df), metric="cosine")
    model.fit(feature_matrix)

    return model, feature_matrix


def knn_model_spell_trap(df):
    '''
    Fit a cosine-distance nearest-neighbour model over spell/trap cards.

    Only the one-hot encoded 'race' and 'frameType' columns are used; no
    numeric stats enter the model. The feature matrix holds a standard-scaled
    copy of the one-hot columns followed by the unscaled one-hot columns.

    Returns a tuple ``(knn, features)``: the fitted NearestNeighbors model,
    configured to return as many neighbours as ``df`` has rows, and the
    feature matrix it was fitted on.
    '''
    one_hot = pd.get_dummies(df[['race', 'frameType']])

    # NOTE(review): each one-hot column ends up in the matrix twice (scaled
    # and raw). This looks like a leftover from the monster variant -- confirm
    # the resulting weighting is intended before relying on the distances.
    one_hot_scaled = StandardScaler().fit_transform(one_hot)
    feature_matrix = np.concatenate((one_hot_scaled, one_hot.values), axis=1)

    # n_neighbors=len(df) makes a query return every card, ordered by distance.
    model = NearestNeighbors(n_neighbors=len(df), metric="cosine")
    model.fit(feature_matrix)

    return model, feature_matrix


In [306]:
# Fit the spell/trap neighbour model on the spell/trap subset and display
# the resulting feature matrix.
knn, df_features = knn_model_spell_trap(spell_trap_data(data_new))
df_features

array([[-0.01487081,  1.89276873, -0.19273672, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01487081,  1.89276873, -0.19273672, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01487081, -0.52832656, -0.19273672, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.01487081,  1.89276873, -0.19273672, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01487081, -0.52832656, -0.19273672, ...,  0.        ,
         1.        ,  0.        ],
       [-0.01487081,  1.89276873, -0.19273672, ...,  0.        ,
         1.        ,  0.        ]])

In [307]:
# Look up the row label of 'Monster Reborn' in the spell/trap subset.
# .index[0] takes the first match and raises IndexError if there is none.
xy_data = spell_trap_data(data_new)
card_index = xy_data[xy_data["name"] == 'Monster Reborn'].index[0]
card_index

2469

In [308]:
# Sanity check: number of rows (cards) in the feature matrix.
print(f"Anzahl der Karten in df_features: {df_features.shape[0]}")


Anzahl der Karten in df_features: 4523


In [309]:
# Query the model with the selected card's feature row; the slice keeps the
# row two-dimensional, as kneighbors expects.
distances, indices = knn.kneighbors(df_features[card_index:card_index + 1])

In [310]:
def effect_similarity(df):
    '''
    Compute the pairwise text similarity of the cards' effect descriptions.

    The 'desc' column (missing values treated as empty strings) is vectorised
    with TF-IDF using English stop words, and the cosine similarity between
    all pairs of rows is returned.
    '''
    descriptions = df['desc'].fillna("")
    vectorizer = TfidfVectorizer(stop_words="english")

    # Cosine similarity of the TF-IDF matrix with itself: one row per card.
    return cosine_similarity(vectorizer.fit_transform(descriptions))

In [323]:
def find_similar_cards(df, card_name, alpha=0.5, top_n=5):
    '''
    Rank the cards most similar to ``card_name``.

    The card is compared only against cards of its own group (spells/traps
    or monsters). The score is a weighted mix of feature similarity (from the
    group's nearest-neighbour model) and effect-text similarity (TF-IDF
    cosine similarity of 'desc').

    Parameters
    ----------
    df : DataFrame
        Card table; it is not modified.
    card_name : str
        Exact value of the 'name' column of the card to look up. If several
        rows share the name, the first one is used.
    alpha : float, default 0.5
        Weight of the feature similarity; the text similarity gets
        ``1 - alpha``.
    top_n : int, default 5
        Maximum number of similar cards to return.

    Returns
    -------
    DataFrame
        Columns 'name' and 'frameType' of the most similar cards, best match
        first, excluding the queried card itself. The index holds the row
        positions inside the group subset.

    Raises
    ------
    ValueError
        If ``card_name`` does not occur in ``df['name']``.
    '''
    # Validate first: otherwise the frameType lookup below fails with an
    # IndexError on .iloc[0] before this check can ever be reached.
    if card_name not in df["name"].values:
        raise ValueError(f"Karte '{card_name}' nicht gefunden!")

    # Cast dtypes on a copy so the caller's frame is never touched.
    data = adapt_dtypes(df.copy())

    # Restrict to the card's own group and fit the matching feature model.
    frame_type = data.loc[data['name'] == card_name, 'frameType'].iloc[0]
    if frame_type in ['spell', 'trap']:
        data = spell_trap_data(data)
        knn, df_features = knn_model_spell_trap(data)
    else:
        data = monster_data(data)
        knn, df_features = knn_model_monster(data)

    # The subset was re-indexed from 0, so the label doubles as row position.
    card_index = data[data["name"] == card_name].index[0]

    # Feature similarity: the query returns all cards ordered by distance;
    # invert the distance so that larger means more similar.
    distances, indices = knn.kneighbors(df_features[card_index:card_index + 1])
    numerical_similarity = 1 / (1 + distances[0])

    # Text similarity of the queried card to every card in the subset.
    # NOTE(review): effect_similarity builds the full pairwise matrix although
    # only one row is used here; worth revisiting if memory becomes an issue.
    text_sim_scores = effect_similarity(data)[card_index]

    # indices[0] maps neighbour order back to row positions, so the text
    # scores are reordered to line up with numerical_similarity.
    combined_similarity = (
        alpha * numerical_similarity
        + (1 - alpha) * text_sim_scores[indices[0]]
    )

    # Row positions sorted by combined score, best first.
    sorted_indices = indices[0][np.argsort(combined_similarity)[::-1]]

    similar_cards = data.iloc[sorted_indices][['name', 'frameType']]
    similar_cards = similar_cards[similar_cards['name'] != card_name]

    return similar_cards.head(top_n)

In [324]:
# Example: cards most similar to "Archfiend Eccentrick", weighting feature
# similarity and text similarity equally (alpha=0.5).
find_similar_cards(data, "Archfiend Eccentrick", alpha=0.5)

Unnamed: 0,name,frameType
5526,Pendulum Witch,effect
1515,D/D Evil,effect
5599,Performapal Momoncarpet,effect
1522,D/D Orthros,effect
4900,"Moissa Knight, the Comet General",effect
