In [None]:
#Schritt 1: DF für F 1.1 - done
#Schritt 2: Identify y und y rows - done
#Schritt 3: Reference Template DF

In [9]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from sqlalchemy.orm import sessionmaker
from database.models import * 
import numpy as np
from rapidfuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Step 1: build the DataFrame for template F 1.1.
# Set up the database connection.
DATABASE_URL = "sqlite:///database.db"  # path to the SQLite database file
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine; calling SessionLocal() opens a new session.
SessionLocal = sessionmaker(bind=engine)
def get_entries_by_code(code_value="F 01.01"):
    """Load all ``MergedData`` rows of one template into a DataFrame.

    Parameters
    ----------
    code_value : str, default "F 01.01"
        Value compared against ``MergedData.template_id``.

    Returns
    -------
    pandas.DataFrame
        One row per matching entry, ordered by ``MergedData.id``. The columns
        are the attributes loaded on each ORM instance. When nothing matches,
        the frame is empty and has no columns.
    """
    session = SessionLocal()
    try:
        # Select every MergedData row whose template_id equals code_value,
        # ordered by id.
        entries = (
            session.query(MergedData)
            .filter(MergedData.template_id == code_value)
            .order_by(MergedData.id)
            .all()
        )
        # Build a filtered copy of each attribute dict. ``entry.__dict__`` is
        # the live state of the mapped instance, so popping
        # ``_sa_instance_state`` from it would damage the ORM object.
        data = [
            {key: value for key, value in vars(entry).items() if key != "_sa_instance_state"}
            for entry in entries
        ]
        return pd.DataFrame(data)
    finally:
        # Always release the session, even if the query fails.
        session.close()

# Do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Called without arguments, so the function's default filter value is used.
df_merge_f1_1 = get_entries_by_code()
#df_merge_f1_1

In [16]:
# Row-wise rapidfuzz scores between member_name and y_axis_name.
# One result column is written per scorer, in the order listed.
for score_column, scorer in (
    ('fuzzy_similarity', fuzz.ratio),             # 1. character-based comparison
    ('token_similarity', fuzz.token_sort_ratio),  # 2. token-based comparison
):
    df_merge_f1_1[score_column] = df_merge_f1_1.apply(
        # Bind the scorer as a default argument so each lambda keeps its own.
        lambda row, scorer=scorer: scorer(row['member_name'], row['y_axis_name']),
        axis=1,
    )

# 3. TF-IDF based cosine similarity
def tfidf_cosine_similarity(s1, s2):
    """Return the TF-IDF cosine similarity of two strings, scaled to 0-100.

    A fresh ``TfidfVectorizer`` is fitted on just the two inputs, and the
    cosine similarity between the two resulting vectors is multiplied by 100.

    Parameters
    ----------
    s1, s2 : str
        Texts to compare.

    Returns
    -------
    float
        Similarity score, or ``np.nan`` when either input is not a string
        (e.g. ``None`` or NaN from a missing value) or when the vectorizer
        rejects the inputs with a ``ValueError``.
    """
    # Missing values are handled explicitly instead of relying on whatever
    # exception the vectorizer happens to raise for them.
    if not isinstance(s1, str) or not isinstance(s2, str):
        return np.nan
    try:
        # Vectorise both strings together so they share one vocabulary.
        tfidf = TfidfVectorizer().fit_transform([s1, s2])
    except ValueError:
        # Raised by scikit-learn e.g. when no usable token is found in
        # either string (empty vocabulary). Other errors now propagate.
        return np.nan
    # Cosine similarity between the two TF-IDF row vectors.
    cos_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])
    return cos_sim[0][0] * 100

# Score every (member_name, y_axis_name) pair with the TF-IDF helper.
name_pairs = df_merge_f1_1[['member_name', 'y_axis_name']]
df_merge_f1_1['tfidf_similarity'] = name_pairs.apply(
    lambda pair: tfidf_cosine_similarity(*pair), axis=1
)

# 4. Semantic similarity with Sentence Transformers
# Load a pre-trained model (here: all-MiniLM-L6-v2).
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_similarity(s1, s2):
    """Return the embedding cosine similarity of two strings, scaled by 100.

    Both texts are encoded with the module-level ``model`` and the cosine
    similarity of the two embeddings is multiplied by 100.

    Parameters
    ----------
    s1, s2 : str
        Texts to compare.

    Returns
    -------
    float
        Similarity score, or ``np.nan`` when either input is not a string
        (e.g. ``None`` or NaN) or when encoding/comparison fails. A failure
        emits a ``RuntimeWarning`` so it does not go unnoticed.
    """
    import warnings

    # Missing values are handled explicitly rather than via the except below.
    if not isinstance(s1, str) or not isinstance(s2, str):
        return np.nan
    try:
        # Compute the embeddings of both texts.
        emb1 = model.encode(s1)
        emb2 = model.encode(s2)
        # Cosine similarity between the two embeddings.
        cos_sim = cosine_similarity([emb1], [emb2])
        return cos_sim[0][0] * 100
    except Exception as exc:
        # Stay best-effort (NaN for this pair), but make the failure visible.
        warnings.warn(
            f"semantic_similarity failed for {s1!r} / {s2!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.nan

# Score every (member_name, y_axis_name) pair with the embedding helper.
semantic_pairs = df_merge_f1_1[['member_name', 'y_axis_name']]
df_merge_f1_1['semantic_similarity'] = semantic_pairs.apply(
    lambda pair: semantic_similarity(*pair), axis=1
)

# Ausgabe des DataFrames mit den neuen Similarity-Spalten
#print(df_merge_f1_1[['member_name', 'y_axis_name', 'fuzzy_similarity', 'token_similarity', 'tfidf_similarity', 'semantic_similarity']])

In [18]:
# Preview the similarity columns; max_rows=1 keeps the rendered table short.
pd.set_option('display.max_rows', 1)
df_merge_f1_1[['konzept_code', 'member_name', 'y_axis_name', 'fuzzy_similarity', 'token_similarity', 'tfidf_similarity', 'semantic_similarity']]

Unnamed: 0,konzept_code,member_name,y_axis_name,fuzzy_similarity,token_similarity,tfidf_similarity,semantic_similarity
0,ISFIN1023000,Financial assets held for trading,Debt securities,29.166667,25.0,0.0,58.681846


In [19]:
# 1. Average the four similarity scores per row (mean skips NaN scores).
similarity_cols = ["fuzzy_similarity", "token_similarity", "tfidf_similarity", "semantic_similarity"]
df_merge_f1_1["avg_similarity"] = df_merge_f1_1[similarity_cols].mean(axis=1)

# 2. For every konzept_code, locate the row with the highest avg_similarity.
#    Rows without any score are dropped first, so a group whose averages are
#    all NaN is simply skipped instead of making idxmax fail.
scored_rows = df_merge_f1_1.dropna(subset=["avg_similarity"])
winner_idx = scored_rows.groupby("konzept_code")["avg_similarity"].idxmax()

# 3. Y_Winner carries the average on the winning row of each group and NaN
#    on every other row.
is_winner = df_merge_f1_1.index.isin(winner_idx)
df_merge_f1_1["Y_Winner"] = df_merge_f1_1["avg_similarity"].where(is_winner)

In [23]:
# Preview the scores together with Y_Winner; the rendered table is limited to four rows.
pd.set_option('display.max_rows', 4)
df_merge_f1_1[['konzept_code', 'member_name', 'y_axis_name', 'fuzzy_similarity', 'token_similarity', 'tfidf_similarity', 'semantic_similarity', 'Y_Winner']]

Unnamed: 0,konzept_code,member_name,y_axis_name,fuzzy_similarity,token_similarity,tfidf_similarity,semantic_similarity,Y_Winner
0,ISFIN1023000,Financial assets held for trading,Debt securities,29.166667,25.000000,0.0,58.681846,
1,ISFIN1023000,Assets,Debt securities,38.095238,38.095238,0.0,49.171042,
...,...,...,...,...,...,...,...,...
217,ISFIN0004387,Loans and advances,Loans and advances,100.000000,100.000000,100.0,100.000000,100.0
218,ISFIN0004387,Carrying amount [mi],Loans and advances,31.578947,31.578947,0.0,8.560728,


In [43]:
# Number of distinct konzept_code values
unique_konzept_codes = df_merge_f1_1['konzept_code'].nunique()

# Winner rows whose Y_Winner score is above 90
y_winner_over_90 = df_merge_f1_1[df_merge_f1_1['Y_Winner'] > 90]

print("Anzahl unique konzept_code:", unique_konzept_codes)
print("Anzahl der Y_Winner über 90:", y_winner_over_90.shape[0])

# All remaining winners. `<=` makes this the exact complement of the frame
# above; with `<`, a winner scoring exactly 90 ended up in neither frame.
# Non-winner rows have NaN in Y_Winner, fail both comparisons and stay out.
y_winner_under_90 = df_merge_f1_1[df_merge_f1_1['Y_Winner'] <= 90]
pd.set_option('display.max_rows', 1)
y_winner_under_90[['konzept_code', 'y_axis_rc_code', 'member_name', 'y_axis_name', 'Y_Winner']]

Anzahl unique konzept_code: 53
Anzahl der Y_Winner über 90: 46


Unnamed: 0,konzept_code,y_axis_rc_code,member_name,y_axis_name,Y_Winner
16,ISFIN1070000,240,Hedge accounting,Derivatives – Hedge accounting,72.336097


In [46]:
# Step 3: load the reference template rows.
# Set up the database connection.
# NOTE(review): same URL, engine and session factory as created in the Step 1 cell.
DATABASE_URL = "sqlite:///database.db"  # path to the SQLite database file
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine)

def get_entries_by_code(*, axis='y', table='1.1 Assets'):
    """Load ``Template_Finrep`` rows for one axis of one table into a DataFrame.

    NOTE(review): this reuses the name of the loader defined in the Step 1
    cell and replaces it in the kernel once this cell has run; a distinct
    name would avoid the collision.

    Parameters
    ----------
    axis : str, keyword-only, default 'y'
        Value compared against ``Template_Finrep.axis``.
    table : str, keyword-only, default '1.1 Assets'
        Value compared against ``Template_Finrep.table``.

    Returns
    -------
    pandas.DataFrame
        One row per matching entry. The columns are the attributes loaded on
        each ORM instance. When nothing matches, the frame is empty and has
        no columns.
    """
    session = SessionLocal()
    try:
        entries = (
            session.query(Template_Finrep)
            .filter(
                Template_Finrep.axis == axis,
                Template_Finrep.table == table,
            )
            .all()
        )
        # Build a filtered copy of each attribute dict. ``entry.__dict__`` is
        # the live state of the mapped instance, so popping
        # ``_sa_instance_state`` from it would damage the ORM object.
        data = [
            {key: value for key, value in vars(entry).items() if key != "_sa_instance_state"}
            for entry in entries
        ]
        return pd.DataFrame(data)
    finally:
        # Always release the session, even if the query fails.
        session.close()

# Do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
df_temp_ref = get_entries_by_code()
# NOTE(review): not the last expression of the cell, so this preview is never rendered.
df_temp_ref.head(1)
# Last expression of the cell: the output is the (rows, columns) tuple.
df_temp_ref.shape

(37, 8)

In [49]:

# Columns written to the CSV export, in output order.
export_columns = [
    'konzept_code', 'y_axis_rc_code', 'member_name', 'y_axis_name',
    'Y_Winner', 'coord', 'reference', 'text',
]

# Inner join: keep only winners whose y_axis_rc_code has a matching template coord.
merged_df = y_winner_over_90.merge(
    df_temp_ref,
    how='inner',
    left_on='y_axis_rc_code',
    right_on='coord',
)
pd.set_option('display.max_rows', None)
merged_df[export_columns].to_csv('output.csv', index=False)
