In [6]:
# Checking import of preprocessed data

import pandas as pd
from pymongo import MongoClient
import tiktoken

# Sanity check of the preprocessed transcript fields: for every document,
# record whether each field is present (non-empty) and how many tiktoken
# tokens it contains, then print missing counts and summary statistics.

# 1) Connect to MongoDB and fetch all documents.
#    The client is closed as soon as the documents are in memory (the same
#    pattern the later cells use), so no connection is left open if one of
#    the following steps fails.
client = MongoClient("mongodb://localhost:27018/")
try:
    db   = client["transcriptions"]
    coll = db["transcripts_denis"]
    docs = list(coll.find({}))
finally:
    client.close()

# 2) Define fields to check
fields = [
    "text_wer_denis", "src_wer_denis",
    "text_lex_denis", "src_lex_denis",
    "text_sem_denis", "src_sem_denis",
    "text_meer_denis", "src_meer_denis",
    "text_bleu_denis", "src_bleu_denis",
]

# 3) Initialize tiktoken encoder (e.g. cl100k_base)
enc = tiktoken.get_encoding("cl100k_base")

# 4) Build DataFrame with presence and token counts
rows = []
for d in docs:
    row = {"_id": str(d["_id"])}
    for f in fields:
        # A field stored as null comes back as None (the .get default only
        # covers absent keys); treat it like a missing field instead of
        # letting enc.encode() fail on a non-string.
        text = d.get(f) or ""
        # disallowed_special=() makes the encoder treat text that happens to
        # look like a special token (e.g. "<|endoftext|>") as ordinary text
        # rather than raising ValueError.
        tokens = enc.encode(text, disallowed_special=())
        row[f"{f}_present"] = bool(text)
        row[f"{f}_tokens"]  = len(tokens)
    rows.append(row)

df = pd.DataFrame(rows)

# 5) Check missing/empty fields
print("Missing or empty fields per column:")
for f in fields:
    # Count the rows whose presence flag is False.
    miss = int((~df[f"{f}_present"]).sum())
    print(f"  {f}: {miss} of {len(df)}")

# 6) Compute token statistics (one row per field)
stats = {}
for f in fields:
    col = df[f"{f}_tokens"]
    stats[f] = {
        "min":    int(col.min()),
        "median": float(col.median()),
        "mean":   float(col.mean()),
        "max":    int(col.max()),
        "std":    float(col.std()),
    }

# Transpose so that fields are rows and the statistics are columns.
stats_df = pd.DataFrame(stats).T[["min", "median", "mean", "max", "std"]]
print("\nToken count statistics (via tiktoken):")
print(stats_df)

Missing or empty fields per column:
  text_wer_denis: 0 of 12000
  src_wer_denis: 0 of 12000
  text_lex_denis: 0 of 12000
  src_lex_denis: 0 of 12000
  text_sem_denis: 0 of 12000
  src_sem_denis: 0 of 12000
  text_meer_denis: 0 of 12000
  src_meer_denis: 0 of 12000
  text_bleu_denis: 120 of 12000
  src_bleu_denis: 120 of 12000

Token count statistics (via tiktoken):
                    min  median         mean     max         std
text_wer_denis    571.0  2701.0  2628.619333  6818.0  687.473138
src_wer_denis    1403.0  2736.5  2826.160000  5712.0  556.501574
text_lex_denis    372.0  1789.0  1745.736250  4637.0  450.458573
src_lex_denis     879.0  1836.5  1888.090000  3746.0  359.277968
text_sem_denis    558.0  2885.0  2778.028250  7617.0  791.064010
src_sem_denis    1531.0  2983.0  3070.540000  6308.0  621.029104
text_meer_denis   555.0  2844.0  2739.822250  7590.0  774.153313
src_meer_denis   1537.0  2951.5  3024.750000  6227.0  601.960733
text_bleu_denis     0.0  2885.0  2761.563333  

In [None]:
# 11) BLEU-Score Berechnung mit sacrebleu und Speichern
import pandas as pd
from pymongo import MongoClient
from sacrebleu import corpus_bleu
from tqdm import tqdm

def calculate_bleu_scores(lowercase: bool = True, tokenize: str = "13a"):
    """
    Compute one corpus-level BLEU score over the preprocessed transcripts.

    Reads ``text_bleu_denis`` (hypothesis) and ``src_bleu_denis`` (reference)
    from the ``transcriptions.transcripts_denis`` collection, restricted to
    documents with ``excludeGeneral == 0``. Documents where either field is
    missing or empty are skipped.

    Args:
        lowercase: Forwarded to ``sacrebleu.corpus_bleu``.
        tokenize:  Forwarded to ``sacrebleu.corpus_bleu``.

    Returns:
        A DataFrame with one row per scored document (columns ``convoID``,
        ``ambientVariant``, ``processedVolume``, ``technology``, ``model``)
        plus a ``bleu_score`` column that holds the same corpus-level score
        on every row. If no document qualifies, an empty DataFrame with
        these columns is returned.
    """
    meta_keys = (
        "convoID",
        "ambientVariant",
        "processedVolume",
        "technology",
        "model",
    )
    # Only transfer the fields that are actually read below.
    projection = {
        key: 1 for key in (*meta_keys, "text_bleu_denis", "src_bleu_denis")
    }

    hyps = []
    refs = []
    meta = []

    # The context manager closes the client even if the query raises.
    with MongoClient("mongodb://localhost:27018/") as client:
        coll = client["transcriptions"]["transcripts_denis"]

        # Only excludeGeneral=0
        cursor = coll.find({"excludeGeneral": 0}, projection)
        for doc in tqdm(cursor, desc="BLEU Scoring"):
            hyp = doc.get("text_bleu_denis", "")
            ref = doc.get("src_bleu_denis", "")

            # Only score documents where both fields are non-empty
            if not hyp or not ref:
                continue

            hyps.append(hyp)
            refs.append(ref)
            meta.append({key: doc.get(key) for key in meta_keys})

    df = pd.DataFrame(meta, columns=list(meta_keys))

    # Nothing to score: return an empty frame with the expected columns.
    if not hyps:
        df["bleu_score"] = pd.Series(dtype="float64")
        return df

    # Corpus-level BLEU. sacrebleu expects a list of reference *streams*,
    # each aligned one-to-one with the hypotheses, so the single reference
    # per document is passed as one stream: [refs]. Passing [[r1], [r2], ...]
    # would instead be read as many one-sentence streams.
    bleu = corpus_bleu(hyps, [refs], lowercase=lowercase, tokenize=tokenize)
    df["bleu_score"] = bleu.score

    return df

if __name__ == "__main__":
    # Score the corpus and persist the per-document table as CSV.
    # utf-8-sig writes a BOM at the start of the file.
    output_path = "bleu_scores_full.csv"
    df_bleu = calculate_bleu_scores()
    df_bleu.to_csv(
        output_path,
        encoding="utf-8-sig",
        index=False,
    )
    print(f"BLEU scores saved to {output_path}")


In [8]:
from pymongo import MongoClient

# List the _id of every document whose BLEU input fields are unusable.

# Query: BLEU fields that are missing, null or empty.
# In MongoDB, matching a field against null also matches documents in which
# the field does not exist, so {"$in": [None, ""]} covers all three cases.
bleu_fields = ["src_bleu_denis", "text_bleu_denis"]
query = {
    "$or": [{field: {"$in": [None, ""]}} for field in bleu_fields]
}

# Connect to the local MongoDB; the context manager closes the client even
# if the query raises.
with MongoClient("mongodb://localhost:27018/") as client:
    db   = client["transcriptions"]
    coll = db["transcripts_denis"]

    # Collect all affected _id values (projection: _id only)
    missing_ids = [doc["_id"] for doc in coll.find(query, {"_id": 1})]

# Output
print(f"Anzahl fehlender BLEU-Felder: {len(missing_ids)}")
for _id in missing_ids:
    print(_id)


Anzahl fehlender BLEU-Felder: 120
6765ebada84bdac1bcd5aefa
6765ebada84bdac1bcd5aefb
6765ebada84bdac1bcd5aefc
6765ebada84bdac1bcd5aefd
6765ebada84bdac1bcd5aefe
6765ebada84bdac1bcd5aeff
6765ebada84bdac1bcd5af00
6765ebada84bdac1bcd5af01
6765ebada84bdac1bcd5af02
6765ebada84bdac1bcd5af03
6765ebada84bdac1bcd5af04
6765ebada84bdac1bcd5af05
6765ebada84bdac1bcd5af06
6765ebada84bdac1bcd5af07
6765ebada84bdac1bcd5af08
6765ebada84bdac1bcd5af09
6765ebada84bdac1bcd5af0a
6765ebada84bdac1bcd5af0b
6765ebada84bdac1bcd5af0c
6765ebada84bdac1bcd5af0d
6765ec9ba84bdac1bcd5b6ca
6765ec9ca84bdac1bcd5b6cb
6765ec9ca84bdac1bcd5b6cc
6765ec9ca84bdac1bcd5b6cd
6765ec9ca84bdac1bcd5b6ce
6765ec9ca84bdac1bcd5b6cf
6765ec9ca84bdac1bcd5b6d0
6765ec9ca84bdac1bcd5b6d1
6765ec9ca84bdac1bcd5b6d2
6765ec9ca84bdac1bcd5b6d3
6765ec9ca84bdac1bcd5b6d4
6765ec9ca84bdac1bcd5b6d5
6765ec9ca84bdac1bcd5b6d6
6765ec9ca84bdac1bcd5b6d7
6765ec9ca84bdac1bcd5b6d8
6765ec9ca84bdac1bcd5b6d9
6765ec9ca84bdac1bcd5b6da
6765ec9ca84bdac1bcd5b6db
6765ec9ca84bdac1