In [3]:
import pandas as pd

# Load the raw occurrences export (adjust the path if the file lives elsewhere).
df = pd.read_csv('occurrences.csv', sep=';', low_memory=False)

# Lookup from the source severity labels to the Low/Medium/High/Critical scale.
# Labels that are not listed here come out of .map() as NaN.
# NOTE(review): 'Marine Incident' is mapped to the top level ('Critical') while
# 'Very Serious' is only 'High' -- confirm this ordering is intended.
severity_map = dict([
    ('Less Serious', 'Low'),
    ('Serious', 'Medium'),
    ('Very Serious', 'High'),
    ('Marine Incident', 'Critical'),
])

# Rename the description column to 'report', then derive the two label columns:
# 'category' is a copy of Main_Event_L1, 'severity' is the mapped Occurrence_Severity.
df = df.rename(columns={'Description': 'report'}).assign(
    category=lambda frame: frame['Main_Event_L1'],
    severity=lambda frame: frame['Occurrence_Severity'].map(severity_map),
)

# Write only the three modelling columns to a new CSV.
df.loc[:, ['report', 'category', 'severity']].to_csv(
    'incidents_categorized.csv', index=False
)

print("Prepared 3‑column CSV: incidents_categorized.csv")


Prepared 3‑column CSV: incidents_categorized.csv


In [4]:
# Print the number of rows per problem category, then per severity level.
for column in ("category", "severity"):
    print(df[column].value_counts())


category
Accident to person(s)         1339
Damage / Loss Of Equipment    1095
Loss Of Control                557
Grounding / Stranding          377
Contact                        299
Collision                      254
Fire / Explosion               234
Flooding / Foundering          120
Capsizing / Listing             73
Non-accidental Event             3
Hull Failure                     2
Name: count, dtype: int64
severity
Critical    2480
Low         1491
Medium       209
High         174
Name: count, dtype: int64


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the categorized incidents file and discard rows that have no category.
df = pd.read_csv('incidents_categorized.csv').dropna(subset=['category'])

# Sanity check: how many rows are left
print(f"Remaining rows: {len(df)}")


Remaining rows: 4353


In [6]:
# Report how many NaN values remain in each label column.
for column in ('category', 'severity'):
    print(f"Missing in {column}:", df[column].isna().sum())


Missing in category: 0
Missing in severity: 0


In [7]:
import pandas as pd

# Fold two categories into existing ones with a single lookup:
#   Hull Failure         -> Damage / Loss Of Equipment
#   Non-accidental Event -> Accident to person(s)
merged_categories = {
    'Hull Failure': 'Damage / Loss Of Equipment',
    'Non-accidental Event': 'Accident to person(s)',
}
df['category'] = df['category'].replace(merged_categories)

# Check the resulting class distribution
print(df['category'].value_counts())


category
Accident to person(s)         1342
Damage / Loss Of Equipment    1097
Loss Of Control                557
Grounding / Stranding          377
Contact                        299
Collision                      254
Fire / Explosion               234
Flooding / Foundering          120
Capsizing / Listing             73
Name: count, dtype: int64


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Remove rows whose report carries no usable text.
#    Besides the MAIB placeholder sentence this also drops missing (NaN) and
#    blank reports: comparing NaN with `!=` yields True, so a plain inequality
#    test would keep those rows and pass non-string values to the tokenizers
#    used later for augmentation.
PLACEHOLDER_REPORT = "See MAIB investigation report, when available"
report_text = df['report'].fillna('').astype(str).str.strip()
mask = (report_text != '') & (report_text != PLACEHOLDER_REPORT)
# .copy() detaches the result from the original frame so later column
# assignments do not trigger SettingWithCopyWarning.
df = df[mask].copy()

# (optional) Check how many rows remain
print("Rows after dropping placeholders:", len(df))


Rows after dropping placeholders: 4311


In [9]:

# 2) Stratified split on 'category': hold out 30% of the rows first, then cut
#    that hold-out in half to obtain the validation and test sets.
train, temp = train_test_split(
    df,
    test_size=0.30,
    stratify=df['category'],
    random_state=42,
)
val, test = train_test_split(
    temp,
    test_size=0.50,
    stratify=temp['category'],
    random_state=42,
)

# Save each partition to its own CSV file
for split_frame, split_path in ((train, 'train.csv'), (val, 'val.csv'), (test, 'test.csv')):
    split_frame.to_csv(split_path, index=False)

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")


Train: 3017, Val: 647, Test: 647


# DATA AUGMENTATION

Extraction of the minority samples

In [10]:
# Reload the training split from disk and count the reports per category.
train = pd.read_csv("train.csv")
counts = train['category'].value_counts()
# After the category merge, every class with fewer than 150 training reports
# is treated as a minority ("small") class.
small_cats = [category for category, n_reports in counts.items() if n_reports < 150]
# Keep only the training rows that belong to one of those small classes.
minority_df = train.loc[train['category'].isin(small_cats)]


AUGMENTATION OF THE MINORITY CLASSES (AFTER CLASS MERGING) - METHODS: Back-translation, Contextual substitution

In [11]:
pip install sentencepiece


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\elena\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [12]:
from transformers import MarianMTModel, MarianTokenizer
import nlpaug.augmenter.word as naw

# Back-translation helpers (source language -> pivot language -> source language).

# Loaded (tokenizer, model) pairs keyed by (src, tgt). Without this cache every
# call would reload two tokenizers and two models from disk, which dominates the
# runtime when the function is called once or more per report.
_MARIAN_CACHE = {}


def _load_marian(src, tgt):
    """Return the (tokenizer, model) pair for ``src -> tgt``, loading it at most once."""
    key = (src, tgt)
    if key not in _MARIAN_CACHE:
        name = f'Helsinki-NLP/opus-mt-{src}-{tgt}'
        _MARIAN_CACHE[key] = (
            MarianTokenizer.from_pretrained(name),
            MarianMTModel.from_pretrained(name),
        )
    return _MARIAN_CACHE[key]


def _translate(text, src, tgt):
    """Translate ``text`` from ``src`` to ``tgt`` and return the decoded string."""
    tokenizer, model = _load_marian(src, tgt)
    # truncation=True cuts inputs that exceed the tokenizer's maximum length.
    batch = tokenizer(text, return_tensors="pt", truncation=True)
    generated = model.generate(**batch)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translate(text, src="en", mid="fr"):
    """Paraphrase ``text`` by translating it to a pivot language and back.

    Args:
        text: The text to paraphrase, written in language ``src``.
        src: Language code of the input and of the returned text.
        mid: Language code of the pivot language (French by default).

    Returns:
        The text translated ``src -> mid -> src`` as a single string.
    """
    pivot_text = _translate(text, src, mid)
    return _translate(pivot_text, mid, src)

# Contextual substitution augmenter
aug_ctx = naw.ContextualWordEmbsAug(
    model_path="distilbert-base-cased",
    action="substitute"
)


def _augment_text(augmenter, text):
    """Return one augmented string from ``augmenter.augment(text)``.

    Recent nlpaug releases return a list of strings even for a single input,
    while older ones return a plain string. Unwrapping here keeps the 'report'
    column as text instead of a stringified list.
    """
    result = augmenter.augment(text)
    if isinstance(result, (list, tuple)):
        # Fall back to the input if the augmenter produced nothing.
        return result[0] if result else text
    return result


augmented = []
for _, row in minority_df.iterrows():
    txt = row['report']
    cat = row['category']
    sev = row['severity']
    # Missing or blank reports cannot be translated or augmented; skip them.
    if not isinstance(txt, str) or not txt.strip():
        continue
    # Four synthetic variants per report: two back-translations (via French
    # and German) and two contextual word substitutions.
    variants = [
        back_translate(txt, mid="fr"),
        back_translate(txt, mid="de"),
        _augment_text(aug_ctx, txt),
        _augment_text(aug_ctx, txt),
    ]
    augmented.extend(
        {"report": variant, "category": cat, "severity": sev}
        for variant in variants
    )

# Explicit columns keep the frame schema stable even when nothing was augmented.
aug_df = pd.DataFrame(augmented, columns=["report", "category", "severity"])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support sym

Union of training datasets (original train set with augmented data) & retraining

In [None]:
# Append the augmented rows to the training split, shuffle all rows with a
# fixed seed, and write the result to a new CSV.
balanced_train = (
    pd.concat([train, aug_df], ignore_index=True)
    .sample(frac=1, random_state=42)
)
balanced_train.to_csv("train_augmented.csv", index=False)


: 