# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score, make_scorer
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Define file paths
path_english_multilingual = '../dataset/data-en-hi-de-fr.csv'
path_spam_ham = '../dataset/spam-ham-dataset.csv'

# Load datasets
english_multilingual_data = pd.read_csv(path_english_multilingual)
spam_ham_data = pd.read_csv(path_spam_ham, encoding='ISO-8859-1')

# Preprocess the first dataset (data-en-hi-de-fr.csv): take the English and
# German text columns and stack them under one shared 'text' column.
# rename() returns a new frame, so nothing is assigned onto a column selection.
english_data = english_multilingual_data[['labels', 'text']]
german_data = english_multilingual_data[['labels', 'text_de']].rename(
    columns={'text_de': 'text'}
)

# Concatenate English and German data (fresh index: no duplicated row labels)
english_multilingual_data = pd.concat([english_data, german_data], ignore_index=True)

# Preprocess the second dataset (spam-ham-dataset.csv): same two-column layout
spam_ham_data = spam_ham_data[['v1', 'v2']].rename(
    columns={'v1': 'labels', 'v2': 'text'}
)

# Combine datasets
combined_data = pd.concat([english_multilingual_data, spam_ham_data], ignore_index=True)

# Clean the combined dataset.
# Rows without a label are dropped rather than filled with an arbitrary class:
# inventing labels would inject wrong training targets. The previous chained
# `fillna(..., inplace=True)` on a column selection is also unreliable under
# pandas Copy-on-Write. Rows without text are dropped as well, because
# CountVectorizer cannot tokenize NaN.
combined_data = combined_data.dropna(subset=['labels', 'text'])

# Remove duplicates (keeps the first occurrence of each distinct text)
combined_data = combined_data.drop_duplicates(subset='text').reset_index(drop=True)

# Encode labels as integer class indices; the fitted encoder is kept so
# predictions can be mapped back to the original label values later.
label_encoder = LabelEncoder()
combined_data['labels'] = label_encoder.fit_transform(combined_data['labels'])

# Split data into train and test sets (80/20, fixed seed for repeatability)
X = combined_data['text']
y = combined_data['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to bag-of-words counts; the vocabulary is learned from the
# training split only and then reused for the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Define custom scoring function
def custom_score(y_true, y_pred):
    """Return the negated, cost-weighted count of misclassifications.

    Every (true, predicted) pair is looked up in a 2x2 penalty table: a true 0
    predicted as 1 costs 1, a true 1 predicted as 0 costs 3, and correct
    predictions cost nothing.  The total is negated, so 0 is the best possible
    value and costlier mistakes push the result further below zero.

    Args:
        y_true: Array-like of true class indices (0 or 1).
        y_pred: Array-like of predicted class indices (0 or 1), same shape
            as ``y_true``.

    Returns:
        The negated total penalty as a NumPy float (``-0.0`` for empty input).

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` do not have the same shape.
        IndexError: If a label cannot be used to index the 2x2 penalty table.
    """
    # Rows are the true class, columns the predicted class.
    # NOTE(review): assumes class 0 is ham and class 1 is spam -- confirm
    # against the label encoding used by the caller.
    penalty_matrix = np.array([
        [0, 1],  # penalty for predicting ham as spam
        [3, 0],  # penalty for predicting spam as ham (3 times as severe)
    ])
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        # Without this check NumPy would broadcast or silently misalign inputs.
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )
    if actual.size == 0:
        # An empty list converts to a float array, which cannot be an index.
        return -np.float64(0.0)
    # One vectorized lookup replaces the per-sample confusion-matrix loop.
    return -penalty_matrix[actual, predicted].sum(dtype=float)

# custom_score already returns the *negated* penalty (0 is best, more negative
# is worse), i.e. it is a "higher is better" score.  Passing
# greater_is_better=False would flip the sign a second time and make the
# scorer reward costlier models, so the score is used as-is.
custom_scorer = make_scorer(custom_score, greater_is_better=True)

# Train RandomForest model (seeded like the train/test split so runs repeat)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vec, y_train)

# Cross-validation with custom scorer: each fold yields the negated penalty,
# so the mean is printed directly (same value as before the sign fix).
scores = cross_val_score(model, X_train_vec, y_train, cv=5, scoring=custom_scorer)
print(f"Custom Score: {scores.mean()}")

# Evaluate model on the held-out test split
y_pred = model.predict(X_test_vec)
y_pred_proba = model.predict_proba(X_test_vec)[:, 1]  # Probability of the positive class
f1 = f1_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_pred_proba)

print(f"F1 Score: {f1}")
print(f"AUC Score: {auc}")

# Function to classify new messages
def classify_messages(messages):
    """Classify raw text messages with the fitted vectorizer and model.

    Args:
        messages: An iterable of message strings.  A single bare string is
            treated as one message.  One-shot iterables (e.g. generators) are
            supported because the input is materialized before use.

    Returns:
        A list of ``(message, label)`` tuples in input order, where ``label``
        is the original label value recovered through ``label_encoder``.
        An empty input yields an empty list.
    """
    if isinstance(messages, str):
        # Iterating a bare string would classify it character by character.
        messages = [messages]
    else:
        # The input is traversed twice (vectorizing, then pairing with the
        # predictions); a generator would be exhausted after the first pass.
        messages = list(messages)
    if not messages:
        # Nothing to predict; avoids handing a zero-sample matrix to the model.
        return []
    messages_vec = vectorizer.transform(messages)
    predictions = model.predict(messages_vec)
    prediction_labels = label_encoder.inverse_transform(predictions)
    return list(zip(messages, prediction_labels))

# Smoke test of classify_messages: five English samples, then five German ones
english_samples = [
    "Hey, how are you doing? Long time no see!",
    "You've won a free ticket to the Bahamas. Call now to claim!",
    "Let's catch up over coffee next week.",
    "Your account has been compromised. Please reset your password immediately.",
    "Don't forget to submit the report by Monday.",
]
german_samples = [
    "Wie geht es dir? Lange nicht mehr gesehen!",
    "Sie haben einen kostenlosen Flug auf die Bahamas gewonnen. Rufen Sie jetzt an, um Ihren Gewinn zu beanspruchen!",
    "Lass uns nächste Woche auf einen Kaffee treffen.",
    "Ihr Konto wurde kompromittiert. Bitte setzen Sie sofort Ihr Passwort zurück.",
    "Vergessen Sie nicht, den Bericht bis Montag einzureichen.",
]
test_messages = english_samples + german_samples

# Classify the whole batch in one call, then echo each message with its label
classified_messages = classify_messages(test_messages)
for message, label in classified_messages:
    print(f"Message: {message}\nClassified as: {label}\n")

# Export the trained model to ONNX.  The graph declares a single float input
# named 'float_input' with a free batch dimension and as many columns as
# X_train_vec has features.
feature_count = X_train_vec.shape[1]
onnx_input_spec = [('float_input', FloatTensorType([None, feature_count]))]
onnx_model = convert_sklearn(model, initial_types=onnx_input_spec)

# Write the serialized ONNX graph to disk in binary mode
with open("random_forest_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
