In [15]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Define file paths
path_english_multilingual = '/home/philipp/AndroidStudioProjects/AntiSMSScam/dataset/data-en-hi-de-fr.csv'
path_spam_ham = '/home/philipp/AndroidStudioProjects/AntiSMSScam/dataset/spam-ham-dataset.csv'

# Load datasets
english_multilingual_data = pd.read_csv(path_english_multilingual)
spam_ham_data = pd.read_csv(path_spam_ham, encoding='ISO-8859-1')

# Preprocess the first dataset (data-en-hi-de-fr.csv)
english_data = english_multilingual_data[['labels', 'text']]
german_data = english_multilingual_data[['labels', 'text_de']]
german_data.columns = ['labels', 'text']  # Rename for consistency

# Concatenate English and German data
english_multilingual_data = pd.concat([english_data, german_data])

# Preprocess the second dataset (spam-ham-dataset.csv)
spam_ham_data = spam_ham_data[['v1', 'v2']]
spam_ham_data.columns = ['labels', 'text']

# Combine datasets
combined_data = pd.concat([english_multilingual_data, spam_ham_data])

# Clean the combined dataset
combined_data['labels'].fillna(combined_data['labels'].dropna().unique()[0], inplace=True)  # Fill NaN values
combined_data = combined_data.dropna(subset=['labels'])  # Drop rows with NaN in labels

# Remove duplicates
combined_data = combined_data.drop_duplicates(subset='text')

# Encode labels
label_encoder = LabelEncoder()
combined_data['labels'] = label_encoder.fit_transform(combined_data['labels'])

# Split data into train and test sets
X = combined_data['text']
y = combined_data['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to features
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_vec, y_train)

# Evaluate model
y_pred = model.predict(X_test_vec)
y_pred_proba = model.predict_proba(X_test_vec)[:, 1]  # Probability of positive class
f1 = f1_score(y_test, y_pred, average='weighted')  # Use weighted average for multiclass
auc = roc_auc_score(y_test, y_pred_proba)  # AUC for binary classification

print(f"F1 Score: {f1}")
print(f"AUC Score: {auc}")

# Function to classify new messages
def classify_messages(messages):
    messages_vec = vectorizer.transform(messages)
    predictions = model.predict(messages_vec)
    prediction_labels = label_encoder.inverse_transform(predictions)
    return list(zip(messages, prediction_labels))

# Test the function with a list of 10 messages (5 English, 5 German)
test_messages = [
    "Hey, how are you doing? Long time no see!",
    "You've won a free ticket to the Bahamas. Call now to claim!",
    "Let's catch up over coffee next week.",
    "Your account has been compromised. Please reset your password immediately.",
    "Don't forget to submit the report by Monday.",
    "Wie geht es dir? Lange nicht mehr gesehen!",
    "Sie haben einen kostenlosen Flug auf die Bahamas gewonnen. Rufen Sie jetzt an, um Ihren Gewinn zu beanspruchen!",
    "Lass uns nächste Woche auf einen Kaffee treffen.",
    "Ihr Konto wurde kompromittiert. Bitte setzen Sie sofort Ihr Passwort zurück.",
    "Vergessen Sie nicht, den Bericht bis Montag einzureichen."
]

classified_messages = classify_messages(test_messages)
for message, label in classified_messages:
    print(f"Message: {message}\nClassified as: {label}\n")


F1 Score: 0.9791318196371965
AUC Score: 0.9756279327943588
Message: Hey, how are you doing? Long time no see!
Classified as: ham

Message: You've won a free ticket to the Bahamas. Call now to claim!
Classified as: spam

Message: Let's catch up over coffee next week.
Classified as: ham

Message: Your account has been compromised. Please reset your password immediately.
Classified as: ham

Message: Don't forget to submit the report by Monday.
Classified as: ham

Message: Wie geht es dir? Lange nicht mehr gesehen!
Classified as: ham

Message: Sie haben einen kostenlosen Flug auf die Bahamas gewonnen. Rufen Sie jetzt an, um Ihren Gewinn zu beanspruchen!
Classified as: spam

Message: Lass uns nächste Woche auf einen Kaffee treffen.
Classified as: ham

Message: Ihr Konto wurde kompromittiert. Bitte setzen Sie sofort Ihr Passwort zurück.
Classified as: ham

Message: Vergessen Sie nicht, den Bericht bis Montag einzureichen.
Classified as: ham



In [16]:
# !pip install onnx onnxmltools skl2onnx
import onnxmltools
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Define the initial type
initial_type = [('float_input', FloatTensorType([None, X_train_vec.shape[1]]))]

# Convert the model
onnx_model = convert_sklearn(model, initial_types=initial_type)

# Save the ONNX model
with open("xgboost_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())


/bin/bash: /home/philipp/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting skl2onnx
  Obtaining dependency information for skl2onnx from https://files.pythonhosted.org/packages/26/80/836824c62ff0923b4c3b8af8332170bdc3ccb469a220535b40405a93b4fb/skl2onnx-1.16.0-py2.py3-none-any.whl.metadata
  Downloading skl2onnx-1.16.0-py2.py3-none-any.whl.metadata (3.2 kB)
Downloading skl2onnx-1.16.0-py2.py3-none-any.whl (298 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.5/298.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m31m2.4 MB/s[0m eta [36m0:00:01[0m
[?25h[33mDEPRECATION: pytorch-lightning 1.6.4 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github

MissingShapeCalculator: Unable to find a shape calculator for type '<class 'xgboost.sklearn.XGBClassifier'>'.
It usually means the pipeline being converted contains a
transformer or a predictor with no corresponding converter
implemented in sklearn-onnx. If the converted is implemented
in another library, you need to register
the converted so that it can be used by sklearn-onnx (function
update_registered_converter). If the model is not yet covered
by sklearn-onnx, you may raise an issue to
https://github.com/onnx/sklearn-onnx/issues
to get the converter implemented or even contribute to the
project. If the model is a custom model, a new converter must
be implemented. Examples can be found in the gallery.
