In [None]:
import camel_tools
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
# NOTE: camel_tools.utils.normalize only provides Unicode, alef, alef-maksura and
# teh-marbuta normalizers. The previously imported hamza/ligature/lam-alef/tashkeel
# helpers do not exist in the library and made this cell fail with ImportError.
from camel_tools.utils.normalize import (
    normalize_alef_ar,
    normalize_alef_maksura_ar,
    normalize_teh_marbuta_ar,
    normalize_unicode,
)

# Arabic tatweel (kashida) is purely decorative elongation and carries no meaning.
TATWEEL = '\u0640'

# Character-level normalization steps, applied in order to the whole text.
_NORMALIZATION_STEPS = (
    normalize_unicode,          # Unicode compatibility normalization (folds ligatures / presentation forms)
    dediac_ar,                  # strip Arabic diacritics
    normalize_alef_ar,          # unify alef variants (incl. hamza-carrying alefs)
    normalize_alef_maksura_ar,  # alef maksura -> yeh
    normalize_teh_marbuta_ar,   # teh marbuta -> heh
)


def clean_and_normalize_arabic_text(text: str) -> str:
    """
    Clean and normalize Arabic text using `CaMEL` library.

    The text is Unicode-normalized, stripped of diacritics and tatweel, has its
    alef / alef-maksura / teh-marbuta variants unified, and is then tokenized
    with ``simple_word_tokenize`` and re-joined with single spaces.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or the ``NaN`` pandas
            uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text with tokens separated by single spaces; ``''`` when
        the input is not a string.
    """
    # Missing cells reach us as float NaN / None via DataFrame.apply.
    if not isinstance(text, str):
        return ''

    normalized = text
    for step in _NORMALIZATION_STEPS:
        normalized = step(normalized)
    normalized = normalized.replace(TATWEEL, '')

    # Tokenize and join the cleaned tokens back into a text string
    return ' '.join(simple_word_tokenize(normalized))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Read dataset
data = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the path to your dataset

# Split dataset into train and test sets.
# train_test_split returns slices taken from `data`; take explicit copies so the
# column assignments below write to independent frames instead of triggering
# pandas' SettingWithCopyWarning.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data = train_data.copy()
test_data = test_data.copy()

# Clean and normalize the Arabic text in the 'Matn' column
train_data['Cleaned_Text'] = train_data['Matn'].apply(clean_and_normalize_arabic_text)
test_data['Cleaned_Text'] = test_data['Matn'].apply(clean_and_normalize_arabic_text)

# TF-IDF Vectorization with n-gram support (unigrams, bigrams, trigrams).
# The vocabulary is fitted on the training split only and re-used for the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train = tfidf_vectorizer.fit_transform(train_data['Cleaned_Text'])
X_test = tfidf_vectorizer.transform(test_data['Cleaned_Text'])

# Initialize models with specified hyperparameters
models = {
    'GaussianNB': GaussianNB(var_smoothing=1e-9),
    'LogisticRegression': LogisticRegression(penalty='l2', multi_class='auto'),
    'SVM': SVC(kernel='rbf', C=4, gamma=0.125),
    # (100) is just the int 100; a single hidden layer of 100 units is the tuple (100,).
    # random_state pins the weight initialisation so reruns give the same metrics.
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), batch_size='auto', activation='relu',
                         solver='adam', random_state=42)
}

# Train and test models
for name, model in models.items():
    # GaussianNB does not accept scipy sparse matrices, while the TF-IDF features
    # are sparse; densify for that model only (this can be memory-hungry with a
    # large n-gram vocabulary). The other estimators consume the sparse matrix directly.
    if isinstance(model, GaussianNB):
        train_features, test_features = X_train.toarray(), X_test.toarray()
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, train_data['Label'])
    predictions = model.predict(test_features)

    # Calculate metrics (precision, recall, f1-score)
    metrics = classification_report(test_data['Label'], predictions)
    print(f"Metrics for {name}:")
    print(metrics)


In [None]:
# The code below finds the best hyperparameters for the models
# using GridSearchCV from the sklearn.model_selection module.
# GridSearchCV performs an exhaustive search over a specified parameter grid to find the best parameters.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Rest of your code to load the data and clean text remains the same

# Base Logistic Regression estimator whose hyperparameters are tuned below
logreg = LogisticRegression(multi_class='auto', max_iter=1000)

# Candidate values explored for Logistic Regression
param_grid_logreg = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 1, 10],
    solver=['liblinear', 'saga'],
)

# Exhaustive 3-fold cross-validated search, ranked by accuracy
grid_search_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid_logreg,
    scoring='accuracy',
    cv=3,
)
grid_search_logreg.fit(X_train, train_data['Label'])

# Report the winning configuration and its mean cross-validated score
print("Best Parameters for Logistic Regression:", grid_search_logreg.best_params_)
print("Best Score for Logistic Regression:", grid_search_logreg.best_score_)


In [None]:
from sklearn.naive_bayes import GaussianNB

# Base Gaussian Naive Bayes estimator to tune
gnb = GaussianNB()

# var_smoothing is the only hyperparameter searched here; adjust the candidates as needed
param_grid_gnb = dict(var_smoothing=[1e-9, 1e-8, 1e-7])

# Exhaustive 3-fold cross-validated search, ranked by accuracy
grid_search_gnb = GridSearchCV(
    estimator=gnb,
    param_grid=param_grid_gnb,
    scoring='accuracy',
    cv=3,
)
# The TF-IDF matrix is converted to a dense array before being handed to GaussianNB
grid_search_gnb.fit(X_train.toarray(), train_data['Label'])

# Report the winning configuration and its mean cross-validated score
print("Best Parameters for GaussianNB:", grid_search_gnb.best_params_)
print("Best Score for GaussianNB:", grid_search_gnb.best_score_)


In [None]:
from sklearn.svm import SVC

# Parameters grid for SVM.
# `gamma` is only used by the rbf kernel (the linear kernel ignores it), so the
# grid is split per kernel. Crossing gamma with kernel='linear' would refit each
# identical linear model once per gamma value for no benefit.
param_grid_svm = [
    {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [1, 0.1, 0.01]},
    {'kernel': ['linear'], 'C': [1, 10, 100]},
    # Add more parameters as needed
]

# GridSearchCV for SVM (3-fold cross-validation, ranked by accuracy)
svm = SVC()
grid_search_svm = GridSearchCV(svm, param_grid=param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train, train_data['Label'])

# Best parameters and best score for SVM
# (best_params_ contains 'gamma' only when the rbf kernel wins)
print("Best Parameters for SVM:", grid_search_svm.best_params_)
print("Best Score for SVM:", grid_search_svm.best_score_)


In [None]:
from sklearn.neural_network import MLPClassifier

# Parameters grid for MLP
param_grid_mlp = {
    'hidden_layer_sizes': [(100,), (50,50), (100,50,100)],
    'alpha': [0.0001, 0.001, 0.01],
    'activation': ['logistic', 'tanh', 'relu']
    # Add more parameters as needed
}

# GridSearchCV for MLP (3-fold cross-validation, ranked by accuracy).
# random_state fixes the weight initialisation and shuffling so every candidate
# is compared under the same seed and the search is reproducible; 42 matches
# the seed used for the train/test split.
mlp = MLPClassifier(max_iter=500, random_state=42)
grid_search_mlp = GridSearchCV(mlp, param_grid=param_grid_mlp, cv=3, scoring='accuracy')
grid_search_mlp.fit(X_train, train_data['Label'])

# Best parameters and best score for MLP
print("Best Parameters for MLP:", grid_search_mlp.best_params_)
print("Best Score for MLP:", grid_search_mlp.best_score_)
