Preprocessing

In [27]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib
import pickle

# Load the dataset
data = pd.read_csv("/content/Language Detection.csv")

# Function for text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]
    processed_text = " ".join(filtered_text)
    return processed_text

# Apply preprocessing to the 'Text' column
data['Processed_Text'] = data['Text'].apply(preprocess_text)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['Processed_Text'])

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

# Convert labels to numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['Language'])

# Save the label encoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Convert sparse matrices to dense for MLP
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Define the MLP model
mlp_model = Sequential([
    Dense(64, input_dim=X_train_dense.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(np.unique(y_train)), activation='softmax')
])

# Compile the MLP model
mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the MLP model
mlp_history = mlp_model.fit(X_train_dense, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the MLP model
mlp_y_pred = np.argmax(mlp_model.predict(X_test_dense), axis=-1)
mlp_accuracy = accuracy_score(y_test, mlp_y_pred)
mlp_precision = precision_score(y_test, mlp_y_pred, average='weighted')
mlp_recall = recall_score(y_test, mlp_y_pred, average='weighted')
mlp_f1 = f1_score(y_test, mlp_y_pred, average='weighted')

print("Evaluation Metrics for MLP:")
print(f"Accuracy: {mlp_accuracy}")
print(f"Precision: {mlp_precision}")
print(f"Recall: {mlp_recall}")
print(f"F1 Score: {mlp_f1}")

# Initialize other classifiers
classifiers = {
    "SVM": SVC(kernel='linear'),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression()
}

# Train and evaluate each classifier
best_model = None
best_accuracy = 0

for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)  # no need to convert to dense
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\nEvaluation Metrics for {name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = classifier

# Check if MLP is the best model
if mlp_accuracy > best_accuracy:
    best_model = mlp_model
    model_type = 'MLP'
else:
    model_type = 'Classifier'

print(f"\nBest Model ({model_type}): {best_model}")

# Save the best model to a file
if model_type == 'MLP':
    mlp_model.save('best_model_mlp.h5')
else:
    joblib.dump(best_model, 'best_model_classifier.pkl')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Evaluation Metrics for MLP:
Accuracy: 0.9743713733075435
Precision: 0.9760084601459276
Recall: 0.9743713733075435
F1 Score: 0.9746678723490175

Evaluation Metrics for SVM:
Accuracy: 0.925531914893617
Precision: 0.9471013470450899
Recall: 0.925531914893617
F1 Score: 0.9302167779900706

Evaluation Metrics for Naive Bayes:
Accuracy: 0.9279497098646035
Precision: 0.9454426709374064
Recall: 0.9279497098646035
F1 Score: 0.929817778144432

Evaluation Metrics for Random Forest:
Accuracy: 0.9250483558994197
Precision: 0.9385156019731358
Recall: 0.9250483558994197
F1 Score: 0.9274217802367858

Evaluation Metrics for Logistic Regression:
Accuracy: 0.9076402321083172
Precision: 0.93878374310672
Recall: 0.9076402321083172
F1 Score: 0.9141610922914473

Best Model (MLP): <keras.src.engine.sequential.Sequential object at 0x7f25961bd990>


  saving_api.save_model(


In [30]:
from google.colab import files

files.download('/content/best_model_mlp.h5')
files.download('/content/label_encoder.pkl')
files.download('/content/tfidf_vectorizer.pkl')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import load_model
import tensorflow as tf
import pickle

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Load the saved TF-IDF vectorizer
with open('/content/tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Load the saved MLP model
model = load_model('/content/best_model_mlp.h5')

# Load the label encoder used during training
with open('/content/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)  # 

# Function for text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]
    processed_text = " ".join(filtered_text)
    return processed_text

# Get input from the user
user_input = input('Enter a sentence to predict its language: ')

# Preprocess the user input
print("User Input (before preprocessing):", user_input)
processed_input = preprocess_text(user_input)
print("Processed Input:", processed_input)

# TF-IDF Vectorization
X_input = tfidf_vectorizer.transform([processed_input])  # Transform the processed input

# Convert the sparse matrix to a TensorFlow SparseTensor
X_input_coo = X_input.tocoo()
X_input_sparse_tensor = tf.sparse.SparseTensor(
    indices=np.vstack((X_input_coo.row, X_input_coo.col)).T,
    values=X_input_coo.data,
    dense_shape=X_input_coo.shape
)

# Reorder the sparse tensor indices if necessary
X_input_reordered = tf.sparse.reorder(X_input_sparse_tensor)

# Predict the language index
prediction = np.argmax(model.predict(X_input_reordered), axis=-1)

# Map the predicted language index back to the original language label
predicted_language = label_encoder.inverse_transform([prediction[0]])

# Print the predicted language
print(f'Predicted Language: {predicted_language[0]}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter a sentence to predict its language: Ciao Baby
User Input (before preprocessing): Ciao Baby
Processed Input: ciao baby
Predicted Language: Italian
