data loading and Prepocessing

In [21]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight

# Load datasets
train_df = pd.read_csv("C:\\Users\\bivin\\Favorites\\Desktop\\internship\\train_trimmed.csv", encoding='utf-8')
test_df = pd.read_csv("C:\\Users\\bivin\\Favorites\\Desktop\\internship\\test_trimmed.csv", encoding='utf-8')

# Preprocess commentText column
def clean_comment_column(df):
    df['commentText'] = df['commentText'].astype(str).str.strip()
    df['commentText'].replace('', np.nan, inplace=True)
    df.dropna(subset=['commentText'], inplace=True)
    return df

# Keep only labels 0 and 1
train_df = train_df[train_df['label'].isin([0, 1])]

# Clean text columns
train_df = clean_comment_column(train_df)
test_df = clean_comment_column(test_df)

#Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_df['label']), y=train_df['label'])
class_weight_dict = dict(zip(np.unique(train_df['label']), class_weights))
print(f"Class weights: {class_weight_dict}")

Class weights: {0.0: 0.7306224310041104, 1.0: 1.5840229153405474}


In [23]:
# Total number of training samples
total_samples = len(test_df)

# Count of each label
label_counts = train_df['label'].value_counts().sort_index()

print(f"Total samples: {total_samples}")
print(f"Label 0 (Non-Harassment): {label_counts[0]}")
print(f"Label 1 (Harassment): {label_counts[1]}")


Total samples: 5000
Label 0 (Non-Harassment): 3406
Label 1 (Harassment): 1571


Preprocessing

In [26]:
import re
import unicodedata
import emoji

# Text cleaning function
def deep_clean_text(text):
    # Fix common unicode issues
    text = unicodedata.normalize("NFKC", text)
    
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')  # Removes all emojis cleanly
    
    # Remove mentions (@username)
    text = re.sub(r'@[\w]+', '', text)  # Remove mentions (e.g., @username)
    
    # Remove URLs (http or https)
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs (http:// or www.)
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply it to the commentText columns
train_df['commentText'] = train_df['commentText'].apply(deep_clean_text)
test_df['commentText'] = test_df['commentText'].apply(deep_clean_text)


In [28]:
# Initialize mBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize and encode sequences
def encode_texts(texts, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf'  # Changed to tf tensors
        )
        
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
    
    return tf.concat(input_ids, axis=0), tf.concat(attention_masks, axis=0)

In [30]:
# Encode train and test data
X_train_ids, X_train_masks = encode_texts(train_df['commentText'], tokenizer)
X_test_ids, X_test_masks = encode_texts(test_df['commentText'], tokenizer)

y_train = train_df['label'].values

# Convert y_train to one-hot encoding for categorical crossentropy
y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes=2)


fetaure extration

In [36]:
import tensorflow as tf
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

# Define a custom BERT wrapper layer
class BertLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(BertLayer, self).__init__(**kwargs)
        self.bert = TFBertModel.from_pretrained('bert-base-multilingual-cased')
    def build(self, input_shape):
        self.bert = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        super(BertLayer, self).build(input_shape)
    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.pooler_output  # Return the [CLS] token representation
    def get_config(self):
        config = super(BertLayer, self).get_config()
        return config


# Define inputs
input_ids = Input(shape=(128,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(128,), dtype=tf.int32, name='attention_mask')

# Use the custom BertLayer
bert_layer = BertLayer()
pooled_output = BertLayer()([input_ids, attention_mask])

# Add classifier layers
x = Dense(128, activation='relu')(pooled_output)
x = Dropout(0.3)(x)
output = Dense(2, activation='softmax')(x)



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model

In [39]:
# Build the model
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='categorical_crossentropy',  # Using categorical since we have one-hot encoded targets
    metrics=['accuracy']
)

In [40]:
from sklearn.model_selection import train_test_split

# Split train into train and validation (10% for validation)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)


In [None]:
history = model.fit(
    [X_train_ids, X_train_masks], 
    y_train_onehot,
    validation_split=0.1,
    epochs=1, class_weight=class_weight_dict,
    batch_size=32
)


[1m114/140[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m4:03[0m 9s/step - accuracy: 0.5619 - loss: 0.6873

In [None]:
model.save('mbert_model.keras')  # Recommended format

In [None]:
from keras.models import load_model

loaded_model = load_model('mbert_model.keras', custom_objects={'BertLayer': BertLayer})


In [None]:
# Extract validation data from the training set (using the 10% that was used as validation during training)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Get indices for the validation split (last 10% of the training data)
val_split = 0.1
val_size = int(len(X_train_ids) * val_split)
train_size = len(X_train_ids) - val_size

# Get validation data
X_val_ids = X_train_ids[train_size:]
X_val_masks = X_train_masks[train_size:]
y_val_true = np.argmax(y_train_onehot[train_size:], axis=1)

# Make predictions on validation data
val_predictions = model.predict([X_val_ids, X_val_masks])
y_val_pred = np.argmax(val_predictions, axis=1)

# Calculate performance metrics
accuracy = accuracy_score(y_val_true, y_val_pred)
precision = precision_score(y_val_true, y_val_pred)
recall = recall_score(y_val_true, y_val_pred)
f1 = f1_score(y_val_true, y_val_pred)

# Display metrics
print("\n===== Model Performance on Validation Set =====")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Generate detailed classification report
print("\n===== Classification Report =====")
print(classification_report(y_val_true, y_val_pred))

# Create confusion matrix
cm = confusion_matrix(y_val_true, y_val_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Class 0', 'Class 1'],
            yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig("confusion_matrix.png")
plt.show()

# Visualize training history
plt.figure(figsize=(12, 5))

# Plot accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.savefig("training_history.png")
plt.show()

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Parameters for the LSTM model
max_words = 10000  # Maximum vocabulary size
max_len = 128      # Maximum sequence length
embedding_dim = 100  # Embedding dimension

# Tokenize the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['commentText'])

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(train_df['commentText'])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)

# Create training and validation splits (matching what you did with mBERT)
val_split = 0.1
val_size = int(len(X_train_padded) * val_split)
train_size = len(X_train_padded) - val_size

X_train_lstm = X_train_padded[:train_size]
X_val_lstm = X_train_padded[train_size:]
y_train_lstm = y_train_onehot[:train_size]
y_val_lstm = y_train_onehot[train_size:]

# Build the LSTM model
lstm_model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

# Compile the model
lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train the model
lstm_history = lstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_lstm, y_val_lstm)
)

# Evaluate LSTM model on validation data
y_val_lstm_pred = lstm_model.predict(X_val_lstm)
y_val_lstm_pred_classes = np.argmax(y_val_lstm_pred, axis=1)
y_val_lstm_true = np.argmax(y_val_lstm, axis=1)

# Calculate metrics for LSTM model
lstm_accuracy = accuracy_score(y_val_lstm_true, y_val_lstm_pred_classes)
lstm_precision = precision_score(y_val_lstm_true, y_val_lstm_pred_classes)
lstm_recall = recall_score(y_val_lstm_true, y_val_lstm_pred_classes)
lstm_f1 = f1_score(y_val_lstm_true, y_val_lstm_pred_classes)

print("\n===== LSTM Model Performance on Validation Set =====")
print(f"Accuracy: {lstm_accuracy:.4f}")
print(f"Precision: {lstm_precision:.4f}")
print(f"Recall: {lstm_recall:.4f}")
print(f"F1 Score: {lstm_f1:.4f}")

print("\n===== LSTM Classification Report =====")
print(classification_report(y_val_lstm_true, y_val_lstm_pred_classes))

# Prepare test data for LSTM predictions
X_test_seq = tokenizer.texts_to_sequences(test_df['commentText'])
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

# Make predictions on test data with LSTM model
lstm_test_predictions = lstm_model.predict(X_test_padded)
lstm_predicted_classes = np.argmax(lstm_test_predictions, axis=1)


# Compare the two models
print("\n===== Model Comparison =====")
print(f"{'Metric':<15} {'mBERT':<10} {'LSTM':<10}")
print(f"{'-'*35}")
print(f"{'Accuracy':<15} {accuracy:<10.4f} {lstm_accuracy:<10.4f}")
print(f"{'Precision':<15} {precision:<10.4f} {lstm_precision:<10.4f}")
print(f"{'Recall':<15} {recall:<10.4f} {lstm_recall:<10.4f}")
print(f"{'F1 Score':<15} {f1:<10.4f} {lstm_f1:<10.4f}")

# Visualize comparison
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
mbert_scores = [accuracy, precision, recall, f1]
lstm_scores = [lstm_accuracy, lstm_precision, lstm_recall, lstm_f1]

plt.figure(figsize=(10, 6))
x = np.arange(len(metrics))
width = 0.35

plt.bar(x - width/2, mbert_scores, width, label='mBERT')
plt.bar(x + width/2, lstm_scores, width, label='LSTM')

plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.title('Performance Comparison: mBERT vs LSTM')
plt.xticks(x, metrics)
plt.ylim(0, 1.0)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig("model_comparison.png")
plt.show()

In [None]:
# Save mBERT predictions
mbert_submission_df = test_df.copy()
mbert_submission_df['predicted_label'] = np.argmax(model.predict([X_test_ids, X_test_masks]), axis=1)
mbert_submission_df.to_excel("C:\\Users\\bivin\\Favorites\\Desktop\\internship\\mbert_predictions.xlsx", index=False)
print("mBERT Predictions saved to 'mbert_predictions.xlsx'")


In [None]:
# Create submission DataFrame for LSTM
lstm_submission_df = test_df.copy()
lstm_submission_df['predicted_label'] = lstm_predicted_classes
lstm_submission_df.to_excel("C:\\Users\\bivin\\Favorites\\Desktop\\internship\\lstm_predictions.xlsx", index=False)
print("LSTM Predictions saved to 'lstm_predictions.xlsx'")


In [None]:
import joblib  # or pickle, or torch.save() depending on your model

# Example for a scikit-learn model
joblib.dump(lstm_model, 'lstm_model.pkl')


In [None]:
import joblib  # or pickle, or torch.save() depending on your model

# Example for a scikit-learn model
joblib.dump(model, 'bert_model.pkl')


In [None]:
!pip install fastapi uvicorn nest_asyncio openpyxl pandas


In [None]:
import nest_asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pandas as pd

# Allow running Uvicorn in Jupyter
nest_asyncio.apply()

# Load prediction Excel files
mbert_df = pd.read_excel("C:\\Users\\bivin\\Favorites\\Desktop\\internship\\mbert_predictions.xlsx")
lstm_df = pd.read_excel("C:\\Users\\bivin\\Favorites\\Desktop\\internship\\lstm_predictions.xlsx")

# Ensure lowercase and no extra spaces for matching
mbert_df["commentText"] = mbert_df["commentText"].astype(str).str.strip().str.lower()
lstm_df["commentText"] = lstm_df["commentText"].astype(str).str.strip().str.lower()

# Setup FastAPI
app = FastAPI()

# Define the request body model
class SentenceInput(BaseModel):
    sentence: str

@app.get("/")
def read_root():
    return {"message": "Harassment Detection API is running"}

@app.post("/predict")
def predict(input_data: SentenceInput):
    sent = input_data.sentence.strip().lower()
    
    # Match using 'commentText' column instead of non-existent 'sentence'
    mbert_result = mbert_df[mbert_df["commentText"] == sent]
    lstm_result = lstm_df[lstm_df["commentText"] == sent]
    
    if mbert_result.empty or lstm_result.empty:
        raise HTTPException(status_code=404, detail="Sentence not found in predictions")

    return {
        "commentText": input_data.sentence,
        "model_mbert": int(mbert_result["predicted_label"].values[0]),
        "model_lstm": int(lstm_result["predicted_label"].values[0])
    }

# Run the API server
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)


In [None]:
pip install fuzzywuzzy


In [None]:
pip install python-Levenshtein

In [None]:
import nest_asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn as nn

# Allow running Uvicorn in Jupyter
nest_asyncio.apply()

# Setup FastAPI
app = FastAPI()

# Load mBERT model and tokenizer
mbert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
mbert_model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

# Load LSTM model (assuming the model is saved as a PyTorch model file)
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        output = self.fc(hn[-1])
        return output

# Assuming the model is stored in 'lstm_model.pth'
lstm_model = LSTMModel(input_size=300, hidden_size=128, output_size=2)  # Example params
lstm_model.load_state_dict(torch.load("lstm_model.pth"))
lstm_model.eval()

# Class for input data
class SentenceInput(BaseModel):
    sentence: str

@app.get("/")
def read_root():
    return {"message": "Harassment Detection API is running"}

@app.post("/predict")
def predict(input_data: SentenceInput):
    # Get the sentence from input
    sent = input_data.sentence.strip()

    # Process and predict with mBERT
    mbert_input = mbert_tokenizer(sent, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        mbert_output = mbert_model(**mbert_input)
    mbert_pred = torch.argmax(mbert_output.logits, dim=1).item()  # Predict the label

    # Process and predict with LSTM (assuming sentence is tokenized and embedded)
    # Here, we use dummy embedding for LSTM prediction, replace with real embeddings
    sentence_embedding = torch.randn(1, 1, 300)  # Example random tensor, replace with actual embeddings
    with torch.no_grad():
        lstm_output = lstm_model(sentence_embedding)
    lstm_pred = torch.argmax(lstm_output, dim=1).item()

    return {
        "commentText": input_data.sentence,
        "model_mbert": mbert_pred,
        "model_lstm": lstm_pred
    }

# Run the API
uvicorn.run(app, host="127.0.0.1", port=8000)
