In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import kdeplot
from bs4 import BeautifulSoup
import re, string 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
import unicodedata
import html
import numpy as np
import nltk

In [None]:
!pip install BeautifulSoup4

In [None]:
# Competition CSVs, read from the Kaggle input mount (paths relative to the working dir).
train_csv_path = "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
test_csv_path = "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Quick structural overview of the raw training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Peek at the first rows of the raw training frame.
train.head()

In [None]:
# Reproducible 10% row sample of the training frame (seeded with 42).
train_sample = train.sample(
    frac=0.1,
    random_state=42,
    axis=0,
)

## Text Cleaning

In [None]:
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
import unicodedata
import html

# Tokens dropped during cleaning: NLTK's English stopwords plus every
# character of string.punctuation, held in one set for O(1) lookups.
stop = set(stopwords.words('english')) | set(string.punctuation)

# Patterns used by clean_text, compiled once instead of on every call.
_MULTI_SPACE_RE = re.compile(r'  +')
_BRACKETED_RE = re.compile(r'\[[^]]*\]')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z ]')

# (fragment, replacement) pairs for leftover HTML-entity / escape residue.
# Applied strictly in this order; the bare-backslash rule must stay last.
_TEXT_FIXUPS = (
    ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
    ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'),
    ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'), ('\\', ' \\ '),
)


def clean_text(text):
    """Normalize one raw comment into a lowercase, space-separated token string.

    Steps, in order: lowercase; repair HTML-entity residue and unescape HTML;
    drop non-ASCII characters; remove ``[...]`` spans and ``http...`` URLs;
    replace each run of digits with the token ``num``; strip every remaining
    non-alphabetic character; remove stopwords (module-level ``stop`` set);
    lemmatize each word as a noun and then as a verb.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing value) is treated as empty.

    Returns:
        The cleaned text as a single string; ``""`` when nothing survives.
    """
    # Missing values reach us as float NaN; there is nothing to clean.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    for fragment, replacement in _TEXT_FIXUPS:
        text = text.replace(fragment, replacement)
    text = _MULTI_SPACE_RE.sub(' ', html.unescape(text))

    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove between square brackets
    text = _BRACKETED_RE.sub('', text)

    # remove URLs
    text = _URL_RE.sub('', text)

    # Replace numbers with a "num" token. This has to happen BEFORE the
    # non-alphabetic strip below, otherwise every digit is already gone and
    # the substitution can never match.
    text = _DIGITS_RE.sub(' num ', text)

    # Turn newlines/tabs into plain spaces first: the strip below keeps only
    # letters and ' ', so any other whitespace would be deleted and the words
    # on either side of it glued together.
    text = _WHITESPACE_RE.sub(' ', text)

    # remove all remaining non-alphabetic characters (also covers '@' and '#')
    text = _NON_ALPHA_RE.sub('', text)

    # remove stopwords from text
    words = [word for word in text.split() if word not in stop]

    # lemmatize words: noun pass first, then verb pass
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    words = [lemmatizer.lemmatize(word, pos='v') for word in words]

    return " ".join(words)

In [None]:
# Work on a copy: a plain `train_data = train` is only an alias, so the
# binarization below would silently overwrite the raw scores in `train` too.
train_data = train.copy()

# Turn each toxicity sub-type score into a 0/1 label (1 when the score is at
# least 0.25). 'target' is intentionally left untouched here.
for subtype_column in ['threat', 'severe_toxicity', 'insult', 'obscene', 'identity_attack']:
    train_data[subtype_column] = np.where(train_data[subtype_column] >= .25, 1, 0)

In [None]:
# Columns carried forward: the comment text, the overall toxicity score,
# and the five toxicity sub-type labels.
possible_labels = [
    "comment_text",
    "target",
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
]


In [None]:
# Keep only the comments whose overall 'target' score is at least 0.5, restricted
# to the columns in possible_labels. One .loc selection plus an explicit copy
# makes this an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
train_data_final = train_data.loc[train_data['target'] >= .5, possible_labels].copy()
train_data_final.head()

In [None]:
# The five toxicity sub-type columns used as multi-label targets.
possible_labels1 = ["severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
# Target matrix built from the label columns only; the previous version indexed
# with `possible_labels`, which also pulled in 'comment_text' and 'target'.
targets = train_data_final[possible_labels1].values

In [None]:
# Number of rows left after filtering on 'target'.
len(train_data_final)

In [None]:
# Add the cleaned text as a new column. `.assign` returns a fresh frame, so this
# does not write into a filtered slice (no SettingWithCopyWarning) and the cell
# gives the same result when re-run.
train_data_final = train_data_final.assign(
    clean_comment_text=train_data_final['comment_text'].apply(clean_text)
)

In [None]:
# Spot-check the first five rows, including the cleaned text column.
train_data_final.head(5)

In [None]:
# Sequences creation, truncation and padding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenizer vocabulary cap and fixed sequence length fed to the model
vocab_size = 10000
max_len = 18

# Fit the tokenizer on the cleaned comments; out-of-vocabulary words map to 'UNK'
tokenizer = Tokenizer(num_words=vocab_size, oov_token='UNK')
tokenizer.fit_on_texts(train_data_final['clean_comment_text'].tolist())

# Integer-encode every comment, then pad / cut at the END to exactly max_len tokens
X_train_seq = pad_sequences(
    tokenizer.texts_to_sequences(train_data_final['clean_comment_text']),
    maxlen=max_len,
    padding='post',
    truncating='post',
)

print(f"X_train shape: {X_train_seq.shape}")


In [None]:
# Multi-label target matrix: one integer column per entry of possible_labels1.
y_train = train_data_final[possible_labels1].to_numpy().astype(int)
print(f"y_train shape: {y_train.shape}")

## Train Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE: X_train_seq / y_train are rebound to the training portion, so re-running
# this cell without re-running the cells above splits the already-split data again.
split_parts = train_test_split(X_train_seq, y_train, test_size=0.2, random_state=42)
X_train_seq, X_val_seq, y_train, y_val = split_parts

for split_name, split_array in (
    ("X_train", X_train_seq),
    ("X_val", X_val_seq),
    ("y_train", y_train),
    ("y_val", y_val),
):
    print(f"{split_name} shape: {split_array.shape}")

In [None]:
# Distinct tokens seen while fitting the tokenizer (the full word_index, which
# is NOT limited by vocab_size).
unique_word_count = len(tokenizer.word_index)
print(f"Number of unique words: {unique_word_count}")

# Number of rows the embedding table needs. Keras word indices start at 1
# (0 is the padding index) and texts_to_sequences only emits indices below the
# tokenizer's num_words cap, so the table needs max_index + 1 rows, capped at
# vocab_size. Sizing it as len(word_index) was one row short for small
# vocabularies and allocated rows for words that are never emitted otherwise.
num_words = min(vocab_size, unique_word_count + 1)

In [None]:
import numpy as np

def load_embeddings(embedding_file, embedding_dim=None):
    """Read a whitespace-separated word-vector text file into a dict.

    Each non-empty line is expected to look like ``<word> <v1> <v2> ... <vN>``.

    Args:
        embedding_file: Path to the UTF-8 encoded vector file.
        embedding_dim: Optional number of vector components per line. When
            given, the LAST ``embedding_dim`` fields are parsed as the vector
            and everything before them is joined with single spaces to form
            the word, so entries whose word itself contains spaces still load.
            When ``None`` (default) the first field is the word and all
            remaining fields are the vector.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.

    Raises:
        ValueError: If a field expected to be numeric cannot be parsed.
    """
    embedding_dict = {}
    with open(embedding_file, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank lines (e.g. a trailing newline) and lines holding a lone
            # token carry no vector; skip them instead of failing on values[0]
            # or storing an empty array.
            if len(values) < 2:
                continue
            if embedding_dim is None:
                split_at = 1
            else:
                split_at = len(values) - embedding_dim
                if split_at < 1:
                    # Too few fields to hold a word plus a full vector.
                    continue
            word = ' '.join(values[:split_at])
            embedding_dict[word] = np.array(values[split_at:], dtype='float32')
    return embedding_dict

# Specify the path to your GloVe file
# NOTE(review): hard-coded absolute Kaggle dataset path; adjust when running elsewhere.
embedding_file = '/kaggle/input/addedd/glove.42B.300d.txt'
# Loads every vector in the file into an in-memory dict keyed by word.
embedding_dict = load_embeddings(embedding_file)


In [None]:
# Applying GloVE representations on our corpus

# One 300-d row per word index; rows stay all-zero for words without a GloVe vector.
embedding_matrix = np.zeros((num_words, 300))

for token, row in tokenizer.word_index.items():
    if row >= num_words:
        # index falls outside the table
        continue
    glove_vector = embedding_dict.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

embedding_matrix.shape

## LSTM

In [None]:
# Setting up the model

from keras import layers
from keras.models import Sequential

def setup_lstm_model(max_len, n_latent_factors):
    """Build the (uncompiled) embedding -> LSTM -> pooling -> sigmoid classifier.

    Reads the module-level ``num_words`` and ``embedding_matrix`` to create a
    frozen, pre-initialised embedding layer.

    Args:
        max_len: Input sequence length; also used as the LSTM unit count.
        n_latent_factors: Width of each embedding vector.

    Returns:
        A keras ``Sequential`` model with five sigmoid outputs.
    """
    frozen_embedding = layers.Embedding(
        num_words,
        n_latent_factors,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )

    return Sequential([
        frozen_embedding,
        layers.LSTM(units=max_len, return_sequences=True),
        layers.GlobalAveragePooling1D(),
        layers.Dense(units=5, activation='sigmoid'),
    ])

In [None]:
# Instantiate the model with 300-wide embeddings (the embedding_matrix is built
# with 300 columns) and print its layer summary.
lstm_model = setup_lstm_model(max_len = max_len, n_latent_factors = 300)
lstm_model.summary()

In [None]:
# Final hyperparameter configurations
batch_size = 128
epochs = 10

# Independent sigmoid outputs -> binary cross-entropy, optimised with Adam.
lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train, reporting validation metrics on the held-out split after every epoch.
lstm_model.fit(
    X_train_seq,
    y_train,
    validation_data=(X_val_seq, y_val),
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# Per-epoch metric values recorded by the fit call above.
lstm_model.history.history

In [None]:
from sklearn.metrics import accuracy_score

# Predicted probabilities for the validation split, thresholded at 0.5 per label.
y_pred = lstm_model.predict(X_val_seq)
y_pred_binary = np.where(y_pred >= 0.5, 1, 0)

# NOTE(review): with 2-D multi-label inputs sklearn's accuracy_score should be
# the exact-match (subset) accuracy -- confirm this is the metric intended.
accuracy = accuracy_score(y_val, y_pred_binary)
print(accuracy)

In [None]:
from sklearn.metrics import precision_score, recall_score

# y_val and y_pred_binary are binary indicator matrices (one column per label).
# average=None yields one score per label column instead of a single aggregate.
precision = precision_score(y_val, y_pred_binary, average=None)
recall = recall_score(y_val, y_pred_binary, average=None)

# Report precision and recall side by side for each label index
for label, (label_precision, label_recall) in enumerate(zip(precision, recall)):
    print(f'Label {label} - Precision: {label_precision:.4f}, Recall: {label_recall:.4f}')


In [None]:
from sklearn.metrics import f1_score

# One F1 score per label column (average=None disables aggregation).
f1 = f1_score(y_val, y_pred_binary, average=None)

# Report the score for each label index
for label, label_f1 in enumerate(f1):
    print(f'Label {label} - F1-Score: {label_f1:.4f}')


In [None]:
testing = pd.read_csv("/kaggle/input/testdataset/testing.csv")
import numpy as np

# Clean every comment with the same function used for the training text.
# Missing comments are treated as empty strings so cleaning cannot fail on NaN.
cleaned_test_comments = [
    clean_text(comment) for comment in testing['comment_text'].fillna('').astype(str)
]

# Encode and pad exactly as in training: truncating='post', padding='post'.
# pad_sequences defaults to 'pre' for both, which would place the tokens at
# different positions than the model saw while it was being fitted.
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_test_comments),
    maxlen=max_len,
    truncating='post',
    padding='post',
)

# Score all comments in one batched predict call instead of one call per comment.
batch_scores = lstm_model.predict(test_sequences)

# Keep the (num_comments, 1, num_labels) layout that the per-comment loop used
# to produce, so downstream cells that unpack a 3-D shape keep working.
predicted_toxicity_scores = batch_scores[:, np.newaxis, :]


In [None]:
# The stacked predictions are 3-D (comment, prediction, label); drop the middle
# axis so there is one row per comment and one column per label.
num_samples, num_predictions, num_labels = predicted_toxicity_scores.shape
predicted_scores_2d = predicted_toxicity_scores.reshape((num_samples, num_labels))

# Label the columns in the same order the model was trained on
column_names = [
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
]
predicted_scores_df = pd.DataFrame(predicted_scores_2d, columns=column_names)

In [None]:
# Display the per-label predicted scores, one row per test comment.
predicted_scores_df