<a href="https://colab.research.google.com/github/gokul-sunil50/Multi-Task-Learning/blob/main/Multi_Task_Learning__with_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Detect Emotions, Violence, Hate from Text**



**Emotion**
0:Sadness,
1:Joy,
2:Love,
3:Anger,
4:Fear,
5:Surprise

**Violence**
0:Harmful_Traditional_practice,
1:Physical_violence,
2:Economic_violence,
3:Emotional_violence,
4:Sexual_violence

**Hate**
0:Hate speech,
1:Offensive Speech,
2:Neither





**1. Loading the Data**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


In [None]:
emotion_df = pd.read_csv('emotion.csv')
emotion_df.head()


In [None]:
violence_df = pd.read_csv('violence.csv')
violence_df.head()

In [None]:
hate_df=pd.read_csv('hate.csv')
hate_df.head()

**2. Data Pre-processing**

In [None]:
#dropping unwanted columns
emotion_df.drop(columns = ['Unnamed: 0'], inplace=True)
violence_df.drop(columns = ['Tweet_ID'], inplace=True)
hate_df=hate_df[['tweet','class']]


In [None]:
emotion_df.head()

In [None]:
violence_df.head()

In [None]:
hate_df.head()


In [None]:
emotion_df.columns,violence_df.columns,hate_df.columns

In [None]:
#renaming the column
violence_df.rename(columns = {'tweet':'text','type':'label'}, inplace = True)
hate_df.rename(columns = {'tweet':'text','class':'label'}, inplace= True)

In [None]:
emotion_df.columns,violence_df.columns,hate_df.columns

In [None]:
#checking for null values
emotion_df.isna().sum(),violence_df.isna().sum(),hate_df.isna().sum()

In [None]:
emotion_df.shape,violence_df.shape,hate_df.shape

Extracting 12 thousand rows from each dataset


In [None]:
emotion_df['label'].value_counts()

In [None]:
# Build a class-balanced emotion frame: draw a reproducible sample of 2000
# rows for each of the six emotion labels (0-5) and stack the samples with a
# single concat. Collecting the pieces first avoids re-copying the growing
# frame on every iteration and avoids concatenating onto an empty DataFrame.
per_class_samples = [
  emotion_df[emotion_df['label'] == class_id].sample(n=2000, random_state=42)
  for class_id in range(6)
]
e_df = pd.concat(per_class_samples)

In [None]:
e_df.shape

In [None]:
emotion_df=e_df.copy()

In [None]:
emotion_df['label'].value_counts()

In [None]:
violence_df['label'].value_counts()

In [None]:
#we just want 4998 rows from the sexual_violence label
sexual_violence=violence_df[violence_df['label']=='sexual_violence'].sample(n=4998,random_state=42)
violence_df=violence_df[violence_df['label']!='sexual_violence']

In [None]:
violence_df.shape

In [None]:
violence_df=pd.concat([sexual_violence,violence_df],axis = 0)

In [None]:
violence_df.shape

In [None]:
hate_df['label'].value_counts()

In [None]:
offensive_speech = hate_df[hate_df['label']==1].sample(n=6407,random_state=42)
hate_df=hate_df[hate_df['label']!=1]

In [None]:
hate_df.shape

In [None]:
hate_df=pd.concat([offensive_speech,hate_df],axis = 0)

In [None]:
hate_df.shape

In [None]:
emotion_df.shape,violence_df.shape,hate_df.shape

In [None]:
emotion_df.head(3)

In [None]:
violence_df.head(3)

In [None]:
hate_df.head(3)

In [None]:
#resetting the indexes
emotion_df.reset_index(drop = True, inplace=True)
violence_df.reset_index(drop=True,inplace=True)
hate_df.reset_index(drop=True,inplace=True)


In [None]:
emotion_df.head(3)

In [None]:
violence_df.head(3)

In [None]:
hate_df.head(3)

**3.Label Encoding**


In [None]:
# Encode the string/int labels of each task as integer class ids.
# One encoder is kept per task so the fitted id -> class-name mapping stays
# available afterwards through `<encoder>.classes_` (LabelEncoder numbers the
# classes in sorted order of their unique values).
violence_label_encoder = LabelEncoder()
violence_df['label'] = violence_label_encoder.fit_transform(violence_df['label'])

hate_label_encoder = LabelEncoder()
hate_df['label'] = hate_label_encoder.fit_transform(hate_df['label'])

# Backward-compatible alias: as before, `label_encoder` ends up fitted on the
# hate labels.
label_encoder = hate_label_encoder


In [None]:
violence_df.head()

In [None]:
violence_df['label'].unique()

In [None]:
hate_df.head()

In [None]:
hate_df['label'].unique()

**4.Stopwords Removal**

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
#loading the stopwords
stop_words = set(stopwords.words('english'))

In [None]:
len(stop_words)

In [None]:
#stopwords removal function
def remove_stopwords(text):
  """Return `text` with every stop word dropped.

  The text is split with `nltk.word_tokenize`; a token is discarded when its
  lower-cased form is a member of the module-level `stop_words` set. The
  surviving tokens keep their original casing and are joined with one space.
  """
  kept_tokens = []
  for token in nltk.word_tokenize(text):
    if token.lower() in stop_words:
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)
emotion_df['text'] = emotion_df['text'].apply(remove_stopwords)
violence_df['text'] = violence_df['text'].apply(remove_stopwords)
hate_df['text'] = hate_df['text'].apply(remove_stopwords)


In [None]:
emotion_df.head(3)

In [None]:
violence_df.head(3)

In [None]:
hate_df.head(3)

**5.Tokenization & Padding**

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([emotion_df['text'],violence_df['text'],hate_df['text']]))

In [None]:
# Number of rows kept per task.
# NOTE(review): the original note says this is based on the size of
# violence_df - confirm against violence_df.shape.
sample_size = 5593

# Draw the same number of rows (same seed) from each task's frame so the three
# datasets end up with equal length before being fed to the model together.
emotion_df, violence_df, hate_df = (
  frame.sample(n=sample_size, random_state=42)
  for frame in (emotion_df, violence_df, hate_df)
)

In [None]:
emotion_sequences = tokenizer.texts_to_sequences(emotion_df['text'])
violence_sequences = tokenizer.texts_to_sequences(violence_df['text'])
hate_sequences = tokenizer.texts_to_sequences(hate_df['text'])

In [None]:
emotion_df['text'].iloc[2]

In [None]:
emotion_sequences[2:3]

In [None]:
max_length = 50
emotion_padded = pad_sequences(emotion_sequences, maxlen= max_length, padding='post')
violence_padded = pad_sequences(violence_sequences, maxlen=max_length, padding='post')
hate_padded = pad_sequences(hate_sequences, maxlen= max_length, padding= 'post')

In [None]:
emotion_padded[2:3]

In [None]:
#generating labels in numpy array format
emotion_labels= np.array(emotion_df['label'])
violence_labels= np.array(violence_df['label'])
hate_labels= np.array(hate_df['label'])

**6.Model Definition**

In [None]:
#prepare separate inputs for each dataset
emotion_input = emotion_padded
violence_input = violence_padded
hate_input = hate_padded

In [None]:
#defining multiple input layers for each task
emotion_input_layer = keras.layers.Input(shape = (max_length,), name = 'emotion_input')
violence_input_layer = keras.layers.Input(shape = (max_length,), name = 'violence_input')
hate_input_layer = keras.layers.Input(shape = (max_length,), name = 'hate_input')

In [None]:
#use as Shared embedding layer
embedding_layer = keras.layers.Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim =128)

In [None]:
#APPLY THE EMBEDDING LAYER TO EACH INPUT
emotion_embedding = embedding_layer(emotion_input_layer)
violence_embedding = embedding_layer(violence_input_layer)
hate_embedding = embedding_layer(hate_input_layer)

In [None]:
#shared LSTM layer
shared_lstm = keras.layers.LSTM(64, return_sequences=True)

In [None]:
emotion_lstm = shared_lstm(emotion_embedding)
violence_lstm = shared_lstm(violence_embedding)
hate_lstm = shared_lstm(hate_embedding)


In [None]:
#shared global average pooling layer and dropout layer
shared_pooling = keras.layers.GlobalAveragePooling1D()
shared_dropout = keras.layers.Dropout(0.5)


In [None]:
emotion_features =shared_dropout(shared_pooling(emotion_lstm))
violence_features =shared_dropout(shared_pooling(violence_lstm))
hate_features =shared_dropout(shared_pooling(hate_lstm))

In [None]:
len(emotion_df['label'].unique()),len(violence_df['label'].unique()),len(hate_df['label'].unique())

In [None]:
#output layers
emotion_output = keras.layers.Dense(6, activation = 'softmax', name='emotion_output')(emotion_features)
violence_output = keras.layers.Dense(5, activation = 'softmax', name= 'violence_output')(violence_features)
hate_output = keras.layers.Dense(3, activation = 'softmax', name= 'hate_output')(hate_features)

In [None]:
#build the three-input / three-output model and compile it
task_inputs = [emotion_input_layer, violence_input_layer, hate_input_layer]
task_outputs = [emotion_output, violence_output, hate_output]
model = keras.models.Model(inputs=task_inputs, outputs=task_outputs)

# Every output head is an integer-labelled softmax classifier, so all heads
# share the same loss and metric; both are keyed by the output layer's name.
output_names = ('emotion_output', 'violence_output', 'hate_output')
model.compile(
    optimizer='adam',
    loss={name: 'sparse_categorical_crossentropy' for name in output_names},
    metrics={name: 'accuracy' for name in output_names},
)


In [None]:
model.summary()

In [None]:
#training the model with separate inputs
# The dict keys are the names given to the Input layers ('*_input') and to the
# Dense output layers ('*_output'), so each padded array and each label array
# is routed by name to its own task.
model.fit(x = {'emotion_input' : emotion_input,
               'violence_input' : violence_input,
               'hate_input' : hate_input},
          y = {'emotion_output' : emotion_labels,
               'violence_output' : violence_labels,
               'hate_output' : hate_labels},
          epochs = 10,
          batch_size =4)

**7.Prediction and Evaluation**

In [None]:
prediction = model.predict({'emotion_input': emotion_input,
                            'violence_input': violence_input,
                            'hate_input': hate_input})

In [None]:
prediction

In [None]:
emotion_pred = np.argmax(prediction[0],axis = 1)
violence_pred = np.argmax(prediction[1],axis = 1)
hate_pred = np.argmax(prediction[2], axis =1)

In [None]:
violence_df['label'].unique()

In [None]:
def plot_cm(true, pred, title, labels):
  """Draw a row-normalised confusion matrix as an annotated heatmap.

  true   -- actual class ids
  pred   -- predicted class ids
  title  -- text shown above the plot
  labels -- class names used as tick labels on both axes
  """
  # normalize='true' scales each row (actual class) to sum to 1.
  matrix = confusion_matrix(true, pred, normalize='true')
  _, ax = plt.subplots(figsize=(7, 6))
  sns.heatmap(matrix, annot=True, cmap='Blues',
              xticklabels=labels, yticklabels=labels, ax=ax)
  ax.set_title(title)
  ax.set_ylabel('Actual')
  ax.set_xlabel('Predicted')

# Class names for the confusion-matrix ticks. Position i must be the name of
# encoded class id i, so the lists follow the class legend at the top of the
# notebook (violence: 0 harmful traditional practice, 1 physical, 2 economic,
# 3 emotional, 4 sexual; hate: 0 hate speech, 1 offensive speech, 2 neither).
emotion_labels_text = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
violence_labels_text = ['Harmful_traditional_practice', 'physical_violence', 'economic_violence', 'emotional_violence', 'sexual_violence']
hate_labels_text = ['Hate speech', 'offensive speech', 'Neither']


In [None]:
plot_cm(emotion_labels, emotion_pred, 'Confusion Matrix for Emotion', emotion_labels_text)
plot_cm(violence_labels, violence_pred, 'Confusion Matrix for Violence', violence_labels_text)
plot_cm(hate_labels, hate_pred, 'Confusion Matrix for Hate', hate_labels_text)


**8.Manual Testing**

In [None]:
def classify_text(input_text):
    """Classify one piece of text and return ``(major_label, sub_label)``.

    The text goes through the same preprocessing as the training data
    (stop-word removal, tokenization, post-padding to ``max_length``) and is
    fed to all three model inputs.

    Returns:
        major_label: 'Emotion', 'Violence' or 'Hate' - the task whose output
            head produced the highest top-class probability.
        sub_label: the class name predicted by that task's head.
    """
    #preprocess the input text
    input_text_cleaned = remove_stopwords(input_text)
    input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
    input_padded = pad_sequences(input_sequence, maxlen=max_length, padding='post')

    #prediction - the model has one input per task, so the same padded text
    #is passed to each of them
    predictions = model.predict({'emotion_input': input_padded,
                                 'violence_input': input_padded,
                                 'hate_input': input_padded})

    # Predicted class id of each head for the single input row
    emotion_pred_index = np.argmax(predictions[0], axis=1)[0]
    violence_pred_index = np.argmax(predictions[1], axis=1)[0]
    hate_pred_index = np.argmax(predictions[2], axis=1)[0]

    #determine major label: the head with the largest top-class probability
    major_labels = ['Emotion', 'Violence', 'Hate']
    major_label_index = np.argmax([np.max(predictions[0]), np.max(predictions[1]), np.max(predictions[2])])
    major_label = major_labels[major_label_index]

    #determining sub-labels
    # Position i must be the name of encoded class id i, so the lists follow
    # the class legend at the top of the notebook (violence: 0 harmful
    # traditional practice, 1 physical, 2 economic, 3 emotional, 4 sexual;
    # hate: 0 hate speech, 1 offensive speech, 2 neither).
    emotion_labels_text = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    violence_labels_text = ['Harmful_traditional_practice', 'physical_violence', 'economic_violence', 'emotional_violence', 'sexual_violence']
    hate_labels_text = ['Hate speech', 'offensive speech', 'Neither']

    sub_label_by_task = {
        'Emotion': emotion_labels_text[emotion_pred_index],
        'Violence': violence_labels_text[violence_pred_index],
        'Hate': hate_labels_text[hate_pred_index],
    }

    return major_label, sub_label_by_task[major_label]

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
#define a text widget and a placeholder
input_text_widget =widgets.Text(
    description = 'Input text : ',
    placeholder = 'Enter your text'
)


In [None]:
#define classify button
button = widgets.Button(description = 'Classify')


In [None]:
#define an output area to display result
output = widgets.Output()

In [None]:
#function to handle event
def on_button_click(b):
  """Click handler: classify the text currently in the input widget.

  `b` is the argument supplied by the button callback and is not used. The
  previous result is cleared and the new major/sub label are printed inside
  the `output` widget.
  """
  with output:
    output.clear_output()  #drop the result of the previous click
    major, sub = classify_text(input_text_widget.value)
    print(f'Major Label: {major}')
    print(f'Sub Label: {sub}')

#register the click handler on the button
button.on_click(on_button_click)


In [None]:
display(input_text_widget, button, output)