# Step 1: Load Dataset


In [None]:
import zipfile
import os
import pandas as pd

# Location of the dataset archive and the directory it is unpacked into.
zip_file_path = '/content/drive/My Drive/Deep learning/persian-tweets-emotional-dataset.zip'
extract_dir = '/content/persian_tweets_dataset/'

# Fail early with an actionable message instead of a bare "No such file" from zipfile.
if not os.path.isfile(zip_file_path):
    raise FileNotFoundError(
        f"Dataset archive not found at {zip_file_path!r}. "
        "If this is a Google Colab runtime, mount Drive first "
        "(from google.colab import drive; drive.mount('/content/drive')), "
        "or point zip_file_path at the archive."
    )

# Unpack every member of the archive into extract_dir.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# One CSV per emotion; the file names are derived from the labels so the two
# lists cannot get out of sync.
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sad', 'surprise']
emotion_files = [f'{label}.csv' for label in emotion_labels]

# Read each CSV, tag its rows with the emotion it belongs to, then stack them.
df_list = []
for file, label in zip(emotion_files, emotion_labels):
    temp_df = pd.read_csv(os.path.join(extract_dir, file))
    print(f"Columns in {file}: {temp_df.columns.tolist()}")
    temp_df['emotion'] = label
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)
print("Combined DataFrame Columns:", df.columns.tolist())
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Deep learning/persian-tweets-emotional-dataset.zip'

# Step 2: Display a Bar Graph of Class Distribution


In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows carry each emotion label.
class_counts = df['emotion'].value_counts()
ax = class_counts.plot.bar()
ax.set_xlabel('Emotion')
ax.set_ylabel('Number of examples')
ax.set_title('Class distribution in Persian Tweets Emotional Dataset')
plt.show()


# Step 3: Text Preprocessing


In [None]:
!pip install hazm


In [None]:
import re
from hazm import Normalizer, word_tokenize, Stemmer

# Shared hazm helpers, created once and reused by preprocess_text for every row.
stemmer = Stemmer()        # reduces each token to its stem
normalizer = Normalizer()  # normalizes the raw text before any other cleaning

def preprocess_text(text):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps, in order: normalize with the module-level ``normalizer``; strip HTML
    tags and URLs; collapse runs of a repeated character to a single character;
    strip emoticon emojis; tokenize with ``word_tokenize``; drop a small example
    stop-word list; stem each remaining token with the module-level ``stemmer``.

    Args:
        text: The raw tweet. Any value that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        str: The cleaned text, or ``''`` for non-string input.
    """
    # Missing cells arrive as non-strings and would crash the normalizer.
    if not isinstance(text, str):
        return ''

    # Step 1: Normalize text
    text = normalizer.normalize(text)

    # Step 2: Remove HTML tags and URLs
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Step 3: Collapse runs of a repeated character to one character.
    # NOTE(review): this also collapses legitimate double letters inside words;
    # consider r'(.)\1{2,}' if only elongated spellings should be shortened.
    text = re.sub(r'(.)\1+', r'\1', text)

    # Step 4: Remove emojis. This has to happen BEFORE tokenization: the returned
    # string is rebuilt from the tokens, so stripping emojis from `text` after the
    # tokens exist has no effect on the result.
    # The range below covers only the Unicode Emoticons block (U+1F600-U+1F64F).
    text = re.sub(r'[\U0001F600-\U0001F64F]', '', text)

    # Step 5: Tokenize text
    tokens = word_tokenize(text)

    # Step 6: Remove stop words (example list; extend with a full Persian list if needed)
    stop_words = {'این', 'آن', 'و', 'در', 'به'}
    tokens = [word for word in tokens if word not in stop_words]

    # Step 7: Stem every remaining token and join back into one string
    return ' '.join(stemmer.stem(word) for word in tokens)

# List the available columns so the text column can be identified.
column_names = df.columns.tolist()
print("DataFrame Columns:", column_names)

# Name of the column that holds the tweet text; change it if the dataset uses another name.
text_column_name = 'tweet'

# Run every row of the text column through preprocess_text and store the result
# back in the same column.
cleaned_text = df[text_column_name].apply(preprocess_text)
df[text_column_name] = cleaned_text

preview = df.head()
print(preview)


## Step 4: Convert Text to Numeric Format using BOW and ParsBERT



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, TFAutoModel

# Bag of Words: token counts over (at most) the 5000 highest-frequency terms,
# materialized as a dense array.
vectorizer = CountVectorizer(max_features=5000)
bow_sparse = vectorizer.fit_transform(df['tweet'])
X_bow = bow_sparse.toarray()

# ParsBERT: tokenizer and TensorFlow model loaded from the same checkpoint.
parsbert_checkpoint = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(parsbert_checkpoint)
parsbert = TFAutoModel.from_pretrained(parsbert_checkpoint)

def encode_texts(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are enabled and the result is requested as
    TensorFlow tensors.

    Args:
        texts: The texts to tokenize, passed to the tokenizer unchanged.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    options = {'padding': True, 'truncation': True, 'return_tensors': 'tf'}
    encoded = tokenizer(texts, **options)
    return encoded

# Tokenize every preprocessed tweet in the 'tweet' column in a single call.
tweet_list = df['tweet'].tolist()
X_parsbert = encode_texts(tweet_list)

# Report the sizes of both representations.
print("BOW shape:", X_bow.shape)
print("ParsBERT input shape:", X_parsbert['input_ids'].shape)


## Step 5: Obtain Embedding Vectors


In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

import numpy as np

# `tokenizer`, `parsbert` and `encode_texts` from Step 4 are reused here rather
# than loading the checkpoint and defining the helper a second time.

# Number of tweets sent through ParsBERT per forward pass. Feeding the whole
# dataset in one call needs memory proportional to the dataset size; lower this
# value if the runtime still runs out of memory.
EMBEDDING_BATCH_SIZE = 32


def masked_mean_pool(hidden_states, attention_mask):
    """Average token vectors over the sequence axis, ignoring padded positions.

    Args:
        hidden_states: Tensor indexed as (batch, token, feature).
        attention_mask: Tensor indexed as (batch, token); non-zero marks a real token.

    Returns:
        Tensor indexed as (batch, feature): the mean of the unmasked token vectors.
    """
    mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), hidden_states.dtype)
    token_sum = tf.reduce_sum(hidden_states * mask, axis=1)
    # Guard against division by zero for a row whose mask is all zeros.
    token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
    return token_sum / token_count


# Embed the 'tweet' column batch by batch. Each batch is tokenized on its own,
# so it is padded only to the longest tweet in that batch.
tweet_texts = df['tweet'].tolist()
embedding_batches = []
for batch_start in range(0, len(tweet_texts), EMBEDDING_BATCH_SIZE):
    batch_encoding = encode_texts(tweet_texts[batch_start:batch_start + EMBEDDING_BATCH_SIZE])
    batch_hidden = parsbert(
        batch_encoding['input_ids'],
        attention_mask=batch_encoding['attention_mask'],
    ).last_hidden_state
    # A plain mean over axis 1 would mix padding vectors into the average.
    batch_embeddings = masked_mean_pool(batch_hidden, batch_encoding['attention_mask'])
    embedding_batches.append(batch_embeddings.numpy())

embeddings = np.concatenate(embedding_batches, axis=0)

print("Embedding shape:", embeddings.shape)

# The default dimension of ParsBERT embeddings
default_dimension = embeddings.shape[1]
print("Default embedding dimension:", default_dimension)

# Explanation of embedding vectors and relationships
print("Embedding vectors represent words as vectors in a high-dimensional space where semantic relationships are captured.")
print("Words with similar meanings or usage patterns will have vectors that are close to each other in this space.")


## Step 6: Building the Model with Train/Test Split


In [None]:
from sklearn.model_selection import train_test_split

# Encode the string emotion labels as integer class ids (their position in
# `emotion_labels`). The models in this notebook are compiled with
# sparse_categorical_crossentropy and their predictions are taken with argmax,
# so the targets have to be integer ids rather than label strings.
label_to_id = {label: idx for idx, label in enumerate(emotion_labels)}
y_encoded = df['emotion'].map(label_to_id)
if y_encoded.isna().any():
    raise ValueError("df['emotion'] contains labels that are not listed in emotion_labels")
y_encoded = y_encoded.astype('int64')

# Split data into training (70%) and testing (30%) sets. BOW features, ParsBERT
# embeddings and labels go through one call so all three share the same rows.
(X_train_bow, X_test_bow,
 X_train_parsbert, X_test_parsbert,
 y_train, y_test) = train_test_split(
    X_bow, embeddings, y_encoded, test_size=0.3, random_state=42
)
y_test_parsbert = y_test

# Further split the training portion into training (80%) and validation (20%) sets.
(X_train_bow, X_val_bow,
 X_train_parsbert, X_val_parsbert,
 y_train, y_val) = train_test_split(
    X_train_bow, X_train_parsbert, y_train, test_size=0.2, random_state=42
)
y_train_parsbert, y_val_parsbert = y_train, y_val

print("Training set size (BOW):", X_train_bow.shape)
print("Validation set size (BOW):", X_val_bow.shape)
print("Test set size (BOW):", X_test_bow.shape)

print("Training set size (ParsBERT):", X_train_parsbert.shape)
print("Validation set size (ParsBERT):", X_val_parsbert.shape)
print("Test set size (ParsBERT):", X_test_parsbert.shape)


## Step 7: Create and Train Models


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dense

def create_cnn_lstm_model(input_shape):
    """Build and compile a Conv1D -> MaxPooling1D -> LSTM -> softmax classifier.

    Args:
        input_shape: Per-sample input shape handed to ``Input`` (batch axis excluded).

    Returns:
        A compiled ``Model`` with one output unit per entry of ``emotion_labels``,
        using the Adam optimizer, sparse categorical cross-entropy loss and the
        accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Convolution, then pooling, then a recurrent layer over the pooled sequence.
    features = Conv1D(filters=128, kernel_size=5, activation='relu')(inputs)
    features = MaxPooling1D(pool_size=2)(features)
    features = LSTM(128)(features)

    class_count = len(emotion_labels)
    outputs = Dense(class_count, activation='softmax')(features)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Build the CNN-LSTM: each embedding is fed as a sequence with one feature per step.
cnn_lstm_input_shape = (X_train_parsbert.shape[1], 1)
cnn_lstm_model = create_cnn_lstm_model(cnn_lstm_input_shape)
cnn_lstm_model.summary()

# Train on the ParsBERT training split and monitor the validation split.
history = cnn_lstm_model.fit(
    x=X_train_parsbert,
    y=y_train_parsbert,
    batch_size=32,  # adjust the batch size as needed
    epochs=10,  # adjust the number of epochs as needed
    validation_data=(X_val_parsbert, y_val_parsbert),
)

# Score the trained model on the held-out test split.
cnn_lstm_model.evaluate(X_test_parsbert, y_test_parsbert)


## Simple CNN and LSTM Models


In [None]:
# Simple CNN model
def create_cnn_model(input_shape):
    """Build and compile a Conv1D -> MaxPooling1D -> Flatten -> softmax classifier.

    Args:
        input_shape: Per-sample input shape handed to ``Input`` (batch axis excluded).

    Returns:
        A compiled ``Model`` with one output unit per entry of ``emotion_labels``,
        using the Adam optimizer, sparse categorical cross-entropy loss and the
        accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Convolution and pooling, flattened into a single vector for the classifier head.
    features = Conv1D(filters=128, kernel_size=5, activation='relu')(inputs)
    features = MaxPooling1D(pool_size=2)(features)
    features = tf.keras.layers.Flatten()(features)

    class_count = len(emotion_labels)
    outputs = Dense(class_count, activation='softmax')(features)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Build the CNN with the same input layout as the other models.
cnn_input_shape = (X_train_parsbert.shape[1], 1)
cnn_model = create_cnn_model(cnn_input_shape)
cnn_model.summary()

# Train on the ParsBERT training split and monitor the validation split.
cnn_history = cnn_model.fit(
    x=X_train_parsbert,
    y=y_train_parsbert,
    batch_size=32,  # adjust the batch size as needed
    epochs=10,  # adjust the number of epochs as needed
    validation_data=(X_val_parsbert, y_val_parsbert),
)

# Score the trained model on the held-out test split.
cnn_model.evaluate(X_test_parsbert, y_test_parsbert)

# Simple LSTM model
def create_lstm_model(input_shape):
    """Build and compile an LSTM -> softmax classifier.

    Args:
        input_shape: Per-sample input shape handed to ``Input`` (batch axis excluded).

    Returns:
        A compiled ``Model`` with one output unit per entry of ``emotion_labels``,
        using the Adam optimizer, sparse categorical cross-entropy loss and the
        accuracy metric.
    """
    inputs = Input(shape=input_shape)
    recurrent_features = LSTM(128)(inputs)

    class_count = len(emotion_labels)
    outputs = Dense(class_count, activation='softmax')(recurrent_features)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Build the LSTM with the same input layout as the other models.
lstm_input_shape = (X_train_parsbert.shape[1], 1)
lstm_model = create_lstm_model(lstm_input_shape)
lstm_model.summary()

# Train on the ParsBERT training split and monitor the validation split.
lstm_history = lstm_model.fit(
    x=X_train_parsbert,
    y=y_train_parsbert,
    batch_size=32,  # adjust the batch size as needed
    epochs=10,  # adjust the number of epochs as needed
    validation_data=(X_val_parsbert, y_val_parsbert),
)

# Score the trained model on the held-out test split.
lstm_model.evaluate(X_test_parsbert, y_test_parsbert)


## Step 8: Evaluate and Compare Models


In [None]:
from sklearn.metrics import classification_report

# Class probabilities from each model on the test split; the predicted class is
# the index of the largest probability in each row.
cnn_lstm_probabilities = cnn_lstm_model.predict(X_test_parsbert)
y_pred_cnn_lstm = cnn_lstm_probabilities.argmax(axis=1)

cnn_probabilities = cnn_model.predict(X_test_parsbert)
y_pred_cnn = cnn_probabilities.argmax(axis=1)

lstm_probabilities = lstm_model.predict(X_test_parsbert)
y_pred_lstm = lstm_probabilities.argmax(axis=1)

# Per-class precision / recall / F1 report for every model, in the same order.
reports_to_print = (
    ("CNN-LSTM Model Report:", y_pred_cnn_lstm),
    ("CNN Model Report:", y_pred_cnn),
    ("LSTM Model Report:", y_pred_lstm),
)
for report_title, predicted_classes in reports_to_print:
    print(report_title)
    print(classification_report(y_test_parsbert, predicted_classes, target_names=emotion_labels))

# Weighted, micro, and macro averaging
def evaluate_model(y_true, y_pred):
    """Return the weighted, micro and macro average rows of a classification report.

    Args:
        y_true: Ground-truth labels, passed to ``classification_report``.
        y_pred: Predicted labels, passed to ``classification_report``.

    Returns:
        tuple: ``(weighted_avg, micro_avg, macro_avg)``, each a dict with the keys
        ``'precision'``, ``'recall'``, ``'f1-score'`` and ``'support'``.
    """
    report = classification_report(y_true, y_pred, target_names=emotion_labels, output_dict=True)
    weighted_avg = report['weighted avg']
    macro_avg = report['macro avg']

    # classification_report only emits a 'micro avg' row when it differs from
    # accuracy; for ordinary single-label multiclass input it emits 'accuracy'
    # instead, and indexing 'micro avg' directly raises KeyError. In that case
    # micro precision, recall and F1 all equal the accuracy.
    micro_avg = report.get('micro avg')
    if micro_avg is None:
        accuracy = report['accuracy']
        micro_avg = {
            'precision': accuracy,
            'recall': accuracy,
            'f1-score': accuracy,
            'support': macro_avg['support'],
        }

    return weighted_avg, micro_avg, macro_avg

# Collect the (weighted, micro, macro) averages for each model on the test split.
cnn_lstm_averages = evaluate_model(y_test_parsbert, y_pred_cnn_lstm)
weighted_avg_cnn_lstm, micro_avg_cnn_lstm, macro_avg_cnn_lstm = cnn_lstm_averages

cnn_averages = evaluate_model(y_test_parsbert, y_pred_cnn)
weighted_avg_cnn, micro_avg_cnn, macro_avg_cnn = cnn_averages

lstm_averages = evaluate_model(y_test_parsbert, y_pred_lstm)
weighted_avg_lstm, micro_avg_lstm, macro_avg_lstm = lstm_averages

# Print the three averages for every model under its own heading.
averages_to_print = (
    ("CNN-LSTM Model Averages:", cnn_lstm_averages),
    ("CNN Model Averages:", cnn_averages),
    ("LSTM Model Averages:", lstm_averages),
)
for heading, (weighted_avg, micro_avg, macro_avg) in averages_to_print:
    print(heading)
    print("Weighted Average:", weighted_avg)
    print("Micro Average:", micro_avg)
    print("Macro Average:", macro_avg)
