In [None]:
# Environment setup: upgrade the PyTorch stack, Hugging Face transformers and
# the Keras/scikeras tooling used by the rest of the notebook.
# NOTE(review): every package except TensorFlow is unpinned, so a fresh run may
# pull different versions than the ones this notebook was developed against.
!pip install --upgrade torch
!pip install --upgrade torchaudio
!pip install --upgrade torchdata
!pip install --upgrade torchtext
!pip install --upgrade torchvision
!pip install --upgrade torch-xla
!pip install --upgrade transformers
!pip install --upgrade scikeras
# TensorFlow is pinned to 2.15.1 below; the unpinned upgrade is kept disabled.
# !pip install --upgrade tensorflow
!pip install tensorflow==2.15.1
!pip install keras-core
# NOTE(review): scikeras is upgraded a second time here, after TensorFlow and
# keras-core have been installed.
!pip install --upgrade scikeras

In [None]:
# Install/upgrade NLTK and print the installed package metadata.
!pip install nltk
!pip install --upgrade nltk
!pip show nltk

import nltk

# Download the NLTK resources for this notebook.
# The 'words' corpus backs the English-vocabulary filter defined later;
# 'punkt' / 'punkt_tab' are presumably what word_tokenize needs (confirm
# against the installed NLTK version). The remaining resources are not
# referenced by the cells visible in this notebook.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('words')

In [None]:
# Shen Dataset
import glob
import json
import os

import pandas as pd


NEGATIVE_TWEET_DIR = '/kaggle/input/mddl-shennn/Dataset/labeled/negative/data/tweet/'
POSITIVE_TWEET_DIR = '/kaggle/input/mddl-shennn/Dataset/labeled/positive/data/tweet/'


def load_labeled_tweets(directory_path, label):
    """Read every ``*.json`` tweet file in a directory into a labelled frame.

    Args:
        directory_path: Directory holding one JSON document per tweet; each
            document must expose a ``'text'`` key.
        label: Class label assigned to every tweet read from the directory.

    Returns:
        A DataFrame with a ``text`` and a ``label`` column, one row per file.

    Raises:
        KeyError: If a JSON document has no ``'text'`` key.
    """
    # glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
    # the row order (and therefore the seeded split downstream) reproducible.
    file_paths = sorted(glob.glob(os.path.join(directory_path, '*.json')))

    texts = []
    for file_path in file_paths:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(json.load(file)['text'])

    return pd.DataFrame({'text': texts, 'label': [label] * len(texts)})


# Label 0 = tweets from the "negative" folder, label 1 = "positive" folder.
data_healthy = load_labeled_tweets(NEGATIVE_TWEET_DIR, 0)
data_depression = load_labeled_tweets(POSITIVE_TWEET_DIR, 1)

data_shen = pd.concat([data_healthy, data_depression], axis=0, ignore_index=True)

# Drop empty tweets, then exact (text, label) duplicates.
data_shen = data_shen[data_shen['text'] != ""]
data_shen_duplicates = data_shen.drop_duplicates()

# Later cells index x_shen by position, so restore a contiguous 0..n-1 index.
data_shen_duplicates = data_shen_duplicates.reset_index(drop=True)

x_shen = data_shen_duplicates.text
y_shen = data_shen_duplicates.label

In [None]:
from collections import Counter

# Tally how many samples carry each label to expose class imbalance.
label_counts = Counter()
label_counts.update(y_shen)
print(label_counts)

In [None]:
import re
import unicodedata
from nltk.tokenize import RegexpTokenizer

import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize


def remove_usernames(text):
    """Return ``text`` with every ``@mention`` (``@`` plus word characters) removed."""
    return re.sub(r'@[\w]+', '', text)


def remove_urls(text):
    """Return ``text`` without ``http(s)://...`` and ``www....`` links.

    A link runs up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_hashtags(text):
    """Return ``text`` with every ``#hashtag`` (``#`` plus word characters) removed."""
    return re.sub(r'#\w+', '', text)


# Compiled once instead of on every call.
# The previous character class contained the range U+24C2..U+1F251, which spans
# most of the Basic Multilingual Plane (CJK, Hangul, ...) and therefore deleted
# ordinary non-emoji text. Every range below is a subset of what was matched
# before; only letters/scripts are no longer stripped.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics (flags), enclosed ideographs
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002900-\U0000297F"  # supplemental arrows-B
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U000024C2"             # circled latin capital letter M
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji symbols
    "]+", flags=re.UNICODE)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Letters of any script (Latin, CJK, Hangul, ...) are left untouched.

    Args:
        text: Input string.

    Returns:
        The string with every run of matched emoji code points deleted.
    """
    return _EMOJI_PATTERN.sub(r'', text)


# Vocabulary from the NLTK `words` corpus, held in a set for O(1) membership tests.
english_words = set(words.words())


def remove_non_english_words(text):
    """Keep only the tokens whose lower-cased form is in ``english_words``.

    The text is split with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)




def remove_extra_spaces(sentence):
    """Trim ``sentence`` and collapse each whitespace run into a single space."""
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with ' ' gives the normalised sentence.
    return ' '.join(sentence.split())


def remove_digits(st):
    """Return ``st`` without any character for which ``str.isdigit`` is true."""
    return ''.join(ch for ch in st if not ch.isdigit())


def digit_clean(st):
    """Pipeline-facing alias that delegates to ``remove_digits``."""
    return remove_digits(st)

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)


def data_cleaning(text):
    """Run the full tweet-normalisation pipeline on one input.

    The input is converted to ``str`` and lower-cased, then passed through
    each cleaning step in order: URLs, usernames, hashtags, emojis, digits,
    punctuation, non-English words, and finally whitespace normalisation.
    """
    cleaning_steps = (
        remove_urls,
        remove_usernames,
        remove_hashtags,
        remove_emojis,
        digit_clean,
        remove_punctuation,
        remove_non_english_words,
        remove_extra_spaces,
    )
    cleaned = str(text).lower()
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

In [None]:
# Clean every tweet; x_shen is looked up by its 0..n-1 index labels.
x_datacleaning = [data_cleaning(x_shen[row]) for row in range(len(x_shen))]


In [None]:
# random_state=42
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned tweets as the test set (seeded split).
X_train_validation, X_test, y_train_validation, y_test = train_test_split(
    x_datacleaning,
    y_shen,
    test_size=0.2,
    random_state=42,
)

In [None]:
import re

# Whole self-report phrase: "<im / i was / i am / ive been / i have been> ...
# <diagnose(s|d)> ... depression".
pattern = r"\b(im|i\swas|i\sam|ive\sbeen|i\shave\sbeen)\b.*?\b(diagnoses|diagnose|diagnosed)\b.*?\bdepression\b"

# First-person lead-ins that are stripped on their own in the last pass.
patterns_to_remove = [r'\bim\b', r'\bi was\b', r'\bi am\b', r'\bive been\b', r'\bi have been\b']


def _strip_diagnosis_keywords(text):
    """Remove the self-report phrase, the label words and the lead-ins from one text."""
    # 1) drop the whole "i was ... diagnosed ... depression" span
    stripped = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()
    # 2) drop any remaining literal occurrences of the two label words
    stripped = stripped.replace('depression', '').replace('diagnosed', '')
    # 3) drop the first-person lead-ins wherever they still appear
    return re.sub('|'.join(patterns_to_remove), '', stripped, flags=re.IGNORECASE).strip()


x_removekeyword = [_strip_diagnosis_keywords(text) for text in X_train_validation]
test_remove_keyword = [_strip_diagnosis_keywords(text) for text in X_test]

In [None]:
import torch
from transformers import BertTokenizer, BertModel


model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout is disabled

texts = x_removekeyword

# Tokenise the whole corpus at once so every sample is padded to the same
# (longest) sequence length; downstream code relies on that common length.
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Run BERT in mini-batches: a single forward pass over the full corpus keeps
# the activations of every layer for every tweet alive at once and can exhaust
# memory. The concatenated result has the same shape as a single pass.
EMBED_BATCH_SIZE = 32
batch_embeddings = []
with torch.no_grad():
    for start in range(0, len(texts), EMBED_BATCH_SIZE):
        batch = {name: tensor[start:start + EMBED_BATCH_SIZE]
                 for name, tensor in tokenized_texts.items()}
        batch_embeddings.append(model(**batch).last_hidden_state)

# Token-level hidden states, one row per training text.
sequence_output = torch.cat(batch_embeddings, dim=0)

# Free the intermediates; only sequence_output is needed from here on.
del tokenized_texts
del batch_embeddings


print("Shape of sequence output:", sequence_output.shape)

In [None]:
# Per-sample feature shape for the Keras model: dimensions 1 and 2 of the
# BERT output (dimension 0 indexes the samples).
seq_len = sequence_output.shape[1]
hidden_dim = sequence_output.shape[2]
input_shape = (seq_len, hidden_dim)

# Keras consumes numpy arrays, not torch tensors.
sequence_output_np = sequence_output.numpy()

In [None]:
import tensorflow as tf
from keras import layers, models
from scikeras.wrappers import KerasClassifier
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, Dropout, Dense, Flatten, LSTM,InputLayer, Attention,Input, Reshape
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score
from keras.regularizers import l2, l1
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from keras.activations import tanh
from keras.optimizers import *
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dropout, Dense, Attention, Reshape
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Reshape

In [None]:
# Carve the validation set out of the train+validation portion
# (0.125 of the remaining 80% = 10% of all data). The seed is fixed so the
# split is reproducible across runs, like the earlier train/test split.
X_train, x_validation, y_train, y_validation = train_test_split(
    sequence_output_np, y_train_validation, test_size=0.125, random_state=42)

In [None]:
#bilstm

from tensorflow.keras.layers import Lambda
model = Sequential()

# Declare the input shape on the model itself. An `input_shape` passed to the
# LSTM nested inside Bidirectional is not seen by Sequential, which leaves the
# model unbuilt when `model.summary()` is called.
model.add(Input(shape=input_shape))
model.add(Bidirectional(LSTM(units=32, activation='relu', return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(Dropout(0.4))
# Self-attention over the BiLSTM outputs (query and value are the same tensor).
model.add(Lambda(lambda x: Attention()([x, x])))
model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.5))

# Single sigmoid unit: probability of the positive class (label 1).
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1, validation_data=(x_validation, y_validation))

In [None]:

import numpy as np

array1 = X_test
array2 = test_remove_keyword

half_len1 = len(array1) // 2
half_len2 = len(array2) // 2

# Evaluation texts: first half taken from the cleaned test tweets, second half
# from the same tweets after keyword removal. Order is preserved, so row i
# still corresponds to y_test row i.
# NOTE(review): presumably this mix is an intentional experimental design —
# confirm; the model was trained on keyword-stripped text only.
first_half_array1 = array1[:half_len1]

second_half_array2 = array2[half_len2:]

arraytotal = np.concatenate((first_half_array1, second_half_array2))


model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)
model_bert.eval()  # inference only

# Sequence length the downstream Keras model was built for.
max_sequence_length = sequence_output.shape[1]

texts = arraytotal

# Tokenise exactly like the training data, but pad/truncate to the training
# sequence length. Letting the tokenizer truncate keeps the closing [SEP]
# token (slicing the id list cut it off), and padding *before* BERT means PAD
# positions get the same kind of hidden states as in the training features
# instead of all-zero vectors appended afterwards.
tokenized_texts = tokenizer(
    [str(text) for text in texts],
    padding='max_length',
    truncation=True,
    max_length=max_sequence_length,
    return_tensors='pt',
)

padded_texts = tokenized_texts['input_ids']
attention_masks = tokenized_texts['attention_mask']

inputs = {'input_ids': padded_texts, 'attention_mask': attention_masks}

# Mini-batched forward pass to bound peak memory.
TEST_EMBED_BATCH_SIZE = 32
test_batch_embeddings = []
with torch.no_grad():
    for start in range(0, padded_texts.shape[0], TEST_EMBED_BATCH_SIZE):
        batch = {name: tensor[start:start + TEST_EMBED_BATCH_SIZE]
                 for name, tensor in inputs.items()}
        test_batch_embeddings.append(model_bert(**batch).last_hidden_state)

x_test = torch.cat(test_batch_embeddings, dim=0)
del test_batch_embeddings

# Already at max_sequence_length thanks to padding='max_length'; the name is
# kept for the cells that follow.
padded_x_test = x_test

print("Shape of sequence output:", padded_x_test.shape)


x_test_np = padded_x_test.numpy()

In [None]:
# Predicted probabilities -> hard 0/1 labels using a 0.5 threshold.
y_test_pred = model.predict(x_test_np)
y_test_pred_binary = (y_test_pred > 0.5).astype(int)

# Headline metrics on the held-out test set.
test_accuracy = accuracy_score(y_test, y_test_pred_binary)
print(f'Accuracy: {test_accuracy:.4f}')

test_f1_score = f1_score(y_test, y_test_pred_binary)
print(f'F1 Score: {test_f1_score:.4f}')

test_precision = precision_score(y_test, y_test_pred_binary)
print(f'Precision: {test_precision:.4f}')

test_recall = recall_score(y_test, y_test_pred_binary)
print(f'Recall: {test_recall:.4f}')

# Confusion matrix: rows are true classes, columns are predicted classes.
matrix = confusion_matrix(y_test, y_test_pred_binary)

TN, FP = matrix[0, 0], matrix[0, 1]
FN, TP = matrix[1, 0], matrix[1, 1]

# Rates derived from the confusion-matrix cells.
specificity = TN / (TN + FP)
sensitivity = TP / (TP + FN)
error_rate = (FP + FN) / (TP + TN + FP + FN)

for metric_label, metric_value in (
    ("Specificity:", specificity),
    ("Sensitivity:", sensitivity),
    ("Error Rate:", error_rate),
):
    print(metric_label, f'{metric_value:.4f}')

# Per-class accuracy: correct predictions of a class / true samples of it.
class_accuracy = matrix.diagonal() / matrix.sum(axis=1)

for class_index, class_acc in enumerate(class_accuracy):
    print(f'Accuracy for class {class_index}: {class_acc:.4f}')
    