In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install trl

In [None]:
import os
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from tqdm import tqdm
from warnings import filterwarnings

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from textblob import Word, TextBlob

from sklearn.model_selection import train_test_split

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    BertTokenizer,
    BertModel,
    BertForSequenceClassification
)

from datasets import Dataset, DatasetDict

from peft import LoraConfig, PeftConfig

from trl import SFTTrainer, setup_chat_format

import bitsandbytes as bnb

from torch.utils.data import Dataset, DataLoader

# Silence library warnings and configure how pandas renders frames in the notebook:
# show every column, format floats with two decimals, and allow 200-character lines.
filterwarnings('ignore')
pd.options.display.max_columns = None
pd.options.display.float_format = lambda x: '%.2f' % x
pd.options.display.width = 200

# Preprocessing

In [None]:
# Load the combined dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    "/content/CombinedData.csv",
    sep=",",
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

In [None]:
# Summarize columns, dtypes and non-null counts.
data.info()

In [None]:
# Count how many rows carry each status label.
data["status"].value_counts()

In [None]:
# Pie chart showing the share of each status label in the data.
status_counts = data["status"].value_counts()

plt.figure(figsize=(8, 8))
status_counts.plot(
    kind="pie",
    title="Status Distribution",
    colors=plt.cm.Set3.colors,
    startangle=90,
    autopct='%1.1f%%',
)
plt.ylabel("")  # drop the y-axis label pandas adds from the series name
plt.show()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how="any")

In [None]:
# How many statements repeat an earlier one (True) versus appear for the first time (False).
data["statement"].duplicated(keep="first").value_counts()

In [None]:
# Keep only the first occurrence of each distinct statement.
data = data.drop_duplicates(subset="statement", keep="first")

In [None]:
# Cap every status class at `target_count` rows; smaller classes are kept whole.
target_count = 5000

# Sample each group explicitly and concatenate the pieces instead of using
# `groupby(...).apply(...)`: pandas deprecates passing the grouping column into
# `apply` and newer releases exclude it, which would drop "status" once the
# group index is discarded by `reset_index(drop=True)`.
balanced_data = pd.concat(
    [
        group.sample(min(len(group), target_count), random_state=42)
        for _, group in data.groupby("status")
    ]
).reset_index(drop=True)

print("Original Class Distribution:")
print(data["status"].value_counts())

print("\nBalanced Class Distribution:")
print(balanced_data["status"].value_counts())

In [None]:
# Character length of every statement, plotted as a histogram with a KDE overlay.
balanced_data['statement_length'] = balanced_data['statement'].map(len)

plt.figure(figsize=(10, 3))
sns.histplot(
    balanced_data['statement_length'],
    kde=True,
    bins=50,
)
plt.title('Distribution of Statement Lengths')
plt.xlabel('Length of Statement_length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of whitespace-separated tokens in each statement.
balanced_data['words'] = balanced_data['statement'].map(lambda statement: len(statement.split()))

In [None]:
# Preview the word count next to the statement it was computed from.
balanced_data[['words','statement']].head()

In [None]:
# Summary statistics of the per-statement word counts.
balanced_data['words'].describe()

In [None]:
# Word-count threshold, and the per-column count of rows that fall below it.
min_statement_size = 16
balanced_data.loc[balanced_data["words"].lt(min_statement_size)].count()

In [None]:
# Keep statements with at least `min_statement_size` words, i.e. the exact complement
# of the `words < min_statement_size` rows counted in the previous cell (the original
# `>` also discarded rows with exactly `min_statement_size` words).
# `.copy()` makes `df` an independent frame so the column assignments in later
# cells do not write into a slice of `balanced_data`.
df = balanced_data[balanced_data['words'] >= min_statement_size].copy()

In [None]:
# Class distribution after filtering by word count.
df['status'].value_counts()

In [None]:
def preprocess_text(text):
    """Normalize a raw statement for modelling.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop HTML tags,
    drop ASCII punctuation, turn line breaks into spaces, and drop every token
    that contains a digit.

    Parameters
    ----------
    text : str
        Raw statement.

    Returns
    -------
    str
        The cleaned statement. Runs of spaces left by removed tokens are not collapsed.
    """
    text = text.lower()  # Lowercase text
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove links
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    # Replace line breaks with a space: deleting them outright glued the last word
    # of one line to the first word of the next ("hello\nworld" -> "helloworld").
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    return text

# Clean every statement with `preprocess_text`.
df['statement'] = df['statement'].apply(preprocess_text)

In [None]:
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads.
nltk.download('stopwords')

In [None]:
def remove_stopwords(text):
    """Remove English stop words from every entry of a pandas Series.

    Parameters
    ----------
    text : pandas.Series
        Series of statements; each entry is converted with ``str`` and split on whitespace.

    Returns
    -------
    pandas.Series
        Series of the same length whose entries are the remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests; the plain NLTK list was
    # scanned from the start for every single token.
    stop_words = set(stopwords.words('english'))
    return text.apply(
        lambda entry: " ".join(token for token in str(entry).split() if token not in stop_words)
    )

# Strip English stop words from the statement column.
df['statement'] = df['statement'].pipe(remove_stopwords)

In [None]:
# Token frequencies over the whole corpus; the last 30 entries are the rarest tokens.
rare_values = pd.Series(' '.join(df["statement"]).split()).value_counts().tail(30)
print(len(rare_values))

# Remove those tokens from every statement (the tokens are the index of `rare_values`).
df["statement"] = df["statement"].apply(
    lambda statement: " ".join(token for token in statement.split() if token not in rare_values.index)
)

In [None]:
# Fetch the WordNet corpus before the lemmatization step below.
nltk.download('wordnet')

In [None]:
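# Disabled on purpose. Presumably a manual workaround for environments where the
# downloaded WordNet archive is not unpacked automatically — confirm before enabling.
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/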

In [None]:
# Lemmatize every token of every statement with TextBlob's `Word.lemmatize`.
df['statement'] = df['statement'].apply(
    lambda statement: " ".join(Word(token).lemmatize() for token in statement.split())
)

In [None]:
def generate_wordcloud(dataframe, text_column, group_column):
    """Plot one word cloud per distinct value of ``group_column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text and the grouping column.
    text_column : str
        Column whose non-null entries are joined into one text per group.
    group_column : str
        Column whose distinct values define the groups.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``. Groups without any text are
        skipped with a printed notice instead of raising.
    """
    # sort=False keeps groups in order of first appearance, as `.unique()` did;
    # groupby also leaves out rows whose group key is missing.
    for group, group_frame in dataframe.groupby(group_column, sort=False):
        group_text = " ".join(group_frame[text_column].dropna().astype(str))

        # WordCloud.generate raises ValueError when it receives no words at all.
        if not group_text.strip():
            print(f"Skipping WordCloud for {group}: no text available")
            continue

        wordcloud = WordCloud(
            max_font_size=50,
            max_words=100,
            background_color="white"
        ).generate(group_text)

        plt.figure()
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.title(f"WordCloud for {group}")
        plt.axis("off")
        plt.show()

# One word cloud per status label, built from the cleaned statements.
generate_wordcloud(df, "statement", "status")

In [None]:
# Integer class id for every status string; the ids are the classifier targets.
label_map = {
    'Normal': 0,
    'Depression': 1,
    'Suicidal': 2,
    'Anxiety': 3,
    'Bipolar': 4,
    'Stress': 5,
    'Personality disorder': 6
}
df["label"] = df["status"].map(label_map)

# `map` yields NaN for any status missing from `label_map`, which would silently
# turn the label column into floats. Fail fast with the offending values instead.
unmapped_statuses = df.loc[df["label"].isna(), "status"].unique()
if len(unmapped_statuses) > 0:
    raise ValueError(f"Status values missing from label_map: {list(unmapped_statuses)}")
df["label"] = df["label"].astype(int)

In [None]:
# Preview the frame with the new `label` column.
df.head()

In [None]:
# The length helper columns were only needed for exploration and filtering.
df = df.drop(columns=["statement_length", "words"])

In [None]:
# Shuffle all rows. The fixed seed keeps the row order, and therefore the
# train/test split derived from it, identical across reruns.
df = df.sample(frac=1, random_state=42)

In [None]:
# Non-null row count per column of the final frame.
df.count()

# BERT

In [None]:
# Hold out 25% of the rows (scikit-learn's default share) for evaluation.
train_x, test_x, train_y, test_y = train_test_split(
    df["statement"],
    df["label"],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Load the tokenizer of the 'bert-base-uncased' checkpoint that is fine-tuned below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize both splits: truncate to 128 tokens, pad, and return PyTorch tensors.
train_encodings = bert_tokenizer(
    train_x.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
test_encodings = bert_tokenizer(
    test_x.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [None]:
# Class ids of each split as tensors.
train_labels = torch.tensor(train_y.tolist())
test_labels = torch.tensor(test_y.tolist())

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable values
        # labels: indexable sequence, one entry per sample
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at position ``idx`` plus the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap both splits as datasets.
train_dataset, test_dataset = (
    CustomDataset(train_encodings, train_labels),
    CustomDataset(test_encodings, test_labels),
)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

In [None]:
from torch.optim import AdamW

# Size the classification head from `label_map` so it cannot drift from the
# label encoding (a hard-coded 7 would go stale if a class is added or removed).
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map),
)
bert_model.train()

optimizer = AdamW(bert_model.parameters(), lr=5e-5)

In [None]:
from torch.nn import CrossEntropyLoss

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

loss_fn = CrossEntropyLoss()

for epoch in range(15):
    bert_model.train()
    # Running totals for the epoch summary printed below.
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )

        # Forward pass; the loss is computed here from the raw logits.
        optimizer.zero_grad()
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Book-keeping for the epoch metrics.
        preds = logits.argmax(dim=1)
        total_loss += loss.item()
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1} tamamlandı, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.metrics import classification_report

# Collect predictions for the whole test split without tracking gradients.
bert_model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )

        outputs = bert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Per-class precision, recall and F1 on the test split.
print(classification_report(true_labels, predictions))

In [None]:
# Single example used to try out the fine-tuned classifier.
text = "My grades are always low. I am worried about my future."

# Tokenize it, padded and truncated to a fixed length of 128 tokens.
inputs = bert_tokenizer(
    text,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

In [None]:
# Predict the class id of the example sentence.
bert_model.eval()

inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

with torch.no_grad():
    outputs = bert_model(**inputs)
    logits = outputs.logits

predicted_class = logits.argmax(dim=1).item()

print("Tahmin edilen sınıf:", predicted_class)

In [None]:
# Reverse lookup: class id -> status string.
label_map_inverse = dict(zip(label_map.values(), label_map.keys()))

# Ids without an entry fall back to "Unknown".
predicted_label = label_map_inverse.get(predicted_class, "Unknown")
print("Tahmin edilen etiket:", predicted_label)

In [None]:
# Prompt for the Llama model: the text, its predicted label, then the instruction.
llama_input = (
    f"Text: {text}\n"
    f"Predicted Label: {predicted_label}\n\n"
    "Summarize the text based on its content and predicted label."
)
print("Llama Model Input:", llama_input)

In [None]:
# Save the fine-tuned classifier to ./mental_model.
bert_model.save_pretrained("./mental_model")

In [None]:
# Save the tokenizer into the same directory as the model.
bert_tokenizer.save_pretrained("./mental_model")

# LLM

In [None]:
from huggingface_hub import login

# `HF_TOKEN` was never defined in this notebook, so the original call raised
# NameError on a fresh kernel. Read the token from the environment instead of
# hard-coding it; when the variable is unset, `login` receives None and asks
# for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Llama 3.2 1B Instruct and its tokenizer.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 weights with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
)

In [None]:
# Move the Llama model to the GPU.
# NOTE(review): models loaded with a bitsandbytes quantization config are usually
# placed on the GPU during loading already — confirm this explicit move is needed.
model = model.to("cuda")

In [None]:
# Tokenize the prompt and move the tensors to the GPU.
inputs = tokenizer(llama_input, return_tensors="pt").to("cuda")

# Beam search with 5 beams; `max_length` caps the total sequence length at 256 tokens.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        num_beams=5,
        early_stopping=True,
        max_length=256,
    )

# Decode the first returned sequence.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Llama Model Output:", generated_text)

In [None]:
!pip install gradio

In [None]:
import gradio as gr

def classify_and_summarize(input_text):
    """Classify ``input_text`` with the fine-tuned BERT model and summarize it with Llama.

    Parameters
    ----------
    input_text : str
        Free text entered by the user.

    Returns
    -------
    str
        ``"Predicted Label: ...\\n\\nSummary: ..."``, or a short notice when no
        text was provided.
    """
    # Nothing to classify or summarize for empty or whitespace-only input.
    if not input_text or not input_text.strip():
        return "Please enter some text to analyze."

    # NOTE(review): the classifier was fine-tuned on cleaned text (lower-cased,
    # stop words removed, lemmatized) while raw text is passed here — consider
    # applying the same preprocessing before tokenizing.
    bert_inputs = bert_tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,  # same limit that was used for the training encodings
    ).to(device)
    with torch.no_grad():
        logits = bert_model(**bert_inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    predicted_class = label_map_inverse.get(predicted_class, "Unknown")
    predicted_label = f"Class {predicted_class}"

    llama_input = f"Text: {input_text}\nPredicted Label: {predicted_label}\n\nSummarize the text based on its content and predicted label."
    # Follow the model's own device instead of hard-coding "cuda".
    llama_inputs = tokenizer(llama_input, return_tensors="pt").to(model.device)
    prompt_length = llama_inputs["input_ids"].shape[1]

    with torch.no_grad():
        llama_outputs = model.generate(
            input_ids=llama_inputs["input_ids"],
            # The mask must belong to this prompt; the original read the global
            # `inputs` left over from an earlier cell.
            attention_mask=llama_inputs["attention_mask"],
            # Budget for generated tokens only; `max_length` also counted the
            # prompt, so a long input left no room for the summary.
            max_new_tokens=256,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    summary = tokenizer.decode(llama_outputs[0][prompt_length:], skip_special_tokens=True).strip()

    return f"Predicted Label: {predicted_label}\n\nSummary: {summary}"

# Minimal Gradio UI: a heading, an input box, a button and a result box.
with gr.Blocks() as demo:
    gr.Markdown("# Ruh Sağlığı Analizi ")
    gr.Markdown("Aşağıya şikayetinizi girin ve sınıflandırmak ve özet için düğmeye tıklayınız.")

    input_text = gr.Textbox(label="Enter your text here")
    classify_button = gr.Button("Classify and Summarize")
    output_text = gr.Textbox(label="Result")

    # Clicking the button sends the input text through the classifier and summarizer.
    classify_button.click(
        classify_and_summarize,
        inputs=input_text,
        outputs=output_text,
    )

demo.launch(debug=True)