In [1]:
import os
import re
import time

import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from google import genai
from google.genai import types
from openai import OpenAI
from tqdm.notebook import tqdm

In [2]:
# Load the "mrjunos/depression-reddit-cleaned" dataset through
# `datasets.load_dataset` and keep its "train" split.
ds = load_dataset("mrjunos/depression-reddit-cleaned")
train = ds["train"]

# Materialise the split as a pandas DataFrame; the preview below shows
# `text` and `label` columns.
df = pd.DataFrame(train)

In [3]:
# Preview the first rows of the full dataset.
df.head()

Unnamed: 0,text,label
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [4]:
# Rows used as the test set.
# NOTE(review): data/test.csv is not written by any cell visible in this
# notebook, so it must already exist on disk.
test_df = pd.read_csv("data/test.csv")

In [5]:
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0,text,label
0,lol what anna schmance i soo wan na meet up wi...,0
1,i have a tendency to ob over some thing that h...,1
2,im tired i m so tired a tiredness that i can t...,1
3,katyrullman this is why you need to not be ove...,0
4,watchin i m not there and missing heath ledger,0


In [6]:
# Run python-dotenv's loader first so the API keys can be read from the
# process environment, then build one client per provider.
load_dotenv()

openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
gemini_client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
# OpenRouter is reached through the OpenAI SDK pointed at a different base URL.
openrouter_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)

In [7]:
# response = gemini_client.models.generate_content(
#     model="gemini-2.0-flash",
#     contents="Explain the concept of a blockchain.",
# )

# print(response.text)

In [27]:
# Prompt builder used for every model call
def generate_prompt(input_text):
    """Wrap ``input_text`` in the depression-severity rating instructions.

    The prompt asks for a 0-4 Likert severity rating plus a brief explanation
    and places the text between ``Input:`` and ``Output:`` markers.

    Args:
        input_text: The text to rate; it is interpolated with normal
            f-string formatting.

    Returns:
        The complete prompt string.
    """
    # The leading/trailing whitespace inside these literals is part of the
    # prompt that is sent to the model, so it is spelled out explicitly.
    instructions = (
        "\n"
        "    You are a highly intelligent mental health professional tasked with identifying whether a given text exert from an individual shows signs of depression.\n"
        " You must use likert scale to rate the severity of the depression. The ranges are 0 to 4 with 0 being no depression, 1 being mild depression, 2 being moderate depression, 3 being severe depression, and 4 being extreme depression. You must also provide a brief explanation of why you rated the individual as such. \n"
    )
    return f"{instructions}Input:\n{input_text}\nOutput:\n"


# Gemini-backed severity rating.

# Matches the scale's own range ("0 to 4", "0-4") so that a mention of the
# scale is not mistaken for the rating itself.
_GEMINI_SCALE_RANGE_RE = re.compile(r"\b0\s*(?:[-\u2013\u2014]|to)\s*4\b", re.IGNORECASE)
# A single digit 0-4 that is not glued to other digits/letters ("10", "2nd").
_GEMINI_RATING_RE = re.compile(r"(?<!\w)[0-4](?!\w)")


def _parse_gemini_severity(reply):
    """Split a model reply into ``(rating, explanation)``.

    The rating is the first standalone digit 0-4 on the first line that has
    one, after mentions of the scale range are removed. The explanation is
    the text after that line; when nothing follows, or no rating is found,
    the whole stripped reply is kept so no model output is lost.
    """
    if not isinstance(reply, str):
        # Nothing to parse; hand the value back unchanged for inspection.
        return None, reply

    body = reply.strip()
    lines = body.split('\n')
    for pos, line in enumerate(lines):
        found = _GEMINI_RATING_RE.search(_GEMINI_SCALE_RANGE_RE.sub(' ', line))
        if found:
            rest = '\n'.join(lines[pos + 1:]).strip()
            return int(found.group()), rest or body
    return None, body


def get_depression_severity(df, model="gemini-2.0-flash", num_samples=None):
    """
    Process texts from dataframe and get depression severity ratings from Gemini

    Args:
        df: DataFrame containing a 'text' column
        model: Gemini model name passed to ``generate_content``
        num_samples: Number of leading rows to process. ``None`` (default)
            processes every row of ``df``. The default is resolved per call
            rather than frozen when the function is defined.

    Returns:
        Copy of the first ``num_samples`` rows with added 'severity_rating'
        (int 0-4 or None) and 'explanation' columns. Rows whose API call
        fails get rating None and an "Error: ..." explanation.
    """
    if num_samples is None:
        num_samples = len(df)

    # Create a copy to avoid modifying the original dataframe
    result_df = df.head(num_samples).copy()
    result_df['severity_rating'] = None
    result_df['explanation'] = None

    for idx, row in tqdm(result_df.iterrows(), total=len(result_df), desc="Processing samples"):
        prompt = generate_prompt(row['text'])

        try:
            response = gemini_client.models.generate_content(
                model=model,
                contents=prompt,
            )
            rating, explanation = _parse_gemini_severity(response.text)
            result_df.at[idx, 'severity_rating'] = rating
            result_df.at[idx, 'explanation'] = explanation

            # Only throttle larger batches
            if num_samples > 20:
                time.sleep(3)
        except Exception as e:
            # Best effort: record the failure on the row and keep going.
            print(f"Error processing sample {idx}: {e}")
            result_df.at[idx, 'severity_rating'] = None
            result_df.at[idx, 'explanation'] = f"Error: {str(e)}"

    return result_df

In [9]:
# OpenRouter-backed severity rating (chat-completions API).

# Matches the scale's own range ("0 to 4", "0-4") so that a mention of the
# scale is not mistaken for the rating itself.
_OPENROUTER_SCALE_RANGE_RE = re.compile(r"\b0\s*(?:[-\u2013\u2014]|to)\s*4\b", re.IGNORECASE)
# A single digit 0-4 that is not glued to other digits/letters ("10", "2nd").
_OPENROUTER_RATING_RE = re.compile(r"(?<!\w)[0-4](?!\w)")


def _parse_openrouter_severity(reply):
    """Split a model reply into ``(rating, explanation)``.

    The rating is the first standalone digit 0-4 on the first line that has
    one, after mentions of the scale range are removed. The explanation is
    the text after that line; when nothing follows, or no rating is found,
    the whole stripped reply is kept so no model output is lost.
    """
    if not isinstance(reply, str):
        # Nothing to parse; hand the value back unchanged for inspection.
        return None, reply

    body = reply.strip()
    lines = body.split('\n')
    for pos, line in enumerate(lines):
        found = _OPENROUTER_RATING_RE.search(_OPENROUTER_SCALE_RANGE_RE.sub(' ', line))
        if found:
            rest = '\n'.join(lines[pos + 1:]).strip()
            return int(found.group()), rest or body
    return None, body


def get_depression_severity_openrouter(df, model, num_samples=None):
    """
    Process texts from dataframe and get depression severity ratings via OpenRouter

    Args:
        df: DataFrame containing a 'text' column
        model: OpenRouter model identifier (required)
        num_samples: Number of leading rows to process. ``None`` (default)
            processes every row of ``df``. The default is resolved per call
            rather than frozen when the function is defined.

    Returns:
        Copy of the first ``num_samples`` rows with added 'severity_rating'
        (int 0-4 or None) and 'explanation' columns. Rows whose API call
        fails get rating None and an "Error: ..." explanation.

    Raises:
        ValueError: if ``model`` is None.
    """
    # Validate before doing any work on the frame.
    if model is None:
        raise ValueError("Model name must be provided.")

    if num_samples is None:
        num_samples = len(df)

    # Create a copy to avoid modifying the original dataframe
    result_df = df.head(num_samples).copy()
    result_df['severity_rating'] = None
    result_df['explanation'] = None

    for idx, row in tqdm(result_df.iterrows(), total=len(result_df), desc="Processing samples"):
        prompt = generate_prompt(row['text'])

        try:
            response = openrouter_client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a highly intelligent mental health professional tasked with identifying whether a given text exert from an individual shows signs of depression."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ]
            )
            rating, explanation = _parse_openrouter_severity(response.choices[0].message.content)
            result_df.at[idx, 'severity_rating'] = rating
            result_df.at[idx, 'explanation'] = explanation

            # Only apply rate limiting for larger sample sizes
            if num_samples > 20:
                time.sleep(3)  # Rate limiting
        except Exception as e:
            # Best effort: record the failure on the row and keep going.
            print(f"Error processing sample {idx}: {e}")
            result_df.at[idx, 'severity_rating'] = None
            result_df.at[idx, 'explanation'] = f"Error: {str(e)}"

    return result_df


In [10]:
# Reuse the cached extra rows when present; otherwise take the first 20
# dataset rows whose text does not occur in the test set and cache them.
if os.path.exists("data/extra_rows.csv"):
    print("File extra_rows.csv already exists. Loading from file.")
    extra_rows = pd.read_csv("data/extra_rows.csv")
else:
    extra_rows = df.loc[~df['text'].isin(test_df['text'])].head(20)
    extra_rows.to_csv("data/extra_rows.csv", index=False)

File extra_rows.csv already exists. Loading from file.


In [11]:
# Gemini ratings for the extra rows: load the cached CSV if it exists,
# otherwise call the model for every row and cache the result.
if os.path.exists("data/gemini_extra_rows.csv"):
    print("File gemini_extra_rows.csv already exists. Loading from file.")
    extra_rows_gemini = pd.read_csv("data/gemini_extra_rows.csv")
else:
    extra_rows_gemini = get_depression_severity(
        extra_rows,
        num_samples=extra_rows.shape[0],
    )
    extra_rows_gemini.to_csv("data/gemini_extra_rows.csv", index=False)

File gemini_extra_rows.csv already exists. Loading from file.


In [12]:
# DeepSeek (via OpenRouter) ratings for the extra rows: load the cached CSV
# if it exists, otherwise call the model for every row and cache the result.
if os.path.exists("data/openrouter_extra_rows.csv"):
    print("File openrouter_extra_rows.csv already exists. Loading from file.")
    extra_rows_openrouter = pd.read_csv("data/openrouter_extra_rows.csv")
else:
    extra_rows_openrouter = get_depression_severity_openrouter(
        extra_rows,
        num_samples=extra_rows.shape[0],
        model="deepseek/deepseek-chat:free",
    )
    extra_rows_openrouter.to_csv("data/openrouter_extra_rows.csv", index=False)

File openrouter_extra_rows.csv already exists. Loading from file.


In [13]:
# DeepSeek (via OpenRouter) ratings for the test set: load the cached CSV
# if it exists, otherwise call the model for every row and cache the result.
if os.path.exists("data/test_results_openrouter.csv"):
    print("File test_results_openrouter.csv already exists. Loading from file.")
    test_results_openrouter = pd.read_csv("data/test_results_openrouter.csv")
else:
    test_results_openrouter = get_depression_severity_openrouter(
        test_df,
        num_samples=test_df.shape[0],
        model="deepseek/deepseek-chat:free",
    )
    test_results_openrouter.to_csv("data/test_results_openrouter.csv", index=False)

File test_results_openrouter.csv already exists. Loading from file.


In [14]:
# Draw 500 more rows from df whose text appears in neither test_df nor extra_rows.
# NOTE(review): this is a plain random draw (random_state=42). It is NOT
# stratified by label, so an equal split of labels 0 and 1 is not guaranteed.
# The result is cached in data/extra_rows_500.csv and reused on later runs.
if not os.path.exists("data/extra_rows_500.csv"):
    extra_rows_500 = df[~df['text'].isin(test_df['text']) & ~df['text'].isin(extra_rows['text'])].sample(n=500, random_state=42)
    extra_rows_500.to_csv("data/extra_rows_500.csv", index=False)
else:
    print("File extra_rows_500.csv already exists. Loading from file.")
    extra_rows_500 = pd.read_csv("data/extra_rows_500.csv")

File extra_rows_500.csv already exists. Loading from file.


In [15]:
# Gemini ratings for the 500-row sample: load the cached CSV if it exists,
# otherwise call the model for every row and cache the result.
if os.path.exists("data/gemini_extra_rows_500.csv"):
    print("File gemini_extra_rows_500.csv already exists. Loading from file.")
    extra_rows_gemini_500 = pd.read_csv("data/gemini_extra_rows_500.csv")
else:
    extra_rows_gemini_500 = get_depression_severity(
        extra_rows_500,
        num_samples=extra_rows_500.shape[0],
    )
    extra_rows_gemini_500.to_csv("data/gemini_extra_rows_500.csv", index=False)

File gemini_extra_rows_500.csv already exists. Loading from file.


In [16]:
# Rows to be rated by DeepSeek in the next cell.
# NOTE(review): data/missing_df.csv is not written by any cell visible in
# this notebook, so it must already exist on disk.
missing_df = pd.read_csv("data/missing_df.csv")

In [17]:
# DeepSeek (via OpenRouter) ratings for the rows in missing_df: load the
# cached CSV if it exists, otherwise call the model and cache the result.
if os.path.exists("data/remaining_missing_df_deepseek.csv"):
    print("File remaining_missing_df_deepseek.csv already exists. Loading from file.")
    remaining = pd.read_csv("data/remaining_missing_df_deepseek.csv")
else:
    remaining = get_depression_severity_openrouter(
        missing_df,
        num_samples=missing_df.shape[0],
        model="deepseek/deepseek-chat",
    )
    remaining.to_csv("data/remaining_missing_df_deepseek.csv", index=False)

File remaining_missing_df_deepseek.csv already exists. Loading from file.


In [18]:
# Reload the two Gemini result sets from disk.
# NOTE(review): data/gemini-flash-2.0_results.csv is written by the
# Evaluation cell further down, so on a fresh data directory this cell runs
# before that file exists.
gemini_500 = pd.read_csv("data/gemini_extra_rows_500.csv")
gemini_25 = pd.read_csv("data/gemini-flash-2.0_results.csv")

In [19]:
# Stack both Gemini result sets into one frame with a fresh 0..n-1 index,
# then display the combined row count.
gemini_525 = pd.concat([gemini_500, gemini_25], ignore_index=True)
gemini_525.shape[0]

525

In [None]:
# Write the combined Gemini results once; an existing file is never overwritten.
if os.path.exists("data/gemini-flash-2.0_results-525.csv"):
    print("File gemini-flash-2.0_results-525.csv already exists.")
else:
    gemini_525.to_csv("data/gemini-flash-2.0_results-525.csv", index=False)

In [21]:
# Create llama_525_untrained from gemini_525, keeping only the 'text' and
# 'label' columns. The copy is independent of gemini_525, so the rating
# columns added later do not touch the Gemini frame.
llama_525_untrained = gemini_525[['text', 'label']].copy()

In [22]:
# Llama 3.3 70B (via OpenRouter) ratings for the 525 rows: load the cached
# CSV if it exists, otherwise call the model and cache the result.
if os.path.exists("data/llama-3.3-70B_results-525.csv"):
    print("File llama-3.3-70B_results-525.csv already exists. Loading from file.")
    llama_525 = pd.read_csv("data/llama-3.3-70B_results-525.csv")
else:
    llama_525 = get_depression_severity_openrouter(
        llama_525_untrained,
        num_samples=llama_525_untrained.shape[0],
        model="meta-llama/llama-3.3-70b-instruct",
    )
    llama_525.to_csv("data/llama-3.3-70B_results-525.csv", index=False)

File llama-3.3-70B_results-525.csv already exists. Loading from file.


### Evaluation

In [23]:
# File paths
gemini_output_path = "data/gemini-flash-2.0_results.csv"
deepseek_output_path = "data/deepseek-v3_results.csv"

# test_results.csv is only needed to BUILD the combined Gemini file, so its
# absence must not block loading an existing Gemini file or the DeepSeek merge.
# NOTE(review): no cell visible in this notebook writes data/test_results.csv.
have_test_results = os.path.exists("data/test_results.csv")
if have_test_results:
    test_results = pd.read_csv("data/test_results.csv")

# Process and save Gemini results
if os.path.exists(gemini_output_path):
    print(f"File {gemini_output_path} already exists. Skipping.")
    df_gemini_f2 = pd.read_csv(gemini_output_path)
elif have_test_results:
    # Combine gemini_extra_rows and test_results into one dataframe
    df_gemini_f2 = pd.concat([extra_rows_gemini, test_results], ignore_index=True)
    df_gemini_f2.to_csv(gemini_output_path, index=False)
    print(f"Saved Gemini results to {gemini_output_path}")
else:
    print("File test_results.csv not found. Please run the previous cells to generate this file.")

# Process and save DeepSeek results (independent of test_results.csv)
if os.path.exists(deepseek_output_path):
    print(f"File {deepseek_output_path} already exists. Skipping.")
    df_deepseek_v3 = pd.read_csv(deepseek_output_path)
else:
    # Combine openrouter_extra_rows and test_results_openrouter into one dataframe
    df_deepseek_v3 = pd.concat([extra_rows_openrouter, test_results_openrouter], ignore_index=True)
    df_deepseek_v3.to_csv(deepseek_output_path, index=False)
    print(f"Saved DeepSeek results to {deepseek_output_path}")

File data/gemini-flash-2.0_results.csv already exists. Skipping.
File data/deepseek-v3_results.csv already exists. Skipping.


In [None]:
# Unrated inputs: just the 'text' and 'label' columns of the 525 rows.
# Build the frame unconditionally so `untrained_525` is defined whether or not
# the CSV already exists; only the write is guarded.
untrained_525 = gemini_525[['text', 'label']].copy()
if not os.path.exists("data/untrained_525.csv"):
    untrained_525.to_csv("data/untrained_525.csv", index=False)
else:
    print("File untrained_525.csv already exists.")