In [35]:
import pandas as pd
import numpy as np
import time
import os
import logging
from datetime import datetime
from datasets import load_dataset
from openai import OpenAI
from tqdm.notebook import tqdm
from google.genai import types
from dotenv import load_dotenv

In [2]:
ds = load_dataset("mrjunos/depression-reddit-cleaned")
train = ds["train"]

df = pd.DataFrame(train)

In [3]:
load_dotenv()
openrouter_client = OpenAI(base_url="https://openrouter.ai/api/v1",api_key=os.getenv("OPENROUTER_API_KEY"))

In [82]:
def generate_prompt(input_text):
    return f"""
    You are a highly intelligent mental health professional tasked with identifying whether a given text exert from an individual shows signs of depression.
 You must use likert scale to rate the severity of the depression. The ranges are 0 to 4 with 0 being no depression, 1 being mild depression, 2 being moderate depression, 3 being severe depression, and 4 being extreme depression. You must also provide a brief explanation of why you rated the individual as such. 
Input:
{input_text}
Output:
"""

In [97]:
def get_depression_severity_openrouter(df, model, num_samples=None):
    """
    Process texts from dataframe and get depression severity ratings from OpenRouter
    
    Args:
        df: DataFrame containing text data
        model: Model name to use with OpenRouter
        num_samples: Number of samples to process (default None to process all rows)
    
    Returns:
        DataFrame with original data, severity ratings and explanations
    """
    # Set num_samples to the total number of rows if None
    if num_samples is None:
        num_samples = df.shape[0]
    
    # Create a copy to avoid modifying the original dataframe
    result_df = df.head(num_samples).copy()
    
    # Initialize columns if they don't exist
    if 'severity_rating' not in result_df.columns:
        result_df['severity_rating'] = None
    if 'explanation' not in result_df.columns:
        result_df['explanation'] = None

    if model is None:
        raise ValueError("Model name must be provided.")
    
    # Filter for rows that need processing (where severity_rating is None/NaN)
    rows_to_process = result_df[result_df['severity_rating'].isna()].index
    
    for idx in tqdm(rows_to_process, total=len(rows_to_process), 
                   desc="Processing samples"):
        text = result_df.loc[idx, 'text']
        prompt = generate_prompt(text)
        
        try:
            response = openrouter_client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a highly intelligent mental health professional tasked with identifying whether a given text exert from an individual shows signs of depression."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ]
            )
            result = response.choices[0].message.content
            
            # Try to extract rating and explanation
            try:
                # Assuming the response is formatted with a numerical rating followed by explanation
                lines = result.strip().split('\n')
                rating_text = next((line for line in lines if any(str(i) in line for i in range(5))), '')
                rating = next((int(i) for i in range(5) if str(i) in rating_text), None)
                
                # Get explanation (everything after the rating)
                explanation = '\n'.join(lines[lines.index(rating_text) + 1:]) if rating_text in lines else result
                
                result_df.at[idx, 'severity_rating'] = rating
                result_df.at[idx, 'explanation'] = explanation.strip()
            except:
                # If parsing fails, store the full response in explanation
                result_df.at[idx, 'severity_rating'] = None
                result_df.at[idx, 'explanation'] = result
                
            # Only apply rate limiting for larger sample sizes
            if len(rows_to_process) > 20:
                time.sleep(3)  # Rate limiting
        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            result_df.at[idx, 'severity_rating'] = None
            result_df.at[idx, 'explanation'] = f"Error: {str(e)}"
    
    return result_df


In [63]:
def cleanup_df(df):
    """
    Clean up the DataFrame by removing rows with NaN values in 'severity_rating' and 'explanation' columns.

    Args:
        df: DataFrame to clean.

    Returns:
        Cleaned DataFrame.
    """
    cleanup_df = df.copy()
    # Replace NaN wih '' in 'severity_rating' and 'explanation' columns
    # Also, remove error messages from 'explanation' column and replace with ''
    # More efficient way to handle missing values using fillna
    # Handle NaN values in severity_rating column
    cleanup_df['severity_rating'] = cleanup_df['severity_rating'].fillna('')
    cleanup_df['explanation'] = cleanup_df['explanation'].fillna('')
    
    # Convert empty strings back to NaN for severity_rating if we need it as numeric later
    # This is needed because the next part of the function tries to convert to numeric
    mask = cleanup_df['severity_rating'] == ''
    cleanup_df.loc[mask, 'severity_rating'] = np.nan
    # Use a more robust pattern matching approach for error messages
    error_patterns = ['API Error:', 'Parsing Error:']
    
    # Process each error pattern and replace it
    for pattern in error_patterns:
        mask = cleanup_df['explanation'].str.contains(pattern, na=False)
        #replace entire explanation with '' if error pattern is found
        cleanup_df.loc[mask, 'explanation'] = ''
    
    # Also handle missing values and convert severity_rating to proper numeric format where possible
    cleanup_df['severity_rating'] = pd.to_numeric(cleanup_df['severity_rating'], errors='coerce')
    return cleanup_df

In [84]:
ds_df = pd.read_csv("data/deepseek_extra_rows_500-1.csv")

In [85]:
ds_df = cleanup_df(ds_df)

In [92]:
ds_df['severity_rating'] = ds_df['severity_rating'].fillna('')

# get rows with '' in severity_rating  and explanation columns and save to new dataframe
missing_df = ds_df[(ds_df['severity_rating'] == '') & (ds_df['explanation'] == '')]
# drop severity_rating and explanation columns from missing_df
missing_df = missing_df.drop(columns=['severity_rating', 'explanation'])
missing_df.head()

Unnamed: 0,text,label
379,had the worst dream abt some turd face ex ugh ...,0
380,i posted on the self harm sub a to why you can...,1
381,not regularly but sometimes i experience an ex...,1
382,simon felice is no longer playing with the fel...,0
383,nobody ever speaks to me now,0


In [96]:
missing_df.to_csv("data/missing_df.csv", index=False)

In [95]:
bruh = get_depression_severity_openrouter(
    missing_df,
    model="deepseek/deepseek-chat:free",
    num_samples=missing_df.shape[0]  # Process all rows in the DataFrame
)
bruh.to_csv("data/2-deepseek_extra_rows_500.csv", index=False)

Processing samples:   0%|          | 0/121 [00:00<?, ?it/s]

2025-04-02 02:57:57,167 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-02 02:57:57,696 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 379: 'NoneType' object is not subscriptable


2025-04-02 02:57:58,213 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 380: 'NoneType' object is not subscriptable


2025-04-02 02:57:58,814 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 381: 'NoneType' object is not subscriptable


2025-04-02 02:57:59,319 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 382: 'NoneType' object is not subscriptable


2025-04-02 02:57:59,833 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 383: 'NoneType' object is not subscriptable


2025-04-02 02:58:00,342 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 384: 'NoneType' object is not subscriptable


2025-04-02 02:58:00,880 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 385: 'NoneType' object is not subscriptable


2025-04-02 02:58:01,381 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 386: 'NoneType' object is not subscriptable


2025-04-02 02:58:01,881 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 387: 'NoneType' object is not subscriptable


2025-04-02 02:58:02,404 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 388: 'NoneType' object is not subscriptable


2025-04-02 02:58:02,910 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 389: 'NoneType' object is not subscriptable


2025-04-02 02:58:03,419 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 390: 'NoneType' object is not subscriptable


2025-04-02 02:58:04,028 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 391: 'NoneType' object is not subscriptable


2025-04-02 02:58:04,553 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 392: 'NoneType' object is not subscriptable


2025-04-02 02:58:05,156 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 393: 'NoneType' object is not subscriptable


2025-04-02 02:58:05,679 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 394: 'NoneType' object is not subscriptable


2025-04-02 02:58:06,284 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 395: 'NoneType' object is not subscriptable


2025-04-02 02:58:06,792 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 396: 'NoneType' object is not subscriptable


2025-04-02 02:58:07,323 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 397: 'NoneType' object is not subscriptable
Error processing sample 398: 'NoneType' object is not subscriptable


2025-04-02 02:58:07,970 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-02 02:58:08,534 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 399: 'NoneType' object is not subscriptable


2025-04-02 02:58:09,060 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 400: 'NoneType' object is not subscriptable


2025-04-02 02:58:09,559 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 401: 'NoneType' object is not subscriptable


2025-04-02 02:58:10,074 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 402: 'NoneType' object is not subscriptable


2025-04-02 02:58:10,582 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 403: 'NoneType' object is not subscriptable


2025-04-02 02:58:11,081 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 404: 'NoneType' object is not subscriptable


2025-04-02 02:58:11,614 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 405: 'NoneType' object is not subscriptable


2025-04-02 02:58:12,126 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Error processing sample 406: 'NoneType' object is not subscriptable


KeyboardInterrupt: 

In [6]:
extra_rows_500 = pd.read_csv("data/extra_rows_500.csv")

In [7]:
if not os.path.exists("data/deepseek_extra_rows_500.csv"):
    # Get the depression severity ratings for the extra rows
    extra_rows_500_deepseek = get_depression_severity_openrouter(extra_rows_500, model="deepseek/deepseek-chat:free", num_samples=extra_rows_500.shape[0])
    extra_rows_500_deepseek.to_csv("data/deepseek_extra_rows_500.csv", index=False)
else:
    # Load the existing results
    print("Loading existing results...")
    extra_rows_500_deepseek = pd.read_csv("data/deepseek_extra_rows_500.csv")

Processing samples:   0%|          | 0/500 [00:00<?, ?it/s]

Error processing sample 180: 'NoneType' object is not subscriptable
Error processing sample 181: 'NoneType' object is not subscriptable
Error processing sample 182: 'NoneType' object is not subscriptable
Error processing sample 183: 'NoneType' object is not subscriptable
Error processing sample 184: 'NoneType' object is not subscriptable
Error processing sample 185: 'NoneType' object is not subscriptable
Error processing sample 186: 'NoneType' object is not subscriptable
Error processing sample 187: 'NoneType' object is not subscriptable
Error processing sample 188: 'NoneType' object is not subscriptable
Error processing sample 189: 'NoneType' object is not subscriptable
Error processing sample 190: 'NoneType' object is not subscriptable
Error processing sample 191: 'NoneType' object is not subscriptable
Error processing sample 192: 'NoneType' object is not subscriptable
Error processing sample 193: 'NoneType' object is not subscriptable
Error processing sample 194: 'NoneType' object i

In [21]:
deepseek_df = pd.read_csv("data/deepseek_extra_rows_500.csv")

# add more stuff from extra_rows_500 to deepseek_df
deepseek_df = pd.concat([deepseek_df, extra_rows_500], ignore_index=True)
# remove duplicates from deepseek_df
deepseek_df = deepseek_df.drop_duplicates(subset=['text'])
# reset index
deepseek_df = deepseek_df.reset_index(drop=True)


In [22]:
deepseek_df.shape[0]

500

In [25]:
# change NaN values to nothing in severity_rating and explanation columns
deepseek_df['severity_rating'] = deepseek_df['severity_rating'].fillna('')
deepseek_df['explanation'] = deepseek_df['explanation'].fillna('')

In [34]:
if not os.path.exists("data/deepseek_extra_rows_500-1.csv"):
    # Get the depression severity ratings for the extra rows
    extra_rows_500_deepseek = get_depression_severity_openrouter(deepseek_df, model="deepseek/deepseek-chat:free", num_samples=deepseek_df.shape[0])
    extra_rows_500_deepseek.to_csv("data/deepseek_extra_rows_500-1.csv", index=False)
else:
    # Load the existing results
    print("Loading existing results...")
    extra_rows_500_deepseek = pd.read_csv("data/deepseek_extra_rows_500-1.csv")

Preparing to process the first 500 rows.
Original 'severity_rating' value counts (including potential blanks):
severity_rating
       320
0.0     72
3.0     35
1.0     33
4.0     23
Name: count, dtype: int64

'severity_rating' value counts after replacing blanks and coercing:
severity_rating
NaN    320
0.0     72
3.0     35
1.0     33
4.0     23
Name: count, dtype: int64
Column dtype after coercion: float64

Target DataFrame size: 500
Indices needing processing: [180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 28

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_df['severity_rating'].replace(missing_values, np.nan, inplace=True)
  result_df['severity_rating'].replace(missing_values, np.nan, inplace=True)


Processing samples:   0%|          | 0/320 [00:00<?, ?it/s]

Error processing index 379 (API call failed): 'NoneType' object is not subscriptable
Error processing index 380 (API call failed): 'NoneType' object is not subscriptable
Error processing index 381 (API call failed): 'NoneType' object is not subscriptable
Error processing index 382 (API call failed): 'NoneType' object is not subscriptable
Error processing index 383 (API call failed): 'NoneType' object is not subscriptable
Error processing index 384 (API call failed): 'NoneType' object is not subscriptable
Error processing index 385 (API call failed): 'NoneType' object is not subscriptable
Error processing index 386 (API call failed): 'NoneType' object is not subscriptable
Error processing index 387 (API call failed): 'NoneType' object is not subscriptable
Error processing index 388 (API call failed): 'NoneType' object is not subscriptable
Error processing index 389 (API call failed): 'NoneType' object is not subscriptable
Error processing index 390 (API call failed): 'NoneType' object i

In [98]:
incomplete_500 = pd.read_csv("data/deepseek_extra_rows_500-1.csv")
remaining_df = pd.read_csv("data/remaining_missing_df_deepseek.csv")
final_df = pd.read_csv("data/deepseek-v3_results.csv")

In [99]:
# drop rows were severity_rating and explanation are empty in incomplete_500
incomplete_500 = incomplete_500[(incomplete_500['severity_rating'] != '') | (incomplete_500['explanation'] != '')]
# drop rows were severity_rating and explanation are empty in remaining_df
remaining_df = remaining_df[(remaining_df['severity_rating'] != '') | (remaining_df['explanation'] != '')]
# drop rows were severity_rating and explanation are empty in final_df
final_df = final_df[(final_df['severity_rating'] != '') | (final_df['explanation'] != '')]

In [100]:
# join all three dataframes on text column and keep all columns
final_df = pd.concat([incomplete_500, remaining_df, final_df], ignore_index=True)
# remove duplicates from final_df
final_df = final_df.drop_duplicates(subset=['text'])
# reset index
final_df = final_df.reset_index(drop=True)

In [101]:
final_df.shape[0]

525

In [102]:
final_df.to_csv("data/deepseek-v3_results-525.csv", index=False)