In [None]:
import kagglehub

# Ask kagglehub for the counseling-conversations dataset and keep whatever
# location it hands back in `path`.
dataset_handle = "thedevastator/nlp-mental-health-conversations"
path = kagglehub.dataset_download(dataset_handle)

# Echo the location so it is visible in the cell output.
print("Path to dataset files:", path)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the conversations CSV into a DataFrame.
# NOTE(review): this is a fixed relative path, not the `path` printed by the
# download cell — confirm the CSV has actually been placed under data/.
csv_file = 'data/counseling_conversations.csv'
df = pd.read_csv(csv_file)

# Quick structural overview: dimensions, column names, missing values per column.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Null values:\n{df.isnull().sum()}")

# Leave the first rows as the cell's displayed result.
df.head()

In [None]:
# Response length distribution: character count of every response.
# Rows whose 'Response' is missing get NaN here and are ignored by the
# histogram, mean, quantile and idxmin calls below.
df['response_length'] = df['Response'].str.len()
ax = df['response_length'].hist(bins=50)
ax.set(
    title="Response length distribution",
    xlabel="Response length (characters)",
    ylabel="Number of responses",
)

# Common patterns
print("Average response length:", df['response_length'].mean())
print("Sample responses by length:")
for percentile in [25, 50, 75]:
    # Length value at this percentile (a number of characters, not a row index).
    target_length = df['response_length'].quantile(percentile / 100)
    # idxmin() yields the index *label* of the row whose length is closest to
    # the target. It must be resolved with .loc: plain df[label] is a column
    # lookup and would raise KeyError.
    closest_label = (df['response_length'] - target_length).abs().idxmin()
    print(f"{percentile}th percentile example:")
    print(df.loc[closest_label, 'Response'][:200])

In [None]:
from sentence_transformers import SentenceTransformer

# Smoke test: embed a handful of 'Context' entries and inspect the result shape.
model_name = 'all-MiniLM-L6-v2'
sample_size = 5

model = SentenceTransformer(model_name)
# First `sample_size` rows of the 'Context' column as a plain Python list.
test_contexts = df['Context'].iloc[:sample_size].tolist()
embeddings = model.encode(test_contexts)
print(f"Embedding shape: {embeddings.shape}")