In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
import numpy as np


# Data

In [2]:
# Location of the combined statements/labels CSV, relative to this notebook.
file = '../raw_data/Combined Data.csv'

# The first CSV column is used as the row index rather than as a data column.
data_df = pd.read_csv(filepath_or_buffer=file, index_col=0)

# Preview the first ten rows.
data_df.head(n=10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [3]:
# overview
def get_df_info(df):
    """Print a structural overview of ``df`` and return its descriptive statistics.

    Printed sections (headings are wrapped in ANSI bold escape codes):
    shape, column names, dtypes, ``df.info()``, number of unique values per
    column, total memory usage in MB, null counts per column and the number
    of duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. Every figure is computed from this argument
        only; no notebook-level variable is read.

    Returns
    -------
    pandas.DataFrame
        ``df.describe().transpose()`` (one row per described column).
    """
    print("\n\033[1mShape of DataFrame:\033[0m ", df.shape)
    print("\n\033[1mColumns in DataFrame:\033[0m ", df.columns.to_list())
    print("\n\033[1mData types of columns:\033[0m\n", df.dtypes)

    print("\n\033[1mInformation about DataFrame:\033[0m")
    df.info()

    print("\n\033[1mNumber of unique values in each column:\033[0m")
    for col in df.columns:
        print(f"\033[1m{col}\033[0m: {df[col].nunique()}")

    # Fixed: the heading escape code was missing "1m" (it printed "[Memory:"),
    # and the figure was taken from the global data_df instead of df.
    print("\n\033[1mMemory:\033[0m\n", df.memory_usage().sum()/1_000_000, 'MB')

    print("\n\033[1mNumber of null values in each column:\033[0m\n", df.isnull().sum())

    print("\n\033[1mNumber of duplicate rows:\033[0m ", df.duplicated().sum())

    print("\n\033[1mDescriptive statistics of DataFrame:\033[0m\n",)

    return df.describe().transpose()

# Print the overview of the raw frame; the returned describe() table is the cell's displayed output.
get_df_info(data_df)


[1mShape of DataFrame:[0m  (53043, 2)

[1mColumns in DataFrame:[0m  ['statement', 'status']

[1mData types of columns:[0m
 statement    object
status       object
dtype: object

[1mInformation about DataFrame:[0m
<class 'pandas.core.frame.DataFrame'>
Index: 53043 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     53043 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB

[1mNumber of unique values in each column:[0m
[1mstatement[0m: 51073
[1mstatus[0m: 7

[Memory:[0m
 1.273032 MB

[1mNumber of null values in each column:[0m
 statement    362
status         0
dtype: int64

[1mNumber of duplicate rows:[0m  1944

[1mDescriptive statistics of DataFrame:[0m



Unnamed: 0,count,unique,top,freq
statement,52681,51073,what do you mean?,22
status,53043,7,Normal,16351


In [4]:
# cleaning (missing, duplicates)
# Drop rows containing missing values, then exact duplicate rows (first
# occurrence is kept), and renumber the index from zero.
data_df = (
    data_df
    .dropna(axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f'shape of data after deleting null values and duplicates : {data_df.shape}')

shape of data after deleting null values and duplicates : (51093, 2)


# EDA

## distribution of labels

In [None]:
status_labels = data_df['status']

# Textual summary: the distinct labels and their share of the rows in percent.
print(f"\nUnique labels: {status_labels.unique()}")
print("\nLabel distribution(%):")
print(status_labels.value_counts(normalize=True) * 100)

# Bar chart of the absolute number of samples per label.
plt.figure(figsize=(10, 6))
status_labels.value_counts().plot.bar()
plt.xlabel('Mental Health Status')
plt.ylabel('Number of Samples')
plt.xticks(rotation=45)
plt.show()

## Text content

### function definitions

In [None]:
def create_word_count_distributions(df, statement_col:str, status_col:str, count_col:str, categories:list,x_lim: None|list = None):
    """Plot one histogram of ``count_col`` per category, with summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the labels and the per-row counts (raw or cleaned text).
    statement_col : str
        Name of the column containing the statements. Not read by this
        function; kept so all plotting helpers share the same signature.
    status_col : str
        Name of the column containing the labels (e.g. Anxiety, Normal).
    count_col : str
        Name of the numeric column to plot (word count or character count
        per row).
    categories : list
        Labels from ``status_col`` to plot, one subplot each.
    x_lim : list or None
        Optional ``[min_value, max_value]`` applied to every x axis.

    Notes
    -----
    The grid has three columns and as many rows as needed (at least two, to
    keep the original 2x3 layout for up to six categories). Previously the
    grid was fixed at six panels, so a seventh category was silently dropped.
    """
    categories = list(categories)

    # Size the grid from the number of categories so none is skipped.
    n_cols = 3
    n_rows = max(2, -(-len(categories) // n_cols))  # ceil division, minimum 2 rows
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, category in zip(axes, categories):
        # Counts for the rows carrying this label
        category_data = df.loc[df[status_col] == category, count_col]

        sns.histplot(category_data, ax=ax)
        ax.set_title(f'{category.title()} - {count_col} Distribution',
                     fontsize=14, fontweight='bold')
        ax.set_xlabel('Number of Words')
        ax.set_ylabel('Frequency')

        # Limit the x axis range when requested
        if x_lim is not None:
            ax.set_xlim(x_lim)

        # Summary statistics shown in a text box inside the panel
        mean_words = category_data.mean()
        max_words = category_data.max()
        min_words = category_data.min()

        stats_text = f'Mean: {mean_words:.1f}\nMax: {max_words}\nMin: {min_words}'
        ax.text(0.7, 0.9, stats_text, transform=ax.transAxes,
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
                verticalalignment='top', fontsize=10)

    # Hide panels that have no category assigned
    for unused_ax in axes[len(categories):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

def create_comparative_word_count_boxplot(df, statement_col:str, status_col:str,count_col:str, categories:list):
    """Draw box plots of ``count_col`` per category on a log-scaled y axis.

    A red dot marks the mean of each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the labels and the per-row counts (raw or cleaned text).
    statement_col : str
        Name of the column containing the statements. Not read by this
        function; kept so all plotting helpers share the same signature.
    status_col : str
        Name of the column containing the labels (e.g. Anxiety, Normal).
    count_col : str
        Name of the numeric column to compare (word count or character count
        per row).
    categories : list
        Labels from ``status_col`` to show, in the order they should appear.

    Notes
    -----
    ``categories`` now controls which boxes are drawn and their order (it was
    previously ignored by the box plot), and the mean markers are placed at
    the same positions as the boxes instead of relying on the implicit
    ordering of a string-valued x axis. A category with no rows in ``df``
    gets no mean marker instead of raising ``KeyError``.
    """
    order = list(categories)

    plt.figure(figsize=(14, 8))

    # Boxes are restricted to, and ordered by, the requested categories.
    sns.boxplot(data=df, x=status_col, y=count_col, order=order, palette='Set2')
    plt.title('Word Count Distribution Comparison Across Categories',
              fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Category', fontsize=12)
    plt.ylabel('Number of Words', fontsize=12)
    plt.yscale('log')
    plt.xticks(rotation=45)

    # Mean markers: seaborn places the k-th entry of `order` at x position k,
    # so the means are plotted at 0..n-1 in that same order. reindex() yields
    # NaN (nothing drawn) for a category that has no rows.
    means = df.groupby(status_col)[count_col].mean().reindex(order)
    plt.scatter(list(range(len(order))), means.to_numpy(),
                color='red', s=100, marker='o', zorder=5)

    plt.tight_layout()
    plt.show()

def get_top_words_per_category(df, statement_col:str, status_col:str, categories:list, top_n=20) -> dict:
    """Count the most frequent words per category, print them and return them.

    Words are the whitespace-separated tokens of each statement, counted
    exactly as written (no lower-casing, punctuation stripping or stop-word
    removal is applied here).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the statements and labels (raw or cleaned text).
    statement_col : str
        Name of the column containing the statements.
    status_col : str
        Name of the column containing the labels (e.g. Anxiety, Normal).
    categories : list
        Labels from ``status_col`` to analyse.
    top_n : int
        Number of most frequent words to report per category.

    Returns
    -------
    dict
        ``{category: {word: count, ...}, ...}`` holding the ``top_n`` most
        common words of each category.

    Notes
    -----
    Missing statements are skipped and remaining values are converted to
    ``str``, so a stray NaN no longer raises ``TypeError``.
    """
    all_category_words = {}

    print("="*80)
    print("TOP WORDS ANALYSIS BY CATEGORY")
    print("="*80)

    for category in categories:
        print(f"\n--- TOP {top_n} WORDS FOR {category.upper()} ---")

        # Statements for this category, without missing values
        category_texts = (
            df.loc[df[status_col] == category, statement_col]
            .dropna()
            .astype(str)
        )

        # Count tokens statement by statement; this gives the same counts as
        # joining everything with spaces first, without building one huge string.
        word_counts = Counter()
        for text in category_texts:
            word_counts.update(text.split())

        top_words = word_counts.most_common(top_n)

        # Store results
        all_category_words[category] = dict(top_words)

        # Display results
        for i, (word, count) in enumerate(top_words, 1):
            print(f"{i:2d}. {word:<15} ({count:,} times)")
        print(f"Total unique words in {category}: {len(word_counts):,}")
        print(f"Total words analyzed: {sum(word_counts.values()):,}")

    return all_category_words

def plot_cloud_of_words(df,statement_col:str, status_col:str,category:str):
    """Draw a word cloud built from all statements of one category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the statements and labels (raw or cleaned text).
    statement_col : str
        Name of the column containing the statements.
    status_col : str
        Name of the column containing the labels (e.g. Anxiety, Normal).
    category : str
        The label from ``status_col`` whose statements are visualised.

    Notes
    -----
    The drawing code now lives inside the function. It used to sit at module
    level, where it ran once at definition time and failed because
    ``cloud_of_words`` only exists inside this function. The statements are
    also taken from ``df`` rather than from the notebook-level ``data_df``.
    """
    # Statements of the requested category, joined into one text
    category_texts = df[df[status_col] == category][statement_col]
    combined_text = ' '.join(category_texts.astype(str))

    cloud_of_words = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=100,
            colormap='viridis',  # You can change colors: 'plasma', 'inferno', 'magma'
            relative_scaling=0.5,
            min_font_size=10
        ).generate(combined_text)

    fig, axes = plt.subplots(figsize=(15,8))
    axes.imshow(cloud_of_words, interpolation='bilinear')
    axes.set_title(f'{category.title()} - Most Common Words',
                   fontsize=20, fontweight='bold', pad=20)
    axes.axis('off')
    plt.show()

In [None]:
statement_text = data_df['statement']

# Characters per statement
data_df['num_of_charac'] = statement_text.str.len()
# Whitespace-separated tokens per statement
data_df['num_of_words'] = statement_text.str.split().str.len()
data_df

In [None]:
# Smallest, largest and average word count per label
(
    data_df
    .groupby('status')['num_of_words']
    .agg(['min', 'max', 'mean'])
)

In [None]:
# calling different functions
create_word_count_distributions(
    data_df,
    statement_col='statement',
    status_col='status',
    count_col='num_of_words',
    categories=data_df['status'].unique(),
    x_lim=[-10, 1000],
)

In [None]:
# Compare word counts across all labels
create_comparative_word_count_boxplot(
    data_df,
    statement_col='statement',
    status_col='status',
    count_col='num_of_words',
    categories=data_df['status'].unique(),
)

# Five most frequent whitespace-separated tokens per label
top_words_by_category = get_top_words_per_category(
    data_df,
    statement_col='statement',
    status_col='status',
    categories=data_df['status'].unique(),
    top_n=5,
)


In [None]:
plot_cloud_of_words(
    data_df,
    statement_col='statement',
    status_col='status',
    category='Anxiety',
)