In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import classification_report, silhouette_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

from scipy.stats import chi2_contingency, multinomial

import nltk
import re
nltk.download('punkt')
from nltk.tokenize import word_tokenize, RegexpTokenizer, regexp_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, LancasterStemmer, PorterStemmer
from nltk.probability import FreqDist
from nltk import WordNetLemmatizer, pos_tag
from nltk.util import bigrams
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
#nltk.download('all')
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.metrics import BigramAssocMeasures

import itertools
import string
import contractions

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from copy import deepcopy

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Loading the Data

Let's load our dataset and take a look at it.

In [2]:
data = pd.read_csv('data/Combined Data.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


Looks like we have some NaNs. Let's take a closer look.

In [5]:
data[pd.isna(data['statement'])]

Unnamed: 0.1,Unnamed: 0,statement,status
293,293,,Anxiety
572,572,,Anxiety
595,595,,Anxiety
1539,1539,,Normal
2448,2448,,Normal
...,...,...,...
52838,52838,,Anxiety
52870,52870,,Anxiety
52936,52936,,Anxiety
53010,53010,,Anxiety


There are so few NaN's compared to the number of rows in our entire dataframe. Furthermore, rows with no 'statement' value are useless to us. Let's drop them.

In [6]:
data = data.dropna(subset=['statement'])

Now let's check if there are duplicate rows. 

In [7]:
data['statement'].nunique()

51073

It looks like there are!

In [8]:
data[data['statement'].duplicated()]

Unnamed: 0.1,Unnamed: 0,statement,status
97,97,"""No regrets or grudges/angry at things that ha...",Anxiety
138,138,but my heart is still restless even though my ...,Anxiety
167,167,I want to exhale the restlessness in my chest ...,Anxiety
228,228,Do not compare yourself to others. Envy only m...,Anxiety
244,244,"people seem calm, happy like there's no proble...",Anxiety
...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


These statements all look pretty specific and personalized. AKA it does seem like our duplicate values are true duplicates of the same social media statements made by the same singular user. Furthermore, we have over 50,000 rows in our dataset and can afford to lose these rows. 

Let's drop duplicates as well.

In [9]:
# Removing 'statement' duplicates, keeping first instance
data = data.drop_duplicates(subset=['statement'], keep='first')

In [10]:
data[data['statement'].duplicated()]

Unnamed: 0.1,Unnamed: 0,statement,status


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51073 entries, 0 to 52840
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  51073 non-null  int64 
 1   statement   51073 non-null  object
 2   status      51073 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


Furthermore, our 'Unnamed: 0' column appears to just be a duplicate of our index. Let's explore. 

In [12]:
data['Unnamed: 0'].nunique()

51073

In [13]:
# Checking if 'Unnamed: 0' equals our index
is_identical = data['Unnamed: 0'].equals(data.index)
is_identical

False

Hm...We can safely assume this column doesn't provide information about multiple 'statement' values being generated by the same singular user...because our nunique() matches the total number of rows in our dataframe, which eliminates the possibility of duplicates. 

This column is most likely just an old index from previous datasets, which is no longer useful to use. We'll drop it for now. We can always go back and edit the code if we decide we need it later.

In [14]:
data.drop(columns=['Unnamed: 0'], inplace=True)

Let's look at the value_counts() for our target variable 'status'.

In [15]:
data['status'].value_counts()

status
Normal                  16039
Depression              15087
Suicidal                10641
Anxiety                  3617
Bipolar                  2501
Stress                   2293
Personality disorder      895
Name: count, dtype: int64

#### 'Personality disorder'

'Personality disorder' is an ambiguous label compared to the others. It might be a version of a 'placeholder' value for not 'Normal' users where further categorization of disorder (ex. 'Depression', 'Suicidal', 'Anxiety') was inconclusive. Alternatively, it could be indication of other, entirely different personality disorders not listed above. 

We'll keep this in mind and explore as we go. Depending on what we find, it might be helpful to drop these values when building a predictive model.

# Preprocessing

## Prior to Cleaning...

We'll want to clean the text in our data by performing operations that remove punctuation and special characters, lowercase text, remove newline '\n' characters, etc.

However, ***before*** we do this...let's look at whether or not we can use any of our 'pre-cleaned' text to create features that might be useful down the road.

#### Percentage Upper-Case

Let's think about whether or not any unique punctuation and/or text characteristics might be indicative of a personality disorder. 

*Upper-case* text might help identify whether or not a person is in some kind of distress. If user posts 'A MESSAGE ENTIRELY IN UPPER-CASE LIKE SO', that's an unusual behavior that we should try and quantify.

Let's create a new column 'perc_upper' that calculates the percentage of upper-case letters to the total number of letters in each 'statement'.

In [16]:
# Define function to calculate percentage of upper-case letters
def perc_upper(text):
    if not isinstance(text, str):
        return 0
    letters = re.findall(r'[a-zA-Z]', text) # Using regex to isolate letters
    if not letters: 
        return 0
    upper_count = len(re.findall(r'[A-Z]', text))
    return (upper_count / len(letters)) * 100

data['perc_upper'] = data['statement'].apply(perc_upper)

In [17]:
upper_sorted = data.sort_values(by='perc_upper', ascending=False)
upper_sorted.head()

Unnamed: 0,statement,status,perc_upper
3546,[HELP RT] WE FANBASE SHAKE RP! JOIN? FOLLOW FI...,Normal,100.0
2436,ONAKA GA SUITA,Normal,100.0
6787,TODAY NO CLASS YAAYYY,Normal,100.0
5303,6 HALF HOURS AGAINIII ULULU I ​​WANT TO SLEEP ...,Normal,100.0
2478,I DON'T HAVE A HOLIDAY AS WELL AS EVIL,Normal,100.0


#### METRIC CHANGE - Percentage by Upper-Case Words

On second thought, we should find a different way to quantify a notable amount of upper-case in a string of text. With our current 'perc_upper' value...a statement such as 'Hi' will have a 50% perc_upper. However this would be due to normal grammatical capitalization techniques that are of no note.

Let's alter our metric slightly, to calculate the **percentage of upper-case words compared to the total number of words in a statement**.

We'll stick with regex for now to tokenize our words and calculate this metric.

In [18]:
# Dropping perc_upper column before to reduce computation time
data.drop(columns=['perc_upper'], inplace=True)

# Updating perc_upper to calculate based on number of words
def perc_upper_words(text): 
    if not isinstance(text, str):
        return 0
    words = re.findall(r'\b[a-zA-Z\'-]+\b', text) # Including apostrophes and hyphens
    words = [word for word in words if not re.search(r'\d', word)] # Filter out numbers
    if not words:
        return 0
    uppercase_words = [word for word in words if word.isupper()]
    return len(uppercase_words) / len(words) * 100

data['perc_upper_words'] = data['statement'].apply(perc_upper_words)

In [19]:
upper_sorted = data.sort_values(by='perc_upper_words', ascending=False)
upper_sorted.head()

Unnamed: 0,statement,status,perc_upper_words
11210,I KEEP MESSING THINGS UP ALL DAY EVERY DAY THE...,Suicidal,100.0
5917,#PECAT WHOSE NAME IS SI ALI MOCHTAR NYEBELIN #,Normal,100.0
4173,OH MY GOD FEAR CANCEL,Normal,100.0
1581,HIS BD CARD KANON IS CUTE HSHSHSHS SAD NO FREE...,Normal,100.0
2811,YAALLAH SO NATION OF SM DREAMIES,Normal,100.0


In [20]:
# Sampling rows from the middle
filtered_rows = upper_sorted[(upper_sorted['perc_upper_words'] >= 50.0) & (upper_sorted['perc_upper_words'] <= 60.0)]
filtered_rows.head(20)

Unnamed: 0,statement,status,perc_upper_words
10953,NOBODY WANTS TO TALK TO ME Going to kill myself,Depression,60.0
498,"the habit of chatting for a day, gadicchat imm...",Anxiety,60.0
17851,I feel like I cannot even explain myself becau...,Suicidal,60.0
4525,NOT FUNNY MORNING â€ Runny nose,Normal,60.0
24841,I try my best but I just want to die. I hate m...,Depression,59.42029
6357,HAHAAAA GET THE ASSIGNMENT OF MAKING A SCIENTI...,Normal,57.894737
1113,WANT TO SIN BUT Afraid to Laugh,Normal,57.142857
36232,Everything is temporary. EVIL EYES OFF SHIVRIT,Normal,57.142857
830,MORNING AGAIN I've made a AMENDE SPACE,Normal,57.142857
48673,AAAAAAAAAAAAAAAAAA aaAAAAAAAAAA FIDBWJSKLA oaa...,Stress,57.142857


Let's see if there are any trends regarding the distribution of these numbers within each of our 'status' values.

In [21]:
upper_stats = data.groupby('status')['perc_upper_words'].agg(['mean', 'min', 'max'])
upper_stats

Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,5.511061,0.0,100.0
Bipolar,5.978278,0.0,25.0
Depression,5.66738,0.0,100.0
Normal,3.903474,0.0,100.0
Personality disorder,5.424887,0.0,27.118644
Stress,5.664143,0.0,57.142857
Suicidal,7.922907,0.0,100.0


Interesting. 'Normal' has the lowest average perc_upper_words with a mean of 3.90, whereas 'Suicidal' has the highest with a mean of 7.92.

Let's see if there are any noticeable trends for the same metric applied to lower-case words. A higher percentage of lower-case words might be indicative of disorders like 'Depression'.

In [22]:
# Updating perc_upper to calculate based on number of words
def perc_lower_words(text): 
    if not isinstance(text, str):
        return 0
    words = re.findall(r'\b[a-zA-Z\'-]+\b', text) # Including apostrophes and hyphens
    words = [word for word in words if not re.search(r'\d', word)] # Filter out numbers
    if not words:
        return 0
    lowercase_words = [word for word in words if word.islower()]
    return len(lowercase_words) / len(words) * 100

data['perc_lower_words'] = data['statement'].apply(perc_lower_words)

In [23]:
lower_stats = data.groupby('status')['perc_lower_words'].agg(['mean', 'min', 'max'])
lower_stats

Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,87.107075,0.0,100.0
Bipolar,86.984639,25.0,100.0
Depression,90.887056,0.0,100.0
Normal,89.098266,0.0,100.0
Personality disorder,87.738038,45.454545,100.0
Stress,87.853044,14.285714,100.0
Suicidal,87.43036,0.0,100.0


'Depression' does indeed have the highest mean compared with our other 'status' values. 

However, typing in all lower-case is far more socially "standard" than typing in caps-lock. People often remove automatic capitilization as a setting on their phone or computer, which sets their default text to all lower-case. 

Given that lower-case typing is a societal 'norm', and a personal preference people often make regardless of psychological state...the 'lower_stats' numbers above do not vary drastically between 'status' groups enough for us to be able to use this as a predictive metric. 

We'll drop it and just keep perc_upper_words. 

In [24]:
data.drop(columns=['perc_lower_words'], inplace=True)

#### Special Characters

Let's explore if it will be useful to apply similar logic to special characters and punctuation. Let's examine all the different special characters in our entire dataframe, excluding numbers.

In [25]:
# Searching for special characters, excluding numbers
def extract_special(text):
    if not isinstance(text, str):
        return ''
    return re.findall(r'[^\w\s\d]', text)

special_chars = data['statement'].apply(extract_special).explode().unique()

In [26]:
special_chars

array([nan, ',', '.', "'", '?', ':', ')', '(', '[', ']', '™', '‚', '=',
       '"', '/', '#', '!', '️', '±', '-', '…', '*', '”', '&', '€', '„',
       '«', '–', '¤', '<', '°', '»', '^', ';', '\u200b', '“', '+', '˜',
       '‹', '¥', '©', '£', '@', '~', '¸', '—', '$', '§', '•', '¡', '❤',
       '¨', '>', '‡', '|', '¯', '%', '¦', '{', '}', '’', '\\', '´', '`',
       '†', '¶', '·', '®', '¢', '‰', '¿', '¬', '›', '👩', '\u200d', '🎓',
       '😇', '\u200e', '➡', '💸', '😔', '🙂', '😦', '‘', '💕', '●', '🐰', '🥕',
       '💖', '😱', '😐', '✌', '🏻', '🙃', '☹', '😅', '😢', '😂', '😩', '😖', '↑',
       '👍', '🏼', '😭', '🤷', '♂', '⛅', '🛑', '🤦', '♀', '🙄', '😕', '😠', '😫',
       '😁', '\ufeff', '😒', '💩', '😜', '😨', '😆', '🏽', '😓', '\x80', '\x99',
       '\x9f', '\x98', '\xad', '\x92', '\x8f', '\x8b', '\x9c', '\x9d',
       '\x8d', '\x87', '\x82', '\x8a', '\x81', '\x8c', '\x91', '\x94',
       '\x84', '\x95', '\x96', '\x89', '\x83', '\x88', '\x97', '\x90',
       '\x9a', '\x8e', '\U0001fae0', '🙏', '🔃', '💀', '🤣', '\U0001f

#### Emojis

Let's try and categorize the emojis used. There are few enough that we can manually create lists identifying emojis we deem to carry clear 'positive' or 'negative' sentiments. We'll ignore any "neutral" emojis: we're about to get into more detailed and layered analysis of the actual text in a bit, so we don't need to waste time getting *too* intricate with these additional features!

If we have to choose one or the other, we'll probably choose **negative** emojis as our main predictive metric (with regards to emoji sentiments)...since positive emojis could be used sarcastically. 

The sarcastic use of positive "sentiments" to indicate negativity is far more common than the sarcastic use of negative sentiments to indicate positivity. 

##### Varying Cultural Expression

There can be differences in a person's interpretation and use of emojis when conveying sentiments, based on a variety of cultural factors. We don't know enough about the people who created this dataset, nor do we have demographic information on the users in the dataset itself, to make those determinations...We'll categorize emoji sentiments through our own cultural lens, and see how well our version of this metric helps or hurts our predictive model.

In [27]:
pos_emojis = ['😇', '💖', '🙂', '💕', '❤', '😅', '😂', '👍', '😁', '😆', '🤣', '🥰', '💜', '🤩', '🖤', '😌', '🥲', '🤍','💚','😊',
              '💗', '😍', '🧡', '💝', '♥', '💛', '🤗', '😀', '😹', '😏']

neg_emojis = ['😔', '😦', '😐', '😢', '😩', '😖', '😭', '🤦', '🙄', '😕', '😠', '😫', '😒', '😨', '😓', '😮', '😞', '😳', '😥',
              '😑', '🙁', '😣', '😪', '🤕', '💔']

##### Binary Classification

As we stated, we don't need to get too intricate. Let's create a binary classifier to simply determine whether a 'statement' has any number of positive or negative emojis in it's text. 

In [28]:
# 1 if any positive emojis in 'statement', else 0
def has_pos_emoji(text):
    if not isinstance(text, str):
        return 0
    return 1 if any(emoji in text for emoji in pos_emojis) else 0

data['pos_emoji'] = data['statement'].apply(has_pos_emoji)

In [29]:
# Same for negative emojis
def has_neg_emoji(text):
    if not isinstance(text, str):
        return 0
    return 1 if any(emoji in text for emoji in neg_emojis) else 0

data['neg_emoji'] = data['statement'].apply(has_neg_emoji)

In [30]:
# Define a function to count rows with certain values
def count_rows(df, pos_val, neg_val):
    filtered_df = df[(df['pos_emoji'] == pos_val) & (df['neg_emoji'] == neg_val)]
    return filtered_df.groupby('status').size()

print("Number of Rows with POSITIVE Emojis:")
print(count_rows(data, 1, 0))
print("")
print("Number of Rows with NEGATIVE Emojis:")
print(count_rows(data, 0, 1))
print("")
print("Number of Rows with BOTH:")
print(count_rows(data, 1, 1))

Number of Rows with POSITIVE Emojis:
status
Anxiety                 39
Bipolar                 17
Depression              10
Normal                  15
Personality disorder    22
Stress                   7
dtype: int64

Number of Rows with NEGATIVE Emojis:
status
Anxiety                 40
Bipolar                 12
Depression               8
Personality disorder    14
Stress                  13
dtype: int64

Number of Rows with BOTH:
status
Anxiety                 2
Depression              2
Personality disorder    4
dtype: int64


We were correct in assuming that the use of negative emojis might be a more telling metric than the use of positive emojis. There are no 'Normal' status users that have used any negative emojis whatsoever. The results for positive emojis are more ambiguous. Let's drop that column.

(There are a suprisingly few amount of rows in our dataset that contain emojis. Our neg_emoji column is probably useless as well, but we'll keep it for right now).

In [31]:
data.drop(columns=['pos_emoji'], inplace=True)

#### ! and ?

The last notable special characters we might want to document prior to text cleaning are exclamation points ('!') and question marks ('?'). These are frequently used to convey emotional sentiment, especially distress when used repeatedly (ex. '!!!!!', '????', '!?!?!?!?'). 

Let's add columns counting the number of times a person uses these in a 'statement'.

**Normalization might not be useful here**. Using special characters such as '!' and '?' more than once is already noteworthy, especially in brief statements made on social media. Identifying an excess count, regardless of statement length, is probably our best metric. 

We'll try normalizing first, to see what the numbers look like.

In [32]:
# Function to count characters excluding whitespace and calculate proportion of 'char'
def calculate_char_ratio(text, char):
    if not isinstance(text, str):
        return 0

    # Remove whitespace and count non-whitespace characters
    non_whitespace_text = re.sub(r'\s+', '', text)
    total_characters = len(non_whitespace_text)
    
    # Count occurrences of 'char'
    char_count = text.count(char)
    
    # Calculate proportion
    if total_characters == 0:
        return 0
    ratio = char_count / total_characters
    
    return ratio

# Apply the function to calculate proportions for '!' and '?'
data['exclamation_ratio'] = data['statement'].apply(lambda x: calculate_char_ratio(x, '!'))
data['question_ratio'] = data['statement'].apply(lambda x: calculate_char_ratio(x, '?'))

In [33]:
# Printing stats by 'status' group
exclamation_stats = data.groupby('status')['exclamation_ratio'].agg(['mean', 'min', 'max'])
print("Ratio of Exclamation Points by Status")
exclamation_stats

Ratio of Exclamation Points by Status


Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,0.000642,0.0,0.157895
Bipolar,0.000562,0.0,0.056604
Depression,0.000236,0.0,0.138298
Normal,0.002042,0.0,0.740741
Personality disorder,0.000318,0.0,0.026157
Stress,0.000396,0.0,0.031008
Suicidal,0.000417,0.0,0.246154


In [34]:
question_stats = data.groupby('status')['question_ratio'].agg(['mean', 'min', 'max'])
print("Ratio of Question Marks by Status")
question_stats

Ratio of Question Marks by Status


Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,0.002883,0.0,0.1
Bipolar,0.003458,0.0,0.090909
Depression,0.001452,0.0,0.166667
Normal,0.005444,0.0,0.466667
Personality disorder,0.003138,0.0,0.0625
Stress,0.001802,0.0,0.210811
Suicidal,0.001741,0.0,0.111111


Interesting. Just like we said, normalizing to account or statement length might not be ideal here. Our non-normalized counts might be easier to interpret. 

Let's run the same for merely the count of each character, not the ratio proportional to the total number of characters in the statement.

In [35]:
# Function to count occurrences of '!'
def count_exclamation(text):
    if not isinstance(text, str):
        return 0
    return text.count('!')

# Add 'exc_count' column to the DataFrame
data['exclamation_count'] = data['statement'].apply(count_exclamation)

# Printing stats by 'status' group
exccount_stats = data.groupby('status')['exclamation_count'].agg(['mean', 'min', 'max'])
print("Exclamation Point Count by Status")
exccount_stats

Exclamation Point Count by Status


Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,0.301355,0,27
Bipolar,0.301479,0,9
Depression,0.110824,0,39
Normal,0.098635,0,20
Personality disorder,0.178771,0,26
Stress,0.176625,0,14
Suicidal,0.116906,0,58


In [36]:
# Function to count occurrences of '!'
def count_question(text):
    if not isinstance(text, str):
        return 0
    return text.count('?')

# Add 'exc_count' column to the DataFrame
data['question_count'] = data['statement'].apply(count_question)

# Printing stats by 'status' group
questcount_stats = data.groupby('status')['question_count'].agg(['mean', 'min', 'max'])
print("Question Mark Count by Status")
questcount_stats

Question Mark Count by Status


Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,1.056953,0,15
Bipolar,1.565374,0,20
Depression,0.652151,0,47
Normal,0.170896,0,8
Personality disorder,1.264804,0,22
Stress,0.648059,0,39
Suicidal,0.616389,0,34


Yes!! These numbers are far more useful, and give us interesting insight that aligns with what we know about personality disorders. It makes a lot of sense that 'Suicidal' would have the highest maximum value of exclamation points...since an excess of exclamation points typically indicates high levels of distress or excitement. It also makes sense that 'Normal' has the lowest maximum value with regards to question mark counts...for similar reasons.

We won't want to jump to concrete conclusions based on personal perception and prior knowledge...we'll trying running our model and see how well these columns actually correlate to and are predictive of 'status'!

We'll keep these two new count features for now and drop our ratio columns. 

In [37]:
data.drop(columns=['exclamation_ratio', 'question_ratio'], inplace=True)

In [38]:
# Checking value_counts to see how useful new columns are
data['question_count'].value_counts()

question_count
0     35919
1      8260
2      3689
3      1627
4       680
5       354
6       212
7       116
8        66
9        39
10       31
11       24
12       12
14        9
16        7
13        7
15        3
17        2
18        2
21        2
19        2
22        2
23        2
20        2
31        1
47        1
34        1
39        1
Name: count, dtype: int64

In [39]:
data['exclamation_count'].value_counts()

exclamation_count
0     47503
1      2237
2       666
3       279
4       164
5        72
6        46
8        27
7        19
9        15
14        7
10        7
12        7
13        5
16        4
11        2
24        2
15        2
20        2
17        1
18        1
58        1
19        1
39        1
26        1
27        1
Name: count, dtype: int64

## Text Cleaning

Now let's clean our text by making everything lower-case, removing special characters, etc.

##### Contractions

We could include a step in our cleaning that prepares our text to analyze contractions (ex. "don't", "aren't", "they're") as *bigrams*. However there aren't any contraction words that are particuarly indicative of sentiment, especially with regards to personality disorders. We want to be detailed, without wasting time - so we will expand our contractions, but won't worry about providing any further analysis on them.

In [40]:
# Expanding contractions
data['statement'] = data['statement'].apply(lambda x: contractions.fix(x))

In [41]:
# Define function to remove emojis
def remove_emojis(text):
    # Regex pattern to match emojis (covers a wide range of emojis)
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F700-\U0001F77F'  # alchemical symbols
        '\U0001F780-\U0001F7FF'  # Geometric Shapes Extended
        '\U0001F800-\U0001F8FF'  # Supplemental Arrows-C
        '\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
        '\U0001FA00-\U0001FA6F'  # Chess Symbols
        '\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
        '\U00002700-\U000027BF'  # Dingbats
        '\U000024C2-\U0001F251' 
        ']+', 
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Define function to preprocess text
def clean_text(text):
    if not isinstance(text, str):
        return""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove links
    text = re.sub(r'[^\w\s]', '', text) # Remove special char's & punct
    text = re.sub(r'\[|\]', '', text)  # Remove square brackets
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'\n', '', text)  # Remove newlines
    text = remove_emojis(text) # Apply emoji function from above
    return text

data['cleaned_statement'] = data['statement'].apply(lambda x: clean_text(x))

In [42]:
data.head()

Unnamed: 0,statement,status,perc_upper_words,neg_emoji,exclamation_count,question_count,cleaned_statement
0,oh my gosh,Anxiety,0.0,0,0,0,oh my gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,0.0,0,0,0,trouble sleeping confused mind restless heart ...
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,0.0,0,0,0,all wrong back off dear forward doubt stay in ...
3,I have shifted my focus to something else but ...,Anxiety,0.0,0,0,0,i have shifted my focus to something else but ...
4,"I am restless and restless, it is been a month...",Anxiety,0.0,0,0,1,i am restless and restless it is been a month ...


## Tokenization, Stop Words, Lemmatization

Let's tokenize our data, remove stopwords, then use lemmatization. 

### 'Not'

Once we went ahead with tokenization, removal of stopwords, lemmatizaion, and printing a frequency distribution...some of the most-frequent tokens were words such as 'like', 'want' and 'know'...the meaning and interpretation of these words can be drastically altered depending on whether or not the word 'not' comes before them.

Let's alter our code to make sure **'not'** is *excluded* from the stop words we remove. Then we can create ***bigrams*** such as ('not', 'like') and ('not', 'want') that might help us contextualize our tokens more specifically. We'll wait to revisit our bigrams later on.

In [43]:
# Creating list of stop words
stopwords_list = stopwords.words('english')

# Excluding 'not'
important_words = ['not']

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define function to tokenize, remove stopwords, and lemmatize
def tokenize_stopwords_lemmatize(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [w for w in tokens if w not in stopwords_list or w in important_words]
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in cleaned_tokens]
    return lemmatized_tokens

# Tokenize
data['tokens'] = data['cleaned_statement'].apply(tokenize_stopwords_lemmatize)

In [44]:
data.head()

Unnamed: 0,statement,status,perc_upper_words,neg_emoji,exclamation_count,question_count,cleaned_statement,tokens
0,oh my gosh,Anxiety,0.0,0,0,0,oh my gosh,"[oh, gosh]"
1,"trouble sleeping, confused mind, restless hear...",Anxiety,0.0,0,0,0,trouble sleeping confused mind restless heart ...,"[trouble, sleeping, confused, mind, restless, ..."
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,0.0,0,0,0,all wrong back off dear forward doubt stay in ...,"[wrong, back, dear, forward, doubt, stay, rest..."
3,I have shifted my focus to something else but ...,Anxiety,0.0,0,0,0,i have shifted my focus to something else but ...,"[shifted, focus, something, else, still, worried]"
4,"I am restless and restless, it is been a month...",Anxiety,0.0,0,0,1,i am restless and restless it is been a month ...,"[restless, restless, month, boy, mean]"


# Creating bigrams with 'not'

def bigrams_following_not(tokens):
    bigram_list = list(bigrams(tokens))
    filtered_bigrams = [bigram for bigram in bigram_list if bigram[0] == 'not']
    return filtered_bigrams

data['bigrams_following_not'] = data['tokens'].apply(bigrams_following_not)

# Combine all bigrams across DataFrame
all_bigrams = [bigram for bigram_list in data['bigrams_following_not'] for bigram in bigram_list]

# Remove 'not' from tokens, now that we've made bigrams
def remove_not(tokens):
    return [token for token in tokens if token != 'not']

# Apply function
data['tokens'] = data['tokens'].apply(remove_not)

## Frequency Distribution by 'Status' Group

Let's plot our frequency distributions for each 'status' group, to see which words are most commonly used for each personality disorder. 

We'll want to make use of common bigrams, to see how they compare to our single-word tokens with regards to frequency. We tried combining our tokens with common bigrams and plotting the frequency of both for each status...however our bigrams didn't seem to have as common frequency distribution, and therefore didn't appear on our graphs. 

We'll plot them separately for now, and keep the idea of combining their distributions in the back of our minds.

In [45]:
# Group by 'status' and aggregate tokens
status_groups = data.groupby('status')['tokens'].sum()

# Function to get the most common words for each group
def get_most_common_words(tokens, num_common=10):
    freqdist = FreqDist(tokens)
    return freqdist.most_common(num_common)

# Apply frequency distribution calculation for each status
most_common_words_by_status = status_groups.apply(lambda x: get_most_common_words(x))

#### Normalize Frequency Distribution Counts

Because our dataframe is so large, the code takes quite a while to run. Therefore we decided it's better to test things out, then go back and edit our previous code...to decrease the size and computational runttime of our notebook. 

We tried normalizing our frequency distributions prior to plotting, since we saw in one of the first iterations of this code that they cover a wide range of numbers. We used *log normalization*, since our values cover a wide range and will also be susceptible to outliers.

However, our graphs with log normalization were not very useful to interpret. It might actually be better to get a sense of *how* varied the distribution counts are among words first, *then* normalize if that's helpful.

We will **not** normalize our counts for now. 

# Plotting
# Set up figure and axes
fig, axes = plt.subplots(nrows=7, figsize=(12, 12))

# Empty dict to hold words that have already been plotted and their colors
plotted_words_and_colors = {}
# Establish color palette to pull from
color_palette = sns.color_palette('cividis', n_colors=38)

# Creating a plot for each unique status
status_groups = data.groupby('status')
for idx, (status, status_df) in enumerate(status_groups):
    # Combine all tokens in the current status group
    all_tokens_in_status = status_df['tokens'].explode()
    
    # Calculate frequency distribution
    freq_dist = FreqDist(all_tokens_in_status)
    
    # Get the top 10 tokens and their frequencies
    top_10 = freq_dist.most_common(10)
    tokens, counts = zip(*top_10)
    
    # Select appropriate colors, reusing colors if tokens repeat
    colors = []
    for token in tokens:
        if token not in plotted_words_and_colors:
            new_color = color_palette.pop(0)
            plotted_words_and_colors[token] = new_color
        colors.append(plotted_words_and_colors[token])
    
    # Select axes, plot data, set title
    ax = axes[idx]
    bars = ax.bar(tokens, counts, color=colors)
    ax.set_title(f"Status: {status}")
    
    # Rotate x-axis labels
    ax.set_xticks(tokens)
    ax.set_xticklabels(tokens, rotation=45, ha='right')

fig.tight_layout()
plt.show()

## Dropping Common Tokens

We need to filter our tokens further. We have a lot of overlap of frequent tokens among our status groups which makes sense...tokens such as 'like' are often used as colloquial "filler-words" with no substantial meaning, tokens such as 'get', 'would', and 'know' *could* be useful...but they're also common words used in every-day speech that have no meaning out of context, *especially* when they also appear across all of our status groups.

We need to find a way to identify common tokens specific to a singular status group, that will help us better predict features unique to that status group alone.

Let's define a **threshold** the filters out tokens appearing a certain number of times across all 'statement's in our dataframe. We'll start with 70%, and go from there.

# Combine all tokens across the DataFrame
all_tokens = [token for token_list in data['tokens'] for token in token_list]

# Calculate frequency distribution of all tokens
freq_dist = FreqDist(all_tokens)

# Set a frequency threshold of 70%
threshold = 0.7 * len(data)

# Identify common tokens
common_tokens = {token for token, count in freq_dist.items() if count > threshold}

# Function to remove common tokens
def remove_common_tokens(tokens):
    return [token for token in tokens if token not in common_tokens]

# Apply filtering to the tokens column
data['filtered_tokens'] = data['tokens'].apply(remove_common_tokens)

# Group by 'status' and aggregate filtered tokens
status_groups_filtered = data.groupby('status')['filtered_tokens'].sum()

# Function to get the most common words for each group
def get_most_common_words(tokens, num_common=10):
    freqdist = FreqDist(tokens)
    return freqdist.most_common(num_common)

# Apply frequency distribution calculation for each status
most_common_words_by_status_filtered = status_groups_filtered.apply(lambda x: get_most_common_words(x))

# Set up figure and axes
fig, axes = plt.subplots(nrows=7, figsize=(12, 12))

# Empty dict to hold words that have already been plotted and their colors
plotted_words_and_colors = {}
# Establish color palette to pull from
color_palette = sns.color_palette('cividis', n_colors=38)

# Creating a plot for each unique status
for idx, (status, words_list) in enumerate(most_common_words_by_status_filtered.items()):
    # Get the top words and their frequencies
    top_words = words_list
    words, counts = zip(*top_words)
    
    # Select appropriate colors, reusing colors if words repeat
    colors = []
    for word in words:
        if word not in plotted_words_and_colors:
            new_color = color_palette.pop(0)
            plotted_words_and_colors[word] = new_color
        colors.append(plotted_words_and_colors[word])
    
    # Select axes, plot data, set title
    ax = axes[idx]
    ax.bar(words, counts, color=colors)
    ax.set_title(f"Status: {status}")
    
    # Rotate x-axis labels
    ax.set_xticks(range(len(words)))
    ax.set_xticklabels(words, rotation=45, ha='right')

fig.tight_layout()
plt.show()

This did seem to help a bit. For example, 'pain' was added to our top 10 most-common words for users with 'Anxiety', which is far more useful than the word 'like'. 

However we still have a lot of overlap. Let's adjust our threshold further and try 60%. We can always go back and adjust.

# Set a frequency threshold of 60%
threshold = 0.6 * len(data)

# Identify common tokens
common_tokens = {token for token, count in freq_dist.items() if count > threshold}

# Function to remove common tokens
def remove_common_tokens(tokens):
    return [token for token in tokens if token not in common_tokens]

# Apply filtering to the tokens column
data['filtered_tokens'] = data['tokens'].apply(remove_common_tokens)

# Group by 'status' and aggregate filtered tokens
status_groups_filtered = data.groupby('status')['filtered_tokens'].sum()

# Function to get the most common words for each group
def get_most_common_words(tokens, num_common=10):
    freqdist = FreqDist(tokens)
    return freqdist.most_common(num_common)

# Apply frequency distribution calculation for each status
most_common_words_by_status_filtered = status_groups_filtered.apply(lambda x: get_most_common_words(x))

# Set up figure and axes
fig, axes = plt.subplots(nrows=7, figsize=(12, 12))

# Empty dict to hold words that have already been plotted and their colors
plotted_words_and_colors = {}
# Establish color palette to pull from
color_palette = sns.color_palette('cividis', n_colors=38)

# Creating a plot for each unique status
for idx, (status, words_list) in enumerate(most_common_words_by_status_filtered.items()):
    # Get the top words and their frequencies
    top_words = words_list
    words, counts = zip(*top_words)
    
    # Select appropriate colors, reusing colors if words repeat
    colors = []
    for word in words:
        if word not in plotted_words_and_colors:
            new_color = color_palette.pop(0)
            plotted_words_and_colors[word] = new_color
        colors.append(plotted_words_and_colors[word])
    
    # Select axes, plot data, set title
    ax = axes[idx]
    ax.bar(words, counts, color=colors)
    ax.set_title(f"Status: {status}")
    
    # Rotate x-axis labels
    ax.set_xticks(range(len(words)))
    ax.set_xticklabels(words, rotation=45, ha='right')

fig.tight_layout()
plt.show()

Super helpful! Notice how 'anymore' was added to 'Suicidal', and 'friend' was added to 'Personality disorder'.

Let's try it one more time, and go quite extreme. We tried filtering again to 50%, and it didn't make much of a difference. Let's set our threshold at 35% as an experiment and see what happens.

# Set a frequency threshold of 40%
threshold = 0.35 * len(data)

# Identify common tokens
common_tokens = {token for token, count in freq_dist.items() if count > threshold}

# Function to remove common tokens
def remove_common_tokens(tokens):
    return [token for token in tokens if token not in common_tokens]

# Apply filtering to the tokens column
data['filtered_tokens'] = data['tokens'].apply(remove_common_tokens)

# Group by 'status' and aggregate filtered tokens
status_groups_filtered = data.groupby('status')['filtered_tokens'].sum()

# Function to get the most common words for each group
def get_most_common_words(tokens, num_common=10):
    freqdist = FreqDist(tokens)
    return freqdist.most_common(num_common)

# Apply frequency distribution calculation for each status
most_common_words_by_status_filtered = status_groups_filtered.apply(lambda x: get_most_common_words(x))

# Set up figure and axes
fig, axes = plt.subplots(nrows=7, figsize=(12, 12))

# Empty dict to hold words that have already been plotted and their colors
plotted_words_and_colors = {}
# Establish color palette to pull from
color_palette = sns.color_palette('cividis', n_colors=38)

# Creating a plot for each unique status
for idx, (status, words_list) in enumerate(most_common_words_by_status_filtered.items()):
    # Get the top words and their frequencies
    top_words = words_list
    words, counts = zip(*top_words)
    
    # Select appropriate colors, reusing colors if words repeat
    colors = []
    for word in words:
        if word not in plotted_words_and_colors:
            new_color = color_palette.pop(0)
            plotted_words_and_colors[word] = new_color
        colors.append(plotted_words_and_colors[word])
    
    # Select axes, plot data, set title
    ax = axes[idx]
    ax.bar(words, counts, color=colors)
    ax.set_title(f"Status: {status}")
    
    # Rotate x-axis labels
    ax.set_xticks(range(len(words)))
    ax.set_xticklabels(words, rotation=45, ha='right')

fig.tight_layout()
plt.show()

Super interesting and potentially very useful!

## Bigrams

Let's revisit bigrams. We kept the stop word 'not' in our tokens, in order to see whether it's particularly useful in defining the sentiment of a following word in a bigram. 

We can furthermore calculate the **Raw Frequency** of bigrams with other tokens in our dataset as well as the **Pointwise Mutual Information Score** between bigram pairs in our dataset. Mutual Information Score essentially tells us the mutual dependence between two words. 

We can then determine whether any other bigram pairs are important. 

We'll perform this on our original 'tokens' column, rather than a list of tokens filtered with a threshold. There may be some very common words that become unique to a particular status, when paired with another token in a bigram!

### Filtering out Noise

We ran the bigram codes on our 'status' groups and the results were very interesting! We then realized it might be helpful to filter out pairs of words that occur frequently across our entire dataset, regardless of 'status' group. The same way we eliminate stop words from our tokens prior to drawing any meaningful conclusions.

By first filtering out the most frequent bigrams in our dataset, we can help eliminate unecessary noise.

### 'Not' and 'Like'

To improve computational efficiency, we ran the below codes and have some interesting takeaways that we can apply here prior to re-running...which will make our results more useful and improve run-times. 

Our main takeaways are regarding the words **not** and **like**. 

#### Like

As we mentioned earlier, **like** is a pretty useless filler word that's only important if it's conveying some type of sentiment in it's verb form ("to like"). Once we looked at the most common bigrams, we found this was *not* the case in our dataset 

Ex. If the bigrams had included phrases such as 'i', 'like' or 'not', 'like' or 'you', 'like' in relation to specific 'status' groups, they might have been useful in predicting a person's mental state). These pairings did **not** appear in our results, so we can safely *drop the token 'like' from our entire dataframe*. 

#### Not

In [46]:
# Concatenate all tokens across the dataset
all_tokens = sum(data['tokens'], [])

# Create BigramCollocationFinder
finder = BigramCollocationFinder.from_words(all_tokens)

# Score bigrams by raw frequency
bigrams_scored = finder.score_ngrams(BigramAssocMeasures.raw_freq)

# Sort bigrams by raw frequency (descending)
bigrams_scored_sorted = sorted(bigrams_scored, key=lambda x: x[1], reverse=True)

# Print the top 50 bigrams with their frequencies
print("Top 50 Bigrams by Raw Frequency in Entire Dataset:")
for idx, (bigram, freq) in enumerate(bigrams_scored_sorted[:50], 1):
    print(f"{idx}. Bigram: {bigram}, Frequency: {freq}")

Top 50 Bigrams by Raw Frequency in Entire Dataset:
1. Bigram: ('feel', 'like'), Frequency: 0.004967029525101682
2. Bigram: ('not', 'know'), Frequency: 0.0039532194211833235
3. Bigram: ('not', 'want'), Frequency: 0.003012819257949732
4. Bigram: ('not', 'even'), Frequency: 0.0020159499095740115
5. Bigram: ('not', 'take'), Frequency: 0.0009836932051758903
6. Bigram: ('anymore', 'not'), Frequency: 0.0009772933641931155
7. Bigram: ('not', 'feel'), Frequency: 0.0009163066442396161
8. Bigram: ('could', 'not'), Frequency: 0.0008963541988227304
9. Bigram: ('like', 'not'), Frequency: 0.0008937189701827644
10. Bigram: ('not', 'really'), Frequency: 0.0008515553119433079
11. Bigram: ('not', 'get'), Frequency: 0.0008428967035548482
12. Bigram: ('not', 'think'), Frequency: 0.0008346145564006692
13. Bigram: ('want', 'die'), Frequency: 0.0008203090294979965
14. Bigram: ('take', 'anymore'), Frequency: 0.0007197938799450065
15. Bigram: ('know', 'not'), Frequency: 0.0007186644962421639
16. Bigram: ('would

Interesting. Let's compare this to the status breakdowns.

#### Raw Frequency by Status

In [47]:
# Define a function to score bigrams
def score_bigrams(tokens):
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams_scored = bigram_finder.score_ngrams(BigramAssocMeasures.raw_freq)
    return bigrams_scored[:50]  # Return top 30 scored bigrams

# Group by 'status' and apply the scoring function
bigrams_by_status = data.groupby('status')['tokens'].apply(lambda x: score_bigrams(x.sum()))

# Print or access the results for each status
for status, bigrams in bigrams_by_status.items():
    print(f"Status: {status}")
    for idx, (bigram, score) in enumerate(bigrams, 1):
        print(f"{idx}. {bigram}: {score}")
    print()

Status: Anxiety
1. ('feel', 'like'): 0.0036235668431582077
2. ('not', 'know'): 0.002928581184769771
3. ('health', 'anxiety'): 0.002715666503298169
4. ('panic', 'attack'): 0.001787679872733262
5. ('anyone', 'else'): 0.00134176421908519
6. ('not', 'want'): 0.0011087632469087198
7. ('not', 'even'): 0.0010163663096663265
8. ('could', 'not'): 0.0009440556631288013
9. ('not', 'really'): 0.0009038830817190651
10. ('year', 'old'): 0.0008155034026176454
11. ('not', 'sure'): 0.000767296304925962
12. ('year', 'ago'): 0.000767296304925962
13. ('month', 'ago'): 0.0007512272723620674
14. ('even', 'though'): 0.0007472100142210938
15. ('heart', 'attack'): 0.0007391754979391465
16. ('came', 'back'): 0.0007070374328113576
17. ('not', 'think'): 0.0007070374328113576
18. ('lymph', 'node'): 0.0006748993676835686
19. ('heart', 'rate'): 0.000670882109542595
20. ('week', 'ago'): 0.000670882109542595
21. ('not', 'get'): 0.0006548130769787005
22. ('chest', 'pain'): 0.0006347267862738323
23. ('not', 'feel'): 0.0

#### Mutual Information Score by Status

In [48]:
bigram_measures = BigramAssocMeasures()

# Define a function to score bigrams using PMI
def score_bigrams(tokens):
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigram_finder.apply_freq_filter(20)  # Higher filter for larger dataframe
    bigrams_scored = bigram_finder.score_ngrams(bigram_measures.pmi)
    return bigrams_scored

# Group by 'status' and apply the scoring function
bigrams_by_status = data.groupby('status')['tokens'].apply(lambda x: score_bigrams(sum(x, [])))

# Print or access the results for each status
for status, bigrams in bigrams_by_status.items():
    print(f"Status: {status}")
    for idx, (bigram, score) in enumerate(bigrams[:50], 1):  # Print top 30
        print(f"{idx}. {bigram}: {score}")
    print()

Status: Anxiety
1. ('bowel', 'movement'): 11.64690794204374
2. ('acid', 'reflux'): 11.383292857980777
3. ('greatly', 'appreciated'): 11.069747309599396
4. ('ct', 'scan'): 10.463305081467785
5. ('shortness', 'breath'): 10.29414726698367
6. ('lymph', 'node'): 10.266550869023526
7. ('birth', 'control'): 10.233479695626553
8. ('urgent', 'care'): 10.206260796088028
9. ('full', 'blown'): 10.163806167819741
10. ('falling', 'asleep'): 10.106339560681036
11. ('fall', 'asleep'): 9.898665282639225
12. ('oh', 'god'): 9.871554905095465
13. ('yo', 'male'): 9.547062544352329
14. ('family', 'member'): 9.426031434517974
15. ('fast', 'forward'): 9.41807426336494
16. ('laying', 'bed'): 9.090130814760826
17. ('swollen', 'lymph'): 9.033573697045911
18. ('weight', 'loss'): 8.640648951572057
19. ('brain', 'fog'): 8.596623429573771
20. ('deep', 'breath'): 8.483243277844995
21. ('story', 'short'): 8.452869628801476
22. ('old', 'male'): 8.42153166226847
23. ('side', 'effect'): 8.389272112298457
24. ('burning', 

There is a lot of really fascinating and informative findings in our exploration of bigrams. 

For example, 'Anxiety' users appear to speak about physical health ailments and concerns (such as brain tumors, lymph nodes, heart attacks), which makes a lot of sense. 'Bipolar' users speak in 'up & down', 'high & low', 'mood swings'...which also makes sense. 'Depression' users conjure "downward" imagery of 'rock bottom', 'rabbit hole', etc. 

This is all super helpful!

# Combining tokens and bigrams
def combine_tokens_bigrams(row):
    tokens = row['tokens']
    bigrams = row['bigrams_following_not']
    return tokens + bigrams

data['combined_tokens_bigrams'] = data.apply(combine_tokens_bigrams, axis=1)

# Group by status and aggregate
status_groups = data.groupby('status')['combined_tokens_bigrams'].sum()