In [4]:
import pandas as pd
import numpy as np

import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import classification_report, silhouette_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

from scipy.stats import chi2_contingency, multinomial

import nltk
import re
nltk.download('punkt')
from nltk.tokenize import word_tokenize, RegexpTokenizer, regexp_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, LancasterStemmer, PorterStemmer
from nltk.probability import FreqDist
from nltk import WordNetLemmatizer, pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
#nltk.download('all')

import itertools
import string

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from copy import deepcopy

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emmascotson/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emmascotson/nltk_data...


# Loading the Data

Let's load our dataset and take a look at it.

In [46]:
data = pd.read_csv('data/Combined Data.csv')

In [47]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


Looks like we have some NaNs. Let's take a closer look.

In [49]:
data[pd.isna(data['statement'])]

Unnamed: 0.1,Unnamed: 0,statement,status
293,293,,Anxiety
572,572,,Anxiety
595,595,,Anxiety
1539,1539,,Normal
2448,2448,,Normal
...,...,...,...
52838,52838,,Anxiety
52870,52870,,Anxiety
52936,52936,,Anxiety
53010,53010,,Anxiety


There are so few NaN's compared to the number of rows in our entire dataframe. Furthermore, rows with no 'statement' value are useless to us. Let's drop them.

In [50]:
data = data.dropna(subset=['statement'])

Now let's check if there are duplicate rows. 

In [51]:
data['statement'].nunique()

51073

It looks like there are!

In [52]:
data[data['statement'].duplicated()]

Unnamed: 0.1,Unnamed: 0,statement,status
97,97,"""No regrets or grudges/angry at things that ha...",Anxiety
138,138,but my heart is still restless even though my ...,Anxiety
167,167,I want to exhale the restlessness in my chest ...,Anxiety
228,228,Do not compare yourself to others. Envy only m...,Anxiety
244,244,"people seem calm, happy like there's no proble...",Anxiety
...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


These statements all look pretty specific and personalized. AKA it does seem like our duplicate values are true duplicates of the same social media statements made by the same singular user. Furthermore, we have over 50,000 rows in our dataset and can afford to lose these rows. 

Let's drop duplicates as well.

In [53]:
# Removing 'statement' duplicates, keeping first instance
data = data.drop_duplicates(subset=['statement'], keep='first')

In [54]:
data[data['statement'].duplicated()]

Unnamed: 0.1,Unnamed: 0,statement,status


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51073 entries, 0 to 52840
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  51073 non-null  int64 
 1   statement   51073 non-null  object
 2   status      51073 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


Furthermore, our 'Unnamed: 0' column appears to just be a duplicate of our index. Let's explore. 

In [56]:
data['Unnamed: 0'].nunique()

51073

In [57]:
# Checking if 'Unnamed: 0' equals our index
is_identical = data['Unnamed: 0'].equals(data.index)
is_identical

False

Hm...We can safely assume this column doesn't provide information about multiple 'statement' values being generated by the same singular user...because our nunique() matches the total number of rows in our dataframe, which eliminates the possibility of duplicates. 

This column is most likely just an old index from previous datasets, which is no longer useful to use. We'll drop it for now. We can always go back and edit the code if we decide we need it later.

In [58]:
data.drop(columns=['Unnamed: 0'], inplace=True)

Let's look at the value_counts() for our target variable 'status'.

In [59]:
data['status'].value_counts()

status
Normal                  16039
Depression              15087
Suicidal                10641
Anxiety                  3617
Bipolar                  2501
Stress                   2293
Personality disorder      895
Name: count, dtype: int64

#### 'Personality disorder'

'Personality disorder' is an ambiguous label compared to the others. It might be a version of a 'placeholder' value for not 'Normal' users where further categorization of disorder (ex. 'Depression', 'Suicidal', 'Anxiety') was inconclusive. Alternatively, it could be indication of other, entirely different personality disorders not listed above. 

We'll keep this in mind and explore as we go. Depending on what we find, it might be helpful to drop these values when building a predictive model.

# Preprocessing

## Text Cleaning

Let's clean our data by performing operations that remove punctuation and special characters, lowercase text, remove newline '\n' characters, etc.

#### Percentage Upper-Case

Before we get to cleaning, let's think about whether or not any unique punctuation and/or text characteristics might be indicative of a personality disorder. 

*Upper-case* text might help identify whether or not a person is in some kind of distress. If user posts 'A MESSAGE ENTIRELY IN UPPER-CASE LIKE SO', that's an unusual behavior that we should try and quantify.

Let's create a new column 'perc_upper' that calculates the percentage of upper-case letters to the total number of letters in each 'statement'.

In [60]:
# Define function to calculate percentage of upper-case letters
def perc_upper(text):
    if not isinstance(text, str):
        return 0
    letters = re.findall(r'[a-zA-Z]', text) # Using regex to isolate letters
    if not letters: 
        return 0
    upper_count = len(re.findall(r'[A-Z]', text))
    return (upper_count / len(letters)) * 100

data['perc_upper'] = data['statement'].apply(perc_upper)

In [61]:
upper_sorted = data.sort_values(by='perc_upper', ascending=False)
upper_sorted.head()

Unnamed: 0,statement,status,perc_upper
3546,[HELP RT] WE FANBASE SHAKE RP! JOIN? FOLLOW FI...,Normal,100.0
2436,ONAKA GA SUITA,Normal,100.0
6787,TODAY NO CLASS YAAYYY,Normal,100.0
5303,6 HALF HOURS AGAINIII ULULU I ​​WANT TO SLEEP ...,Normal,100.0
2478,I DON'T HAVE A HOLIDAY AS WELL AS EVIL,Normal,100.0


#### METRIC CHANGE - Percentage by Upper-Case Words

On second thought, we should find a different way to quantify a notable amount of upper-case in a string of text. With our current 'perc_upper' value...a statement such as 'Hi' will have a 50% perc_upper. However this would be due to normal grammatical capitalization techniques that are of no note.

Let's alter our metric slightly, to calculate the **percentage of upper-case words compared to the total number of words in a statement**.

We'll stick with regex for now to tokenize our words and calculate this metric.

In [62]:
# Dropping perc_upper column before to reduce computation time
data.drop(columns=['perc_upper'], inplace=True)

# Updating perc_upper to calculate based on number of words
def perc_upper_words(text): 
    if not isinstance(text, str):
        return 0
    words = re.findall(r'\b[a-zA-Z\'-]+\b', text) # Including apostrophes and hyphens
    words = [word for word in words if not re.search(r'\d', word)] # Filter out numbers
    if not words:
        return 0
    uppercase_words = [word for word in words if word.isupper()]
    return len(uppercase_words) / len(words) * 100

data['perc_upper_words'] = data['statement'].apply(perc_upper_words)

In [63]:
upper_sorted = data.sort_values(by='perc_upper_words', ascending=False)
upper_sorted.head()

Unnamed: 0,statement,status,perc_upper_words
11210,I KEEP MESSING THINGS UP ALL DAY EVERY DAY THE...,Suicidal,100.0
5917,#PECAT WHOSE NAME IS SI ALI MOCHTAR NYEBELIN #,Normal,100.0
4173,OH MY GOD FEAR CANCEL,Normal,100.0
1581,HIS BD CARD KANON IS CUTE HSHSHSHS SAD NO FREE...,Normal,100.0
2811,YAALLAH SO NATION OF SM DREAMIES,Normal,100.0


In [64]:
# Sampling rows from the middle
filtered_rows = upper_sorted[(upper_sorted['perc_upper_words'] >= 50.0) & (upper_sorted['perc_upper_words'] <= 60.0)]
filtered_rows.head(20)

Unnamed: 0,statement,status,perc_upper_words
10953,NOBODY WANTS TO TALK TO ME Going to kill myself,Depression,60.0
498,"the habit of chatting for a day, gadicchat imm...",Anxiety,60.0
17851,I feel like I cannot even explain myself becau...,Suicidal,60.0
4525,NOT FUNNY MORNING â€ Runny nose,Normal,60.0
24841,I try my best but I just want to die. I hate m...,Depression,59.42029
6357,HAHAAAA GET THE ASSIGNMENT OF MAKING A SCIENTI...,Normal,57.894737
1113,WANT TO SIN BUT Afraid to Laugh,Normal,57.142857
36232,Everything is temporary. EVIL EYES OFF SHIVRIT,Normal,57.142857
830,MORNING AGAIN I've made a AMENDE SPACE,Normal,57.142857
48673,AAAAAAAAAAAAAAAAAA aaAAAAAAAAAA FIDBWJSKLA oaa...,Stress,57.142857


Let's see if there are any trends regarding the distribution of these numbers within each of our 'status' values.

In [65]:
upper_stats = data.groupby('status')['perc_upper_words'].agg(['mean', 'min', 'max'])
upper_stats

Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,5.511061,0.0,100.0
Bipolar,5.978278,0.0,25.0
Depression,5.66738,0.0,100.0
Normal,3.903474,0.0,100.0
Personality disorder,5.424887,0.0,27.118644
Stress,5.664143,0.0,57.142857
Suicidal,7.922907,0.0,100.0


Interesting. 'Normal' has the lowest average perc_upper_words with a mean of 3.90, whereas 'Suicidal' has the highest with a mean of 7.92.

Let's see if there are any noticeable trends for the same metric applied to lower-case words. A higher percentage of lower-case words might be indicative of disorders like 'Depression'.

In [66]:
# Updating perc_upper to calculate based on number of words
def perc_lower_words(text): 
    if not isinstance(text, str):
        return 0
    words = re.findall(r'\b[a-zA-Z\'-]+\b', text) # Including apostrophes and hyphens
    words = [word for word in words if not re.search(r'\d', word)] # Filter out numbers
    if not words:
        return 0
    lowercase_words = [word for word in words if word.islower()]
    return len(lowercase_words) / len(words) * 100

data['perc_lower_words'] = data['statement'].apply(perc_lower_words)

In [67]:
lower_stats = data.groupby('status')['perc_lower_words'].agg(['mean', 'min', 'max'])
lower_stats

Unnamed: 0_level_0,mean,min,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety,87.107075,0.0,100.0
Bipolar,86.984639,25.0,100.0
Depression,90.887056,0.0,100.0
Normal,89.098266,0.0,100.0
Personality disorder,87.738038,45.454545,100.0
Stress,87.853044,14.285714,100.0
Suicidal,87.43036,0.0,100.0


'Depression' does indeed have the highest mean compared with our other 'status' values. 

However, typing in all lower-case is far more socially "standard" than typing in caps-lock. People often remove automatic capitilization as a setting on their phone or computer, which sets their default text to all lower-case. 

Given that lower-case typing is a societal 'norm', and a personal preference people often make regardless of psychological state...the 'lower_stats' numbers above do not vary drastically between 'status' groups enough for us to be able to use this as a predictive metric. 

We'll drop it and just keep perc_upper_words. 

In [68]:
data.drop(columns=['perc_lower_words'], inplace=True)

#### Special Characters

Let's explore if it will be useful to apply similar logic to special characters and punctuation. Let's examine all the different special characters in our entire dataframe, excluding numbers.

In [69]:
# Searching for special characters, excluding numbers
def extract_special(text):
    if not isinstance(text, str):
        return ''
    return re.findall(r'[^\w\s\d]', text)

special_chars = data['statement'].apply(extract_special).explode().unique()

In [70]:
special_chars

array([nan, ',', '.', "'", '?', ':', ')', '(', '[', ']', '™', '‚', '=',
       '"', '/', '#', '!', '️', '±', '-', '…', '*', '”', '&', '€', '„',
       '«', '–', '¤', '<', '°', '»', '^', ';', '\u200b', '“', '+', '˜',
       '‹', '¥', '©', '£', '@', '~', '¸', '—', '$', '§', '•', '¡', '❤',
       '¨', '>', '‡', '|', '¯', '%', '¦', '{', '}', '’', '\\', '´', '`',
       '†', '¶', '·', '®', '¢', '‰', '¿', '¬', '›', '👩', '\u200d', '🎓',
       '😇', '\u200e', '➡', '💸', '😔', '🙂', '😦', '‘', '💕', '●', '🐰', '🥕',
       '💖', '😱', '😐', '✌', '🏻', '🙃', '☹', '😅', '😢', '😂', '😩', '😖', '↑',
       '👍', '🏼', '😭', '🤷', '♂', '⛅', '🛑', '🤦', '♀', '🙄', '😕', '😠', '😫',
       '😁', '\ufeff', '😒', '💩', '😜', '😨', '😆', '🏽', '😓', '\x80', '\x99',
       '\x9f', '\x98', '\xad', '\x92', '\x8f', '\x8b', '\x9c', '\x9d',
       '\x8d', '\x87', '\x82', '\x8a', '\x81', '\x8c', '\x91', '\x94',
       '\x84', '\x95', '\x96', '\x89', '\x83', '\x88', '\x97', '\x90',
       '\x9a', '\x8e', '\U0001fae0', '🙏', '🔃', '💀', '🤣', '\U0001f

#### Emojis

Let's try and categorize the emojis used. There are few enough that we can manually create lists identifying emojis we deem to carry clear 'positive' or 'negative' sentiments. We'll ignore any "neutral" emojis: we're about to get into more detailed and layered analysis of the actual text in a bit, so we don't need to waste time getting *too* intricate with these additional features!

If we have to choose one or the other, we'll probably choose **negative** emojis as our main predictive metric (with regards to emoji sentiments)...since positive emojis could be used sarcastically. 

The sarcastic use of positive "sentiments" to indicate negativity is far more common than the sarcastic use of negative sentiments to indicate positivity. 

##### Varying Cultural Expression

There can be differences in a person's interpretation and use of emojis when conveying sentiments, based on a variety of cultural factors. We don't know enough about the people who created this dataset, nor do we have demographic information on the users in the dataset itself, to make those determinations...We'll categorize emoji sentiments through our own cultural lens, and see how well our version of this metric helps or hurts our predictive model.

In [72]:
pos_emojis = ['😇', '💖', '🙂', '💕', '❤', '😅', '😂', '👍', '😁', '😆', '🤣', '🥰', '💜', '🤩', '🖤', '😌', '🥲', '🤍','💚','😊',
              '💗', '😍', '🧡', '💝', '♥', '💛', '🤗', '😀', '😹', '😏']

neg_emojis = ['😔', '😦', '😐', '😢', '😩', '😖', '😭', '🤦', '🙄', '😕', '😠', '😫', '😒', '😨', '😓', '😮', '😞', '😳', '😥',
              '😑', '🙁', '😣', '😪', '🤕', '💔']

##### Binary Classification

As we stated, we don't need to get too intricate. Let's create a binary classifier to simply determine whether a 'statement' has any number of positive or negative emojis in it's text. 

In [73]:
# 1 if any positive emojis in 'statement', else 0
def has_pos_emoji(text):
    if not isinstance(text, str):
        return 0
    return 1 if any(emoji in text for emoji in pos_emojis) else 0

data['pos_emoji'] = data['statement'].apply(has_pos_emoji)

In [74]:
# Same for negative emojis
def has_neg_emoji(text):
    if not isinstance(text, str):
        return 0
    return 1 if any(emoji in text for emoji in neg_emojis) else 0

data['neg_emoji'] = data['statement'].apply(has_neg_emoji)

In [76]:
# Define a function to count rows with certain values
def count_rows(df, pos_val, neg_val):
    filtered_df = df[(df['pos_emoji'] == pos_val) & (df['neg_emoji'] == neg_val)]
    return filtered_df.groupby('status').size()

print("Number of Rows with POSITIVE Emojis:")
print(count_rows(data, 1, 0))
print("")
print("Number of Rows with NEGATIVE Emojis:")
print(count_rows(data, 0, 1))
print("")
print("Number of Rows with BOTH:")
print(count_rows(data, 1, 1))

Number of Rows with POSITIVE Emojis:
status
Anxiety                 39
Bipolar                 17
Depression              10
Normal                  15
Personality disorder    22
Stress                   7
dtype: int64

Number of Rows with NEGATIVE Emojis:
status
Anxiety                 40
Bipolar                 12
Depression               8
Personality disorder    14
Stress                  13
dtype: int64

Number of Rows with BOTH:
status
Anxiety                 2
Depression              2
Personality disorder    4
dtype: int64


We were correct in assuming that the use of negative emojis might be a more telling metric than the use of positive emojis. There are no 'Normal' status users that have used any negative emojis whatsoever. The results for positive emojis are more ambiguous. Let's drop that column.

(There are a suprisingly few amount of rows in our dataset that contain emojis. Our neg_emoji column is probably useless as well, but we'll keep it for right now).

In [77]:
data.drop(columns=['pos_emoji'], inplace=True)

#### ! and ?

The last notable special characters we might want to document prior to text cleaning are exclamation points ('!') and question marks ('?'). These are frequently used to convey emotional sentiment, especially distress when used repeatedly (ex. '!!!!!', '????', '!?!?!?!?'). 

Let's add columns counting the number of times a person uses these in a 'statement'. 

We'll want to normalize our values to account for varying statement lengths, to be able to accurately identify any notable outliers.

In [78]:
# Function to count characters excluding whitespace and calculate proportion of 'char'
def calculate_char_ratio(text, char):
    if not isinstance(text, str):
        return 0

    # Remove whitespace and count non-whitespace characters
    non_whitespace_text = re.sub(r'\s+', '', text)
    total_characters = len(non_whitespace_text)
    
    # Count occurrences of 'char'
    char_count = text.count(char)
    
    # Calculate proportion
    if total_characters == 0:
        return 0
    ratio = char_count / total_characters
    
    return ratio

# Apply the function to calculate proportions for '!' and '?'
data['!_ratio'] = data['statement'].apply(lambda x: calculate_char_ratio(x, '!'))
data['?_proportion'] = data['statement'].apply(lambda x: calculate_char_ratio(x, '?'))

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [None]:
# Define function to clean text
def clean_text(text):
    text = text.lower() # Lowercase
    text