In [3]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('../data/combined.csv')

In [5]:
df.drop(['Unnamed: 0', 'score'], axis=1, inplace=True)

In [6]:
df

Unnamed: 0,id,title,text,label
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0
1,149j9jo,omfg guys I'm actually so happy.. my mood has ...,,0
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0
...,...,...,...,...
5938,en197s,I am 1 year self-harm free!,"As of today, I am one year self-harm free! Tha...",1
5939,du3c70,I drew myself and how I feel recently. Its har...,,1
5940,bdlqts,🖤,,1
5941,asvm6s,Who can relate,,1


In [7]:
df[df['text'].notnull()]['label'].value_counts()

0    1759
1    1755
Name: label, dtype: int64

In [8]:
df = df[df['text'].notnull()]

In [9]:
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')
df['combined'] = df['title'] + " " + df['text']

In [10]:
df

Unnamed: 0,id,title,text,label,combined
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...
...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,I think this should be a safe space again. I u...
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,I did it! I struggled a lot with my weight ove...
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,the amount of this stuff I see is annoying. do...
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,Schizophrenia_specia is a scam I need the mods...


In [11]:
#preprocessing the combined text

def preprocesstext(text):
    text = text.lower() #convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text) #remove symbols
    tokens = word_tokenize(text)
    stopword = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token not in stopword]
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [12]:
def listofwords(text):
    return text.split()

In [13]:
df['processed_combined'] = df['combined'].apply(preprocesstext)

In [14]:
df['list_words'] = df['processed_combined'].apply(listofwords)

In [15]:
df

Unnamed: 0,id,title,text,label,combined,processed_combined,list_words
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...,mania coping skill many people experience incr...,"[mania, coping, skill, many, people, experienc..."
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...,telltale sign going hypo manic episode noticed...,"[telltale, sign, going, hypo, manic, episode, ..."
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...,cope go manic depressed overnight manic two we...,"[cope, go, manic, depressed, overnight, manic,..."
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...,quit job last week episode regretting school t...,"[quit, job, last, week, episode, regretting, s..."
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...,relationship anyone feel like lone forever try...,"[relationship, anyone, feel, like, lone, forev..."
...,...,...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,I think this should be a safe space again. I u...,think safe space used love browsing subreddit ...,"[think, safe, space, used, love, browsing, sub..."
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,I did it! I struggled a lot with my weight ove...,struggled lot weight year especially med final...,"[struggled, lot, weight, year, especially, med..."
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,the amount of this stuff I see is annoying. do...,amount stuff see annoying make anyone else unc...,"[amount, stuff, see, annoying, make, anyone, e..."
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,Schizophrenia_specia is a scam I need the mods...,schizophrenia specia scam need mod see person ...,"[schizophrenia, specia, scam, need, mod, see, ..."


In [16]:
df['subreddit'] = df['label'].replace({0: 'bipolar', 1: 'schizophrenia'})


In [17]:
df_bipolar = df[df['subreddit'] == 'bipolar']
df_schizophrenia = df[df['subreddit'] == 'schizophrenia']


In [18]:
df_bipolar

Unnamed: 0,id,title,text,label,combined,processed_combined,list_words,subreddit
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...,mania coping skill many people experience incr...,"[mania, coping, skill, many, people, experienc...",bipolar
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...,telltale sign going hypo manic episode noticed...,"[telltale, sign, going, hypo, manic, episode, ...",bipolar
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...,cope go manic depressed overnight manic two we...,"[cope, go, manic, depressed, overnight, manic,...",bipolar
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...,quit job last week episode regretting school t...,"[quit, job, last, week, episode, regretting, s...",bipolar
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...,relationship anyone feel like lone forever try...,"[relationship, anyone, feel, like, lone, forev...",bipolar
...,...,...,...,...,...,...,...,...
2886,dbyed4,I did a thing!,"I threw out my ""just in case pills."" It was a ...",0,"I did a thing! I threw out my ""just in case pi...",thing threw case pill huge bottle leftover pil...,"[thing, threw, case, pill, huge, bottle, lefto...",bipolar
2916,k4gor5,(hypo)mania and depression aren't just intense...,I started Lamictal a couple months ago and thi...,0,(hypo)mania and depression aren't just intense...,hypo mania depression intense version happines...,"[hypo, mania, depression, intense, version, ha...",bipolar
2919,ig43dn,i fucking got in to grad school guys,i was cycling really hard my entire undergradu...,0,i fucking got in to grad school guys i was cyc...,fucking got grad school guy cycling really har...,"[fucking, got, grad, school, guy, cycling, rea...",bipolar
2962,lb1ouw,My semester GPA is 4.00!,My hands are shaking and I am in complete disb...,0,My semester GPA is 4.00! My hands are shaking ...,semester gpa hand shaking complete disbelief d...,"[semester, gpa, hand, shaking, complete, disbe...",bipolar


In [19]:
X = df['processed_combined']
y = df['subreddit']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [21]:
tfid = TfidfVectorizer(min_df=0.005, max_df=0.90)

In [22]:
X_train_tfid = tfid.fit_transform(X_train)

In [23]:
X_test_tfid = tfid.transform(X_test)

In [24]:
X_train_df = pd.DataFrame(X_train_tfid.toarray(),
                          columns=tfid.get_feature_names_out())
X_train_df

Unnamed: 0,abandoned,abilify,ability,able,absolute,absolutely,abuse,abusive,accept,accepted,...,yes,yesterday,yet,young,younger,youtu,youtube,yr,zero,zombie
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2455,0.0,0.0,0.0,0.059028,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2456,0.0,0.0,0.0,0.068726,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119108,0.0,0.0
2457,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Define the models
model_lr = LogisticRegression()
model_nb = MultinomialNB()
model_svc = SVC()
model_rf = RandomForestClassifier()
model_dt = DecisionTreeClassifier()
model_knn = KNeighborsClassifier(n_neighbors=25, weights = 'distance')
model_gb = GradientBoostingClassifier()
model_ab = AdaBoostClassifier()

# List of models
models = [model_lr, model_nb, model_svc, model_rf, model_dt, model_knn, model_gb, model_ab]

# VotingClassifier for ensemble learning
model_vc = VotingClassifier(estimators=[('lr', model_lr), ('nb', model_nb), ('svc', model_svc), 
                                        ('rf', model_rf), ('dt', model_dt), ('knn', model_knn), 
                                        ('gb', model_gb), ('ab', model_ab)], voting='hard')
models.append(model_vc)

# Loop over the models
for model in models:
    model.fit(X_train_tfid, y_train)
    y_pred = model.predict(X_test_tfid)
    print(f'Model: {model.__class__.__name__}, Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Model: {model.__class__.__name__}, Train score is {model.score(X_train_tfid, y_train)}, Test score is {model.score(X_test_tfid, y_test)}')
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm, columns = ['Pred B', 'Pred S'], index = ['Actual B','Actual S'])
    display(cm_df)
    print()

Model: LogisticRegression, Accuracy: 0.8663507109004739
Model: LogisticRegression, Train score is 0.9263928426189508, Test score is 0.8663507109004739


Unnamed: 0,Pred B,Pred S
Actual B,463,66
Actual S,75,451



Model: MultinomialNB, Accuracy: 0.8483412322274881
Model: MultinomialNB, Train score is 0.8869459129727532, Test score is 0.8483412322274881


Unnamed: 0,Pred B,Pred S
Actual B,474,55
Actual S,105,421



Model: SVC, Accuracy: 0.8597156398104265
Model: SVC, Train score is 0.9955266368442456, Test score is 0.8597156398104265


Unnamed: 0,Pred B,Pred S
Actual B,456,73
Actual S,75,451



Model: RandomForestClassifier, Accuracy: 0.8691943127962085
Model: RandomForestClassifier, Train score is 0.9987799918666125, Test score is 0.8691943127962085


Unnamed: 0,Pred B,Pred S
Actual B,473,56
Actual S,82,444



Model: DecisionTreeClassifier, Accuracy: 0.8123222748815165
Model: DecisionTreeClassifier, Train score is 0.9987799918666125, Test score is 0.8123222748815165


Unnamed: 0,Pred B,Pred S
Actual B,425,104
Actual S,94,432



Model: KNeighborsClassifier, Accuracy: 0.795260663507109
Model: KNeighborsClassifier, Train score is 0.9987799918666125, Test score is 0.795260663507109


Unnamed: 0,Pred B,Pred S
Actual B,395,134
Actual S,82,444



Model: GradientBoostingClassifier, Accuracy: 0.8559241706161137
Model: GradientBoostingClassifier, Train score is 0.9263928426189508, Test score is 0.8559241706161137


Unnamed: 0,Pred B,Pred S
Actual B,442,87
Actual S,65,461



Model: AdaBoostClassifier, Accuracy: 0.8464454976303317
Model: AdaBoostClassifier, Train score is 0.8906059373729158, Test score is 0.8464454976303317


Unnamed: 0,Pred B,Pred S
Actual B,448,81
Actual S,81,445



Model: VotingClassifier, Accuracy: 0.8701421800947867
Model: VotingClassifier, Train score is 0.989019926799512, Test score is 0.8701421800947867


Unnamed: 0,Pred B,Pred S
Actual B,473,56
Actual S,81,445





In [30]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define the hyperparameter grid for each model
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Grid search example
grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, cv=5)
grid_search.fit(X_train_tfid, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best parameters:", best_params)
print("Best score:", best_score)

# Random search example
random_search = RandomizedSearchCV(estimator=model_rf, param_distributions=param_grid_rf, n_iter=10, cv=5)
random_search.fit(X_train_tfid, y_train)

best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best parameters:", best_params)
print("Best score:", best_score)

Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.8495363701091186
Best parameters: {'n_estimators': 300, 'min_samples_split': 5, 'max_depth': 10}
Best score: 0.844655009686553


---
Testing
--- 

In [26]:
#I randomly chose a post from r/bipolar and prepped it for prediction

title = 'Is 4 months in the psych ward a long time?'
post = 'I keep hearing how people only stay for 3 days or a week. I just feel like a freak. 4 months is a long time and I just wonder why so long? I don\’t know anyone who has been there that long…people who freak out just stay for a tiny bit and they are free to go. I just don’t get it. Was I really that crazy? I just feel like I don\’t fit in society…'

In [27]:
series = pd.Series(preprocesstext(title + post))

In [28]:
tfid.transform(series)

<1x1794 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [29]:
model_vc.predict(tfid.transform(series))

array(['bipolar'], dtype=object)