In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report
from tabpy.tabpy_tools.client import Client
client = Client('http://localhost:9004/')

In [2]:
df = pd.read_csv('../data/combined.csv')

In [3]:
df.drop(['Unnamed: 0', 'score'], axis=1, inplace=True)

In [4]:
df

Unnamed: 0,id,title,text,label
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0
1,149j9jo,omfg guys I'm actually so happy.. my mood has ...,,0
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0
...,...,...,...,...
5938,en197s,I am 1 year self-harm free!,"As of today, I am one year self-harm free! Tha...",1
5939,du3c70,I drew myself and how I feel recently. Its har...,,1
5940,bdlqts,🖤,,1
5941,asvm6s,Who can relate,,1


In [5]:
df[df['text'].notnull()]['label'].value_counts()

0    1759
1    1755
Name: label, dtype: int64

In [6]:
df = df[df['text'].notnull()]

In [7]:
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')
df['combined'] = df['title'] + " " + df['text']

In [8]:
df

Unnamed: 0,id,title,text,label,combined
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...
...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,I think this should be a safe space again. I u...
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,I did it! I struggled a lot with my weight ove...
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,the amount of this stuff I see is annoying. do...
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,Schizophrenia_specia is a scam I need the mods...


In [9]:
#preprocessing the combined text

def preprocesstext(text):
    text = text.lower() #convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text) #remove symbols
    tokens = word_tokenize(text)
    stopword = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token not in stopword]
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [10]:
def listofwords(text):
    return text.split()

In [11]:
df['processed_combined'] = df['combined'].apply(preprocesstext)

In [12]:
df

Unnamed: 0,id,title,text,label,combined,processed_combined
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,Mania Coping Skills Many people experience inc...,mania coping skill many people experience incr...
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,What are your telltale signs you’re going into...,telltale sign going hypo manic episode noticed...
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,How do you cope when you go from manic to depr...,cope go manic depressed overnight manic two we...
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,Quit my job last week during an episode. Regre...,quit job last week episode regretting school t...
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,Relationships? Does anyone feel like they will...,relationship anyone feel like lone forever try...
...,...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,I think this should be a safe space again. I u...,think safe space used love browsing subreddit ...
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,I did it! I struggled a lot with my weight ove...,struggled lot weight year especially med final...
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,the amount of this stuff I see is annoying. do...,amount stuff see annoying make anyone else unc...
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,Schizophrenia_specia is a scam I need the mods...,schizophrenia specia scam need mod see person ...


In [13]:
df['subreddit'] = df['label'].replace({0: 'bipolar', 1: 'schizophrenia'})


In [14]:
relevant_words_bipolar = ['hypomania','hypomanic','lamictal','manic','mania','lithium','depressive','mixed','husband','swing','mood','depressed','proud','exhaustive',
                         'productive','shift','manage','spending','stable','seroquel','depression','episode','sex','drinking','insurance','appointment','job','emotionally',
                         'ptsd','boyfriend','adhd','ready','phase','adderall','miss','hit','slept','physically','plan','sleeping','schedule','sad','hobby','diagnosed','cycle','spent',
                         'date','medicated','public','latuda','bought','card','new','excited','fuck','yall','morning','team','week','antidepressant','hospitalized','bc',
                         'spend','mistake','breakdown','staying','reaction','work','prescribed','moved','incredibly','relationship','wife','lose','seriously','current','broke',
                         'terrible','amazing','summer','anger','therapy','therapist','healthy','struggling','psych','career','ton','personally','suicidal','med','pay','service',
                         'appreciated','coping']

In [15]:
relevant_words_schizo = ['viewed','voice','auditory','hallucination','channel','delusion','evil','belief','neighbor','visual','hearing','dopamine','olanzapine','youtube',
                         'psychosis','demon','link','video','male','loud','hear','brief','delusional','vision','paranoia','imagine','heard','strange','paranoid','police',
                         'spiritual','noise','character','psychotic','believed','individual','brother','topic','negative','reality','leg','real','dad','passed','door','god',
                         'memory','study','woman','sensation','white','wall','men','soul','view','connection','believe','information','head','name','walking','street',
                         'future','kill','touch','eye','father','research','sympton','cat','listen','movie','man','watch','dont','recovery','power','specific','conversation',
                         'mentioned','scary','violent','red','laugh','joke','abiity','apartment','speaking','word','alot','free','saw','true','room','opinion','learn']

In [16]:
sub_word = ['schizophrenia', 'schizophrenic', 'schizoaffective', 'bipolar']

def remove_words(df, column, words_to_remove):
    for word in words_to_remove:
        df[column] = df[column].str.replace(fr'\b{word}\b', '', regex=True)
    return df

def keep_words(df, column, words_to_keep):
    df[column] = df[column].apply(lambda x: ' '.join([word for word in str(x).split() if word in words_to_keep]))
    return df

In [17]:
df = keep_words(df, 'combined', relevant_words_bipolar+relevant_words_schizo)

In [18]:
df['list_words'] = df['processed_combined'].apply(listofwords)

In [19]:
df

Unnamed: 0,id,title,text,label,combined,processed_combined,subreddit,list_words
0,1409xqw,Mania Coping Skills,Many people experience increased mania during ...,0,mania summer coping,mania coping skill many people experience incr...,bipolar,"[mania, coping, skill, many, people, experienc..."
2,149pkfl,What are your telltale signs you’re going into...,I noticed today that I kept randomly laughing ...,0,mood hypomania,telltale sign going hypo manic episode noticed...,bipolar,"[telltale, sign, going, hypo, manic, episode, ..."
3,149odnt,How do you cope when you go from manic to depr...,I have been manic for two weeks and then yeste...,0,manic depressed manic depressed slept coping,cope go manic depressed overnight manic two we...,bipolar,"[cope, go, manic, depressed, overnight, manic,..."
4,1497cwy,Quit my job last week during an episode. Regre...,"School teacher. Love my class, but I have a co...",0,job week current week manic week manic week,quit job last week episode regretting school t...,bipolar,"[quit, job, last, week, episode, regretting, s..."
5,149a229,Relationships?,Does anyone feel like they will be a lone fore...,0,date,relationship anyone feel like lone forever try...,bipolar,"[relationship, anyone, feel, like, lone, forev..."
...,...,...,...,...,...,...,...,...
5893,9clec2,I think this should be a safe space again.,I used to love browsing this subreddit. Part a...,1,free delusional believe,think safe space used love browsing subreddit ...,schizophrenia,"[think, safe, space, used, love, browsing, sub..."
5898,v6fevb,I did it!,I struggled a lot with my weight over the year...,1,,struggled lot weight year especially med final...,schizophrenia,"[struggled, lot, weight, year, especially, med..."
5924,12oays0,the amount of this stuff I see is annoying.,does this make anyone else uncomfortable?,1,,amount stuff see annoying make anyone else unc...,schizophrenia,"[amount, stuff, see, annoying, make, anyone, e..."
5932,jcsgwz,Schizophrenia_specia is a scam,I need the mods to see this! \n\nThere is a pe...,1,listen,schizophrenia specia scam need mod see person ...,schizophrenia,"[schizophrenia, specia, scam, need, mod, see, ..."


In [20]:
df = remove_words(df, 'processed_combined', sub_word)

In [21]:
X = df['processed_combined']
y = df['subreddit']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [23]:
tfid = TfidfVectorizer(min_df=0.01, max_df=0.90)

In [24]:
X_train_tfid = tfid.fit_transform(X_train)
X_test_tfid = tfid.transform(X_test)

In [25]:
rev_words = relevant_words_bipolar+relevant_words_schizo

In [26]:
feature_names = tfid.get_feature_names_out()
indices_to_keep = [np.where(feature_names == word)[0][0] for word in rev_words if word in feature_names]

# Step 3: Transform your data
X_train_tfid = tfid.fit_transform(X_train)
X_test_tfid = tfid.transform(X_test)

# Step 4: Filter your data
X_train_tfid_filtered = X_train_tfid[:, indices_to_keep]
X_test_tfid_filtered = X_test_tfid[:, indices_to_keep]

In [27]:
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_train_tfid_filtered, columns=[feature_names[i] for i in indices_to_keep])
X_train_df

Unnamed: 0,hypomania,hypomanic,lamictal,manic,mania,lithium,depressive,mixed,husband,swing,...,violent,apartment,speaking,word,free,saw,true,room,opinion,learn
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.072504,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.539321,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2807,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.195752,0.0,0.0,0.0
2808,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2809,0.327602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [28]:
X_train_df = pd.DataFrame(X_train_tfid.toarray(),
                          columns=tfid.get_feature_names_out())
X_train_df

Unnamed: 0,abilify,ability,able,absolutely,abuse,abusive,accept,account,across,act,...,wrote,www,yeah,year,yes,yesterday,yet,young,youtu,youtube
0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.039173,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.000000,0.332184,0.223357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.095182,0.0,0.0,0.0
4,0.000000,0.000000,0.343864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2807,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2808,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.125273,0.0,0.0,0.000000,0.0,0.0,0.0
2809,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0


In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the models
model_lr = LogisticRegression()
model_nb = MultinomialNB()
model_svc = SVC()
model_rf = RandomForestClassifier()
model_dt = DecisionTreeClassifier(max_depth=20)
model_knn = KNeighborsClassifier(n_neighbors=31, weights = 'distance')
model_gb = GradientBoostingClassifier(n_estimators = 300)
model_ab = AdaBoostClassifier(n_estimators = 300)

# List of models
models = [model_lr, model_nb, model_svc, model_rf, model_dt, model_knn, model_gb, model_ab]

#test score by models
model_test_score = []
#models.append(model_vc)

# Loop over the models
for model in models:
    model.fit(X_train_tfid, y_train)
    y_pred = model.predict(X_test_tfid)
    print(f'Model: {model.__class__.__name__}, Train score is {model.score(X_train_tfid, y_train)}, Test score is {model.score(X_test_tfid, y_test)}')
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm, columns = ['Pred B', 'Pred S'], index = ['Actual B','Actual S'])
    display(cm_df)
    print(classification_report(y_test, y_pred))
    print()
    model_test_score.append(model.score(X_test_tfid, y_test))
    
    # VotingClassifier for ensemble learning
model_vc = VotingClassifier(estimators=[('lr', model_lr), ('nb', model_nb), ('svc', model_svc), 
                                        ('rf', model_rf), ('dt', model_dt), ('knn', model_knn), 
                                        ('gb', model_gb), ('ab', model_ab)], voting='hard')

model_vc.fit(X_train_tfid, y_train)
y_pred = model_vc.predict(X_test_tfid)
print('Voting Classifier')
print(f'Train score is {model_vc.score(X_train_tfid, y_train)}, Test score is {model_vc.score(X_test_tfid, y_test)}')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns = ['Pred B', 'Pred S'], index = ['Actual B','Actual S'])
display(cm_df)
print(classification_report(y_test, y_pred))

Model: LogisticRegression, Train score is 0.8623265741728922, Test score is 0.7823613086770982


Unnamed: 0,Pred B,Pred S
Actual B,272,80
Actual S,73,278


               precision    recall  f1-score   support

      bipolar       0.79      0.77      0.78       352
schizophrenia       0.78      0.79      0.78       351

     accuracy                           0.78       703
    macro avg       0.78      0.78      0.78       703
 weighted avg       0.78      0.78      0.78       703


Model: MultinomialNB, Train score is 0.8263963002490217, Test score is 0.7980085348506402


Unnamed: 0,Pred B,Pred S
Actual B,298,54
Actual S,88,263


               precision    recall  f1-score   support

      bipolar       0.77      0.85      0.81       352
schizophrenia       0.83      0.75      0.79       351

     accuracy                           0.80       703
    macro avg       0.80      0.80      0.80       703
 weighted avg       0.80      0.80      0.80       703


Model: SVC, Train score is 0.9815012451085023, Test score is 0.7823613086770982


Unnamed: 0,Pred B,Pred S
Actual B,271,81
Actual S,72,279


               precision    recall  f1-score   support

      bipolar       0.79      0.77      0.78       352
schizophrenia       0.78      0.79      0.78       351

     accuracy                           0.78       703
    macro avg       0.78      0.78      0.78       703
 weighted avg       0.78      0.78      0.78       703


Model: RandomForestClassifier, Train score is 0.9989327641408752, Test score is 0.7681365576102418


Unnamed: 0,Pred B,Pred S
Actual B,279,73
Actual S,90,261


               precision    recall  f1-score   support

      bipolar       0.76      0.79      0.77       352
schizophrenia       0.78      0.74      0.76       351

     accuracy                           0.77       703
    macro avg       0.77      0.77      0.77       703
 weighted avg       0.77      0.77      0.77       703


Model: DecisionTreeClassifier, Train score is 0.8829598007826396, Test score is 0.7112375533428165


Unnamed: 0,Pred B,Pred S
Actual B,208,144
Actual S,59,292


               precision    recall  f1-score   support

      bipolar       0.78      0.59      0.67       352
schizophrenia       0.67      0.83      0.74       351

     accuracy                           0.71       703
    macro avg       0.72      0.71      0.71       703
 weighted avg       0.72      0.71      0.71       703


Model: KNeighborsClassifier, Train score is 0.9989327641408752, Test score is 0.7311522048364154


Unnamed: 0,Pred B,Pred S
Actual B,208,144
Actual S,45,306


               precision    recall  f1-score   support

      bipolar       0.82      0.59      0.69       352
schizophrenia       0.68      0.87      0.76       351

     accuracy                           0.73       703
    macro avg       0.75      0.73      0.73       703
 weighted avg       0.75      0.73      0.73       703


Model: GradientBoostingClassifier, Train score is 0.9580220562077553, Test score is 0.7581792318634424


Unnamed: 0,Pred B,Pred S
Actual B,257,95
Actual S,75,276


               precision    recall  f1-score   support

      bipolar       0.77      0.73      0.75       352
schizophrenia       0.74      0.79      0.76       351

     accuracy                           0.76       703
    macro avg       0.76      0.76      0.76       703
 weighted avg       0.76      0.76      0.76       703


Model: AdaBoostClassifier, Train score is 0.9555318392031306, Test score is 0.7254623044096729


Unnamed: 0,Pred B,Pred S
Actual B,264,88
Actual S,105,246


               precision    recall  f1-score   support

      bipolar       0.72      0.75      0.73       352
schizophrenia       0.74      0.70      0.72       351

     accuracy                           0.73       703
    macro avg       0.73      0.73      0.73       703
 weighted avg       0.73      0.73      0.73       703


Voting Classifier
Train score is 0.9857701885450018, Test score is 0.7965860597439545


Unnamed: 0,Pred B,Pred S
Actual B,278,74
Actual S,69,282


               precision    recall  f1-score   support

      bipolar       0.80      0.79      0.80       352
schizophrenia       0.79      0.80      0.80       351

     accuracy                           0.80       703
    macro avg       0.80      0.80      0.80       703
 weighted avg       0.80      0.80      0.80       703



---
Testing
--- 

In [31]:
#I randomly chose a post from r/bipolar and prepped it for prediction

model_test_df = pd.read_csv('test_20posts.csv')

In [32]:
model_test_df.head()

Unnamed: 0,Index,Post title,Post text,Subreddit
0,1,Wish I could stop laughing,My inappropriate laughter is fucking me. It fu...,schizophrenia
1,2,Freakin out,I’m diagnosed with psychosis but I’ve been hea...,schizophrenia
2,3,Does anyone get only auditory hallucinations?,"Unlike my other friend that has schizophrenia,...",schizophrenia
3,4,"how do you keep going with work, chores, erran...",its all so difficult. i recently (in the past ...,schizophrenia
4,5,Need advice on schizophrenic mother,Hello!\n\nI hope I'm not breaking any rules an...,schizophrenia


In [33]:
model_test_df['combined'] = model_test_df['Post title'] + model_test_df['Post text']

In [34]:
model_test_df['processed'] = model_test_df['combined'].apply(preprocesstext)

In [None]:
model_test_df.head()

Unnamed: 0,Index,Post title,Post text,Subreddit,combined,processed
0,1,Wish I could stop laughing,My inappropriate laughter is fucking me. It fu...,schizophrenia,Wish I could stop laughingMy inappropriate lau...,wish could stop laughingmy inappropriate laugh...
1,2,Freakin out,I’m diagnosed with psychosis but I’ve been hea...,schizophrenia,Freakin outI’m diagnosed with psychosis but I’...,freakin outi diagnosed psychosis hearing voice...
2,3,Does anyone get only auditory hallucinations?,"Unlike my other friend that has schizophrenia,...",schizophrenia,Does anyone get only auditory hallucinations?U...,anyone get auditory hallucination unlike frien...
3,4,"how do you keep going with work, chores, erran...",its all so difficult. i recently (in the past ...,schizophrenia,"how do you keep going with work, chores, erran...",keep going work chore errand bill meal hygiene...
4,5,Need advice on schizophrenic mother,Hello!\n\nI hope I'm not breaking any rules an...,schizophrenia,Need advice on schizophrenic motherHello!\n\nI...,need advice schizophrenic motherhello hope bre...


In [35]:
prediction = model_test_df['processed']

In [36]:
tfid.transform(prediction)

<40x1032 sparse matrix of type '<class 'numpy.float64'>'
	with 1718 stored elements in Compressed Sparse Row format>

In [37]:
predict_trans_df = pd.DataFrame(tfid.transform(prediction).toarray(),
                          columns=tfid.get_feature_names_out())

In [38]:
predict_trans_df.head()

Unnamed: 0,abilify,ability,able,absolutely,abuse,abusive,accept,account,across,act,...,wrote,www,yeah,year,yes,yesterday,yet,young,youtu,youtube
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.119936,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.183758,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.039738,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
model_vc.predict(tfid.transform(prediction))

array(['schizophrenia', 'schizophrenia', 'schizophrenia', 'bipolar',
       'schizophrenia', 'schizophrenia', 'schizophrenia', 'bipolar',
       'schizophrenia', 'schizophrenia', 'schizophrenia', 'bipolar',
       'schizophrenia', 'schizophrenia', 'schizophrenia', 'schizophrenia',
       'bipolar', 'schizophrenia', 'schizophrenia', 'schizophrenia',
       'bipolar', 'bipolar', 'bipolar', 'bipolar', 'bipolar', 'bipolar',
       'bipolar', 'bipolar', 'bipolar', 'bipolar', 'bipolar', 'bipolar',
       'bipolar', 'schizophrenia', 'bipolar', 'bipolar', 'bipolar',
       'bipolar', 'bipolar', 'bipolar'], dtype=object)

In [41]:
model_test_df['Predict'] = model_vc.predict(tfid.transform(prediction))

In [42]:
(model_test_df['Subreddit'] == model_test_df['Predict']).value_counts()

True     35
False     5
dtype: int64

In [None]:
import pickle

with open('testvotemodel.pkl', 'wb') as file:
    pickle.dump(model_vc, file)

In [None]:
with open("vectorizer.pkl", "wb") as file:
    pickle.dump(tfid, file)