# AIWR Project - Predicting mental illness from Reddit Posts using Similarity Measures

#### Team members:
1. Kshitij Prit Gopali - PES1UG19CS234
2. Navya Eedula - PES1UG19CS293
3. Neha Arun Angadi - PES1UG19CS294

# Import necessary libraries

In [2]:
import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from pprint import pprint
from functools import reduce
import string
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

In [3]:
# from google.colab import drive
# drive.mount("Drive")

# Download required modules

In [13]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

# Dataset: Read and prepare dataframe

The dataset consists of 200 Reddit posts from each of the subreddits: anxiety, BPD and PTSD.

In [20]:
# anxiety_2018 = pd.read_csv("anxiety_2018.csv",encoding='utf-8')
# anxiety_2019 = pd.read_csv("anxiety_2019.csv",encoding='utf-8')
# anxiety_2018 = anxiety_2018[['subreddit','post']]
# anxiety_2018 = anxiety_2018[:100]
# anxiety_2019 = anxiety_2019[['subreddit','post']]
# anxiety_2019 = anxiety_2019[:100]

# bpd_2018 = pd.read_csv("bpd_2018.csv",encoding='utf-8')
# bpd_2019 = pd.read_csv("bpd_2019.csv",encoding='utf-8')
# bpd_2018 = bpd_2018[['subreddit','post']]
# bpd_2018 = bpd_2018[:100]
# bpd_2019 = bpd_2019[['subreddit','post']]
# bpd_2019 = bpd_2019[:100]

# ptsd_2018 = pd.read_csv("ptsd_2018.csv",encoding='utf-8')
# ptsd_2019 = pd.read_csv("ptsd_2019.csv",encoding='utf-8')
# ptsd_2018 = ptsd_2018[['subreddit','post']]
# ptsd_2018 = ptsd_2018[:100]
# ptsd_2019 = ptsd_2019[['subreddit','post']]
# ptsd_2019 = ptsd_2019[:100]

In [21]:
def _read_posts(csv_name):
  """Read one subreddit CSV and return rows 200-234 of its 'subreddit' and 'post' columns."""
  frame = pd.read_csv(csv_name, encoding='utf-8')
  return frame[['subreddit', 'post']][200:235]


# One 35-row slice per subreddit and year; the variable names are used by the next cell.
anxiety_2018 = _read_posts("anxiety_2018.csv")
anxiety_2019 = _read_posts("anxiety_2019.csv")

bpd_2018 = _read_posts("bpd_2018.csv")
bpd_2019 = _read_posts("bpd_2019.csv")

ptsd_2018 = _read_posts("ptsd_2018.csv")
ptsd_2019 = _read_posts("ptsd_2019.csv")

In [22]:
# Stack the six per-year slices row-wise (anxiety, then bpd, then ptsd) and
# wrap the result in a single dataframe with a fresh 0..n-1 index.
frames = (anxiety_2018, anxiety_2019, bpd_2018, bpd_2019, ptsd_2018, ptsd_2019)
array = np.concatenate(frames)
df = pd.DataFrame(data=array, columns=['subreddit', 'post'])
df

Unnamed: 0,subreddit,post
0,anxiety,Feel like life is over because I complimented ...
1,anxiety,Skin is crawling and weird sleeping Wellburtin...
2,anxiety,Anxiety: what helps and when to consult a psyc...
3,anxiety,Can you identify this feeling? I sometimes hav...
4,anxiety,I’m not sure why this happens. My sister plays...
...,...,...
205,ptsd,"PTSD episode on Dr. Phil So, recently theres t..."
206,ptsd,Did anyone else with PTSD who watched James' M...
207,ptsd,"I'm having bad episodes, and I am not sure why..."
208,ptsd,I am of value. My value is not something I hav...


# Label encoding for different illnesses

In [23]:
# Integer label for each subreddit name.
scale_mapper = dict(anxiety=0, bpd=1, ptsd=2)
# Swap every subreddit name in the column for its integer label.
df['subreddit'] = df['subreddit'].replace(to_replace=scale_mapper)
df

Unnamed: 0,subreddit,post
0,0,Feel like life is over because I complimented ...
1,0,Skin is crawling and weird sleeping Wellburtin...
2,0,Anxiety: what helps and when to consult a psyc...
3,0,Can you identify this feeling? I sometimes hav...
4,0,I’m not sure why this happens. My sister plays...
...,...,...
205,2,"PTSD episode on Dr. Phil So, recently theres t..."
206,2,Did anyone else with PTSD who watched James' M...
207,2,"I'm having bad episodes, and I am not sure why..."
208,2,I am of value. My value is not something I hav...


# Data cleaning and preprocessing

This includes the following steps:
1. Removing punctuation, links, special characters
2. Stop words removal

In [7]:
# Individual patterns: @-mentions, http(s) links, and any non-letter character.
pat1 = r"@[A-Za-z0-9'?\[\]]+"
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = r'[^a-zA-Z]'
# Alternations: mentions-or-links first, then everything to strip in one pattern.
combined_pat = f"{pat1}|{pat2}"
combined_pat2 = f"{combined_pat}|{pat3}"

In [8]:
# Build the stop-word set once; it does not change between posts, so there is
# no reason to reload it for every single word.
english_stop_words = set(stopwords.words('english'))

res = []

# Walk over every post that is actually in the dataframe. A fixed
# range(0, 600) fails as soon as the dataframe holds fewer than 600 rows and
# silently ignores the tail when it holds more.
for post in df['post']:

  # remove patterns that follow the regex mentioned
  tweets = re.sub(combined_pat2, ' ', post)

  # split on whitespace and drop stop words (the comparison is case-sensitive
  # because casefolding only happens in a later cell)
  tweets = [word for word in tweets.split() if word not in english_stop_words]

  # store the cleaned post as a single space-separated string
  tweets = ' '.join(tweets)
  res.append(tweets)

['Does anyone else like taking long walks snows Everything quieter',
 'Meditation making anxious I couple sessions day headspace Just opening app gets heart racing The silence I run worries drown It torture Just mind How long gets easier I two days right dreaded part day',
 'Rant anxiety meds I self medicating Benzodiazepines Valium First started cannabis nonsense substance face value ultimately terrible anxiety medication currently You simply dose stuff appropriately much causes exact symptoms trying treat tolerance buildup Queue Valium decided self medicate trying week Now twice week Now three times week Fuck Time take break take time research something right benzodiazepines I need know nah messed coping skills Take amount time looking benzo withdrawals list sure leave sour taste shit joke No withdrawals deadly alcohol The system drugs work body last thing someone anxiety messing way take things therapeutically short term condition short term Such incredibly important fragile system 

# Casefolding, Lemmatization and Stemming

In [9]:
# Case-insensitive form of every cleaned post.
casefolded = [tweet.casefold() for tweet in res]

# Lemmatize token by token, then glue each post back into one string.
lemmatizer = WordNetLemmatizer()
lmntz = [[lemmatizer.lemmatize(word) for word in word_tokenize(item)] for item in casefolded]
lemmatized = [" ".join(tokens) for tokens in lmntz]

# Stem the lemmatized tokens the same way; `stemmed` feeds the later cells.
ps = PorterStemmer()
stm = [[ps.stem(word) for word in word_tokenize(item)] for item in lemmatized]
stemmed = [" ".join(tokens) for tokens in stm]

In [10]:
# Earlier variant kept for reference: it built a separate dataframe instead of
# adding a column to df.
#df_new = pd.DataFrame(stemmed, columns = ['post_new'])
#df_new['target'] = df['subreddit'][:600]
#df_new.tail()
# Attach the stemmed text to df as a new 'post_new' column (stemmed must have
# one entry per row of df), then preview the first rows.
df['post_new'] = stemmed
df.head()

Unnamed: 0,subreddit,post,post_new
0,0,Does anyone else like taking long walks while ...,doe anyon els like take long walk snow everyth...
1,0,Meditation is making me anxious I do a couple ...,medit make anxiou i coupl session day headspac...
2,0,Rant about anxiety meds - I've been self medic...,rant anxieti med i self medic benzodiazepin va...
3,0,Some advice for my friend please? I dated this...,some advic friend pleas i date girl year move ...
4,0,Feeling like I’m back to where I started. I’ve...,feel like i back i start i feel lot better eve...


# tf-idf vectorization

In [11]:
# Fit a TF-IDF vectorizer on the preprocessed posts, with English stop words
# enabled and the vocabulary capped via max_features = 20000.
tf_idf_vectorizor = TfidfVectorizer(stop_words = 'english',#tokenizer = tokenize_and_stem,
                             max_features = 20000)
# Matrix of TF-IDF weights, one row per post in df['post_new'].
tf_idf = tf_idf_vectorizor.fit_transform(df['post_new'])
# Pass the matrix through sklearn's normalize() with its default settings.
# NOTE(review): TfidfVectorizer may already normalize rows by default - confirm
# whether this extra step changes anything.
tf_idf_norm = normalize(tf_idf)
# Dense array copy of the normalized matrix, used to build a dataframe next.
tf_idf_array = tf_idf_norm.toarray()

In [12]:
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one so the cell still runs on older installations.
if hasattr(tf_idf_vectorizor, 'get_feature_names_out'):
  feature_names = tf_idf_vectorizor.get_feature_names_out()
else:
  feature_names = tf_idf_vectorizor.get_feature_names()

# One row per post, one column per vocabulary term, values are TF-IDF weights.
tf_idf_res = pd.DataFrame(tf_idf_array, columns=feature_names)
tf_idf_res.head()

Unnamed: 0,aa,abandon,abc,abdomin,abhorr,abid,abil,abl,abort,abruptli,...,youngster,youth,youtub,yr,zero,zhuangzi,zofran,zoloft,zone,zyprexa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.105213,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Column labels of the TF-IDF dataframe (the vocabulary terms) as a plain list.
tar_cols = list(tf_idf_res.columns)
tar_cols

['aa',
 'abandon',
 'abc',
 'abdomin',
 'abhorr',
 'abid',
 'abil',
 'abl',
 'abort',
 'abruptli',
 'absent',
 'absolut',
 'abstract',
 'abus',
 'abyss',
 'academ',
 'accent',
 'accept',
 'access',
 'accid',
 'accident',
 'acclim',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'accumul',
 'accur',
 'accus',
 'accusatori',
 'accustom',
 'ach',
 'achiev',
 'acid',
 'acknowledg',
 'acquaint',
 'act',
 'action',
 'activ',
 'actual',
 'actuali',
 'acut',
 'ad',
 'adapt',
 'add',
 'addict',
 'addit',
 'address',
 'adhd',
 'adjust',
 'admin',
 'admiss',
 'admit',
 'admittedli',
 'adopt',
 'adrenalin',
 'adult',
 'adulthood',
 'advanc',
 'advantag',
 'adventur',
 'advic',
 'advis',
 'advoc',
 'af',
 'afab',
 'affair',
 'affect',
 'affili',
 'affirm',
 'afford',
 'afib',
 'afloat',
 'afraid',
 'afterlif',
 'afternoon',
 'afterward',
 'age',
 'agenc',
 'aggres',
 'aggress',
 'agit',
 'ago',
 'agoraphob',
 'agoraphobia',
 'agre',
 'agreement',
 'ah',
 'ahead',


# Preparation of inverted index

In [14]:
# Inverted index: stemmed term -> set of positions (document ids) in `stemmed`
# whose text contains that term.
inv_ind2 = {}
# enumerate() covers exactly the documents that exist; a fixed range(600)
# raises IndexError on a smaller corpus and skips the tail of a larger one.
for doc_id, document in enumerate(stemmed):
  # Every token from document.split() is trivially part of the document, so
  # no extra membership check is needed before recording it.
  for term in document.split():
    inv_ind2.setdefault(term, set()).add(doc_id)

# Print the postings in alphabetical term order.
for term in sorted(inv_ind2):
  print(term, ":", inv_ind2[term])

a : {257, 130, 259, 8, 270, 144, 273, 530, 19, 22, 281, 537, 539, 544, 300, 45, 304, 432, 178, 179, 439, 184, 186, 444, 190, 446, 447, 321, 577, 68, 581, 586, 203, 587, 334, 463, 591, 212, 471, 217, 477, 478, 352, 353, 481, 482, 228, 487, 105, 241, 376, 508, 253}
aa : {594}
abandon : {519, 415, 395, 45, 397, 273, 598, 503, 347, 287}
abc : {376}
abdomin : {169}
abhorr : {2}
abid : {483}
abil : {307, 445}
abl : {384, 257, 387, 519, 9, 139, 524, 526, 145, 402, 531, 277, 533, 408, 537, 154, 538, 414, 420, 421, 294, 422, 553, 559, 569, 563, 53, 54, 55, 439, 185, 57, 58, 188, 567, 568, 191, 447, 449, 322, 451, 571, 572, 73, 457, 203, 461, 463, 591, 82, 595, 596, 469, 473, 220, 221, 476, 352, 101, 249, 359, 494, 112, 115, 243, 244, 371, 121, 506, 123, 253, 511}
abort : {169}
about : {481, 516, 360, 78, 400, 501, 216, 538, 571}
abruptli : {384, 18}
absent : {568, 519}
absolut : {384, 521, 138, 524, 145, 148, 149, 414, 294, 568, 313, 449, 451, 324, 197, 585, 229, 361, 369, 117, 506, 254}
abstra

came : {384, 519, 522, 524, 401, 19, 420, 422, 553, 426, 427, 300, 432, 561, 439, 184, 572, 327, 583, 203, 340, 214, 87, 470, 218, 350, 488, 489, 363, 493, 110, 367, 246}
camp : {96, 594}
campu : {203, 559, 414, 519}
can : {384, 390, 263, 518, 519, 270, 526, 402, 155, 27, 542, 415, 416, 418, 294, 295, 552, 44, 176, 436, 183, 60, 195, 451, 579, 326, 583, 203, 588, 212, 217, 348, 476, 477, 97, 101, 102, 488, 246, 377, 507, 253}
canada : {252, 406}
cancel : {294, 585, 489, 596, 569}
cancer : {493, 110, 16, 560, 470, 248, 218, 414}
candid : {442}
cane : {493}
cannabi : {2}
cant : {482, 198, 422, 240, 336, 306, 401, 466, 560, 312, 314, 253, 190}
capabl : {347, 259, 437, 445}
capac : {164, 247}
cape : {483}
captiv : {336}
car : {132, 518, 137, 394, 16, 414, 543, 415, 34, 423, 555, 430, 560, 563, 436, 565, 570, 576, 326, 582, 88, 482, 100, 248, 507}
card : {384, 481, 576, 518, 489, 430, 20, 215, 190}
care : {257, 258, 382, 390, 519, 9, 522, 267, 12, 397, 524, 273, 19, 276, 409, 414, 415, 291,

exactli : {130, 3, 196, 197, 102, 396, 334, 371, 570, 251, 188, 477}
exacto : {399}
exam : {480, 420, 580, 522, 275, 252, 92, 478}
exampl : {257, 581, 358, 390, 422, 106, 108, 13, 51, 148, 377}
exasper : {299}
except : {34, 482, 363, 346, 400, 506, 254, 151, 152, 126, 347, 184}
excess : {475, 276, 214}
exchang : {34, 476}
excit : {260, 198, 479, 171, 236, 586, 335, 338, 410, 95}
exclud : {9}
excus : {68, 516, 198, 202, 77, 110, 397, 558, 81, 467, 244, 283}
exercis : {168, 137, 203, 314, 179, 567, 56, 58}
exert : {369}
exhast : {401}
exhaust : {257, 265, 521, 11, 270, 402, 147, 277, 154, 290, 548, 550, 296, 310, 324, 455, 86, 346, 489, 242, 115, 372, 250}
exhibit : {359, 335, 295}
exist : {321, 4, 69, 396, 560, 561, 51, 476, 28}
existenti : {41, 21}
exit : {217, 171, 518}
expand : {390}
expect : {384, 416, 482, 519, 585, 396, 430, 527, 368, 337, 82, 212, 374, 476, 412, 349, 413, 447}
expens : {261, 78, 182, 376, 57, 285, 253}
experi : {516, 6, 518, 8, 9, 529, 18, 538, 28, 543, 37, 550, 

hyperdr : {79}
hypersensit : {422}
hypersomnia : {82}
hyperventil : {522, 454}
hypervigil : {402, 509, 511}
hypochondriac : {413, 470}
hypocrisi : {253}
hypocrit : {379}
hyster : {567}
i : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 112, 114, 115, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 1

notic : {128, 384, 258, 3, 387, 519, 396, 13, 271, 400, 401, 406, 422, 427, 176, 50, 444, 194, 89, 226, 483, 367, 116, 376, 253, 382}
nov : {458}
novemb : {128, 4, 390, 519, 565, 377}
now : {512, 2, 133, 390, 519, 137, 522, 526, 17, 275, 278, 538, 539, 291, 300, 557, 180, 188, 444, 455, 585, 203, 340, 468, 470, 217, 218, 347, 476, 356, 101, 361, 363, 237, 118, 252, 511}
nowaday : {17}
nowher : {65, 586, 116, 183, 58}
nsfw : {448, 558, 400, 561, 409, 539, 414}
numb : {576, 417, 418, 451, 356, 482, 358, 517, 43, 492, 556, 275, 243, 246, 444, 572}
number : {449, 164, 390, 493, 525, 401, 214, 190}
numer : {169, 132, 110}
nurs : {460, 493, 470, 152, 222, 415}
nurtur : {390}
nut : {130, 91, 222}
nutrit : {148}
nyc : {182}
nye : {384, 322, 227, 324, 394, 364, 77, 241, 19, 86, 376, 377, 61}
nyquil : {228}
o : {129}
obey : {448}
object : {449, 516, 400, 435, 511}
oblig : {579}
obnoxi : {534}
observ : {476}
obsess : {321, 290, 439, 359, 331, 332, 141, 307, 183, 379}
obtain : {406}
obv : {579}
ob

ran : {384, 487, 488, 436, 536, 377, 410, 190, 414}
randal : {479}
random : {128, 384, 416, 483, 164, 454, 365, 183, 345, 509}
randomli : {180, 597, 38}
rang : {549}
rank : {526}
rant : {385, 2, 414, 68, 420, 198, 521, 585, 78, 240, 126}
rape : {514, 387, 515, 546, 459, 272, 561, 533, 409, 218, 221, 575}
rapecounsel : {459}
rapid : {5}
rapidli : {460}
rapist : {459}
rare : {4, 424, 584, 490, 554, 88, 476, 316, 253, 287}
rarer : {316}
rat : {82}
rate : {128, 224, 291, 376, 237, 145, 470, 56}
rather : {449, 482, 483, 261, 136, 363, 12, 205, 369, 340, 21, 439, 476, 415}
ration : {580, 329, 307, 147, 563, 569, 476}
ratm : {307}
rattl : {69}
raw : {48}
razor : {377}
razorblad : {204}
rd : {237, 492, 93, 38}
re : {537}
reaaalllyy : {129}
reach : {36, 261, 166, 519, 139, 44, 508, 206, 307, 372, 21, 344, 314, 315, 348, 254}
react : {384, 387, 455, 427, 45, 557, 116, 117, 347}
reaction : {416, 257, 482, 163, 132, 105, 409, 521, 112, 208, 307, 563, 117, 217, 378, 539, 476, 29}
reactiv : {445}
re

tens : {142, 563, 183, 443, 541}
tension : {5}
tent : {465}
term : {257, 2, 130, 263, 530, 159, 416, 565, 321, 324, 325, 581, 471, 344, 478, 481, 113, 497, 506, 381}
termin : {390}
terribl : {2, 514, 516, 522, 524, 275, 409, 25, 33, 296, 430, 304, 178, 53, 568, 585, 203, 334, 477, 95, 235}
terrifi : {134, 139, 412, 162, 546, 292, 547, 422, 177, 184, 188, 575, 448, 67, 580, 198, 71, 209, 595, 596, 220, 104, 105, 489, 254}
terror : {416, 422, 455, 141, 430, 400, 372, 511}
terrorist : {458}
terryfi : {185}
test : {291, 133, 583, 105, 237, 493, 367, 87, 88, 57}
text : {384, 257, 260, 516, 390, 519, 12, 537, 290, 546, 563, 184, 572, 67, 585, 331, 79, 209, 214, 217, 481, 483, 488, 504, 107, 235, 367, 376, 377, 253}
textur : {568}
tf : {585, 17}
th : {259, 483, 583, 458, 427, 16, 182, 406, 58, 62}
than : {314}
thank : {519, 8, 523, 18, 20, 532, 533, 536, 540, 541, 31, 36, 41, 563, 564, 568, 57, 58, 62, 575, 576, 577, 579, 68, 73, 592, 82, 84, 87, 599, 91, 95, 99, 102, 110, 126, 127, 134, 148,

wire : {109}
wisdom : {73, 527}
wise : {36, 390, 426, 138, 18}
wish : {385, 386, 387, 390, 267, 140, 141, 396, 273, 274, 22, 278, 534, 25, 294, 44, 304, 196, 586, 335, 208, 339, 595, 357, 231, 505, 381, 254, 255}
wit : {384, 412, 390, 567}
with : {481, 376, 390, 270, 466, 212, 84, 119, 280, 94}
withdraw : {128, 2, 269, 46, 87, 444}
withdrawl : {444}
within : {576, 260, 485, 186, 110, 176, 272, 465, 83, 565, 278, 122, 253, 127}
without : {385, 382, 387, 390, 7, 518, 9, 394, 522, 269, 397, 399, 272, 400, 530, 21, 406, 535, 281, 154, 282, 541, 414, 287, 548, 293, 294, 554, 427, 45, 560, 561, 179, 307, 436, 311, 315, 62, 446, 449, 325, 454, 581, 205, 590, 463, 596, 469, 89, 476, 481, 482, 483, 491, 366, 239, 494, 496, 507, 376, 379, 126}
wobbl : {376}
woke : {547, 487, 585, 522, 205, 238, 372, 181, 374, 406, 376, 470, 283, 575}
woken : {585}
woman : {384, 426, 461, 557, 347, 147, 594, 565, 595, 216, 122, 283, 253}
wonder : {257, 259, 261, 6, 262, 518, 265, 532, 534, 409, 285, 158, 413, 416

# Word2vec application to prepare dictionary using reddit posts

In [94]:
# gensim is imported in this cell rather than with the imports at the top.
import gensim
# Split every preprocessed post on whitespace into a list of tokens.
tokenized_tweet = df['post_new'].apply(lambda x: x.split()) 
# Build a Word2Vec model over the tokenized posts. The keyword arguments that
# are commented out below are left at gensim's defaults.
model_w2v = gensim.models.Word2Vec(
             tokenized_tweet,
#             vector_size=200, # desired no. of features/independent variables 
             window=10, # context window size
#             min_count=2,
             sg = 1, # 1 for skip-gram model
             hs = 0,
#             negative = 10, # for negative sampling
             workers= 2, # no.of cores
             seed = 34)
# Alternative, all-defaults construction kept for reference:
# model_w2v = gensim.models.Word2Vec(
#             tokenized_tweet,
#              workers= 2, # no.of cores
#              seed = 34)

# Explicit 20-epoch training pass over the same corpus.
# NOTE(review): the constructor above is already given the corpus, so it
# presumably trains once on its own before this call - confirm against the
# gensim documentation whether the extra pass is intended.
model_w2v.train(tokenized_tweet, total_examples= len(df['post_new']), epochs=20)

(875061, 1210900)

# Querying into the dictionary : Search engine

In [180]:
# Read the free-text query from the user.
query_words = input("Enter space separated words: ")

# Same stripping patterns as the corpus cleaning: @-mentions, links, non-letters.
pat1 = r"@[A-Za-z0-9'?\[\]]+"
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = r'[^a-zA-Z]'
combined_pat = pat1 + r'|' + pat2
combined_pat2 = combined_pat + r'|' + pat3

# Strip the patterns, split on whitespace, drop stop words and re-join.
english_stop_words = set(stopwords.words('english'))
tweets = ' '.join(
    token
    for token in re.sub(combined_pat2, ' ', query_words).split()
    if token not in english_stop_words
)
res = [tweets]

# Casefold the single cleaned query.
casefolded1 = [res[0].casefold()]

# Lemmatize each token and rebuild the query string.
lemmatizer = WordNetLemmatizer()
lmntz = [[lemmatizer.lemmatize(word) for word in word_tokenize(item)] for item in casefolded1]
lemmatized1 = [" ".join(tokens) for tokens in lmntz]

# Stem each token and rebuild the query string.
ps = PorterStemmer()
stm = [[ps.stem(word) for word in word_tokenize(item)] for item in lemmatized1]
stemmed1 = [" ".join(tokens) for tokens in stm]

# Final list of stemmed query terms used by the similarity lookup.
words = stemmed1[0].split()

Enter space separated words: i have frequent panic attacks nowadays apart from the fact that i have been suffering from insomnia for the past few years.


## Calculating cosine similarity between the vectors to find the words closest to the query from the dictionary

In [181]:
# Query term -> list of (similar word, similarity score) pairs from the model.
sim = {}

for word in words:
    try:
        sim[word] = model_w2v.wv.most_similar(positive=[word])
    except KeyError:
        # The query term is not in the model's vocabulary; skip it. Catching
        # only KeyError keeps real failures (and Ctrl-C) visible instead of
        # swallowing them like a bare except would.
        continue

for term, neighbours in sim.items():
  print(term, ":", neighbours)

# Flatten to just the similar words, dropping the scores.
sim_words = [neighbour for neighbours in sim.values() for neighbour, _score in neighbours]

# List of all doc ids containing the similar words (duplicates are kept on
# purpose: each occurrence counts as one vote in the next cell).
doc_ids = []
for word in sim_words:
  # .get() tolerates a similar word that is missing from the inverted index.
  doc_ids.extend(inv_ind2.get(word, ()))

frequent : [('restless', 0.6029412150382996), ('vomit', 0.5715353488922119), ('sensat', 0.5715275406837463), ('hallucin', 0.5701594352722168), ('skip', 0.557974636554718), ('veri', 0.5455983281135559), ('race', 0.5386227965354919), ('januari', 0.5376782417297363), ('nausea', 0.525105893611908), ('occur', 0.5207098722457886)]
panic : [('attack', 0.8221801519393921), ('blown', 0.5294482707977295), ('skip', 0.4734025001525879), ('paranoia', 0.4733980596065521), ('anxieti', 0.44609612226486206), ('remind', 0.4353194534778595), ('straight', 0.4336870610713959), ('restless', 0.4336566627025604), ('tens', 0.4316985309123993), ('vomit', 0.4293770492076874)]
attack : [('panic', 0.8221802115440369), ('blown', 0.5126924514770508), ('power', 0.5027793049812317), ('verg', 0.500765860080719), ('skip', 0.46527740359306335), ('incid', 0.4572601318359375), ('remind', 0.4442513883113861), ('dereal', 0.4362829029560089), ('restless', 0.43022531270980835), ('throat', 0.42617547512054443)]
apart : [('mile'

In [182]:
# Integer label -> illness name (inverse of the label encoding used on df).
illnesses = {0: 'anxiety', 1: 'bpd', 2: 'ptsd'}

# Count one vote per retrieved document for that document's label.
final = {}
for doc_id in doc_ids:
  illness = df.iloc[doc_id]['subreddit']
  final[illness] = final.get(illness, 0) + 1

print(final)
# can calculate percentage of each illness the person suffers from when there are more illnesses trained for
if final:
  # max() returns the first label with the highest count, which matches a
  # strict "greater than" scan over the dict in insertion order.
  ill = max(final, key=final.get)
  maxi = final[ill]
  print("The person is likely to suffer from :", illnesses[ill])
else:
  # No documents matched the query. Do not fall back to label 0, which would
  # report "anxiety" without any evidence.
  maxi = 0
  ill = None
  print("No matching posts were found for the query; cannot make a prediction.")

{0: 556, 2: 552, 1: 354}
The person is likely to suffer from : anxiety


## Queries and their results - 
1. drugs and violence: ptsd
2. i feel like murdering someone : bpd
3. i have frequent panic attacks nowadays apart from the fact that i have been suffering from insomnia for the past few years. : anxiety
4. i hate meeting new people : anxiety
5. i feel scared and anxious at social events : anxiety
6. i am helpless in social situations. I feel lonely and awkward at parties. : anxiety
7. my parents met with a car accident when I was young. that is why i do not drive. : ptsd
8. i feel insecure and worthless : bpd
9. i am so happy school is over. the people there used to make me feel horrible about myself : bpd
10.  i almost drowned when i was younger. that is why i hate going to the beach. : ptsd

In [24]:
# Write the dataframe to export_final_test.csv with a header row and without
# the index column, then display it.
df.to_csv (r'export_final_test.csv', index = False, header=True)
df

Unnamed: 0,subreddit,post
0,0,Feel like life is over because I complimented ...
1,0,Skin is crawling and weird sleeping Wellburtin...
2,0,Anxiety: what helps and when to consult a psyc...
3,0,Can you identify this feeling? I sometimes hav...
4,0,I’m not sure why this happens. My sister plays...
...,...,...
205,2,"PTSD episode on Dr. Phil So, recently theres t..."
206,2,Did anyone else with PTSD who watched James' M...
207,2,"I'm having bad episodes, and I am not sure why..."
208,2,I am of value. My value is not something I hav...
