In [None]:
import os
from pathlib import Path
import sys

In [None]:
# Project locations. The project root is taken to be the parent of the
# current working directory.
project_name = 'clpsych'
project_path = Path(os.getcwd()).parent

if sys.platform == "win32":
    # Raw strings: the backslashes are path separators, not escape sequences.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    model_path = r'D:\Dataset\{0}\models'.format(project_name)
    # Uses the same D:\Dataset root as the two paths above (this entry used to
    # point at the macOS '/Volumes/...' mount).
    src_path = r'D:\Dataset\{0}\src'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    model_path = '/Volumes/Dataset/{0}/models'.format(project_name)
    src_path = '/Volumes/Dataset/{0}/src'.format(project_name)

else:
    # Any other platform: everything lives inside the project folder.
    data_path = Path(project_path, 'dataset')
    model_path = Path(project_path, 'models')
    src_path = Path(project_path, 'src')

utils_path = str(Path(project_path, 'utils'))
# Make the project folder, the utils folder and the src folder importable.
# Each entry is compared against sys.path individually (exact match), so
# re-running this cell never appends duplicates and a path that merely
# contains utils_path as a substring cannot suppress the update.
for extra_path in (str(project_path), utils_path, str(src_path)):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
import string
from gensim import corpora
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, utils, models, similarities
from collections import defaultdict
import pandas as pd
import string

In [None]:
# Shared resources read by the text-cleaning helpers defined below.
stoplist = set(stopwords.words('english'))  # English stop-word list from NLTK
punctuation = set(string.punctuation)       # every character in string.punctuation
lemma = WordNetLemmatizer()
# Empty gensim dictionary; this name is re-assigned further down once the
# posts have been tokenized.
dictionary = corpora.Dictionary()

def remove_punctuation(text):
    """Return ``text`` without any character contained in the module-level ``punctuation`` set."""
    kept_chars = (ch for ch in text if ch not in punctuation)
    return ''.join(kept_chars)

def remove_numbers(text):
    """Return ``text`` with every character for which ``str.isdigit`` is true removed."""
    non_digits = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(non_digits)

def remove_stopwords(text):
    """Drop whitespace-separated words found in the module-level ``stoplist``.

    The comparison is exact (case-sensitive); surviving words are re-joined
    with single spaces.
    """
    kept_words = []
    for token in text.split():
        if token not in stoplist:
            kept_words.append(token)
    return ' '.join(kept_words)

def remove_single_chars(text, min_length=4):
    """Drop short words from ``text``.

    Despite its name, this helper removes more than single characters: with
    the default ``min_length`` of 4 it discards every whitespace-separated
    word of three characters or fewer.

    Parameters
    ----------
    text : str
        Input text; words are separated on any whitespace.
    min_length : int, optional
        Minimum number of characters a word needs in order to be kept.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    return ' '.join(word for word in text.split() if len(word) >= min_length)

def lemmatize(text):
    """Lemmatize each whitespace-separated word with the module-level ``lemma`` object.

    The lemmatized words are re-joined with single spaces.
    """
    lemmas = map(lemma.lemmatize, text.split())
    return ' '.join(lemmas)

def lower_case(text):
    """Lower-case every whitespace-separated word and re-join the words with single spaces."""
    lowered_words = map(str.lower, text.split())
    return ' '.join(lowered_words)

def clean_text(text):
    """Normalise one raw post into a space-separated string of lemmatized words.

    Steps, in order: line breaks become spaces, punctuation and digits are
    stripped, the text is lower-cased, stop words and short words are
    removed, and the remaining words are lemmatized.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text, words separated by single spaces.
    """
    # Replace line breaks with a space (not an empty string) so the last word
    # of one line is not fused with the first word of the next line.
    text = text.replace('\n', ' ')
    text = remove_punctuation(text)
    text = remove_numbers(text)
    # Lower-case before the stop-word filter: that filter compares words
    # exactly, so capitalized stop words would otherwise slip through.
    text = lower_case(text)
    text = remove_stopwords(text)
    text = remove_single_chars(text)
    text = lemmatize(text)
    return text

In [None]:
# Show what is available in the data folder (`os` is imported in the first cell).
os.listdir(data_path)

In [None]:
# Load the posts from the data folder.
data = pd.read_csv(Path(data_path, 'risk_title_body.csv'))
print(data.shape)
# Give the three columns fixed names, then discard the 'index' column.
data.columns = ['index', 'risk_label', 'title_body']
data = data.drop(columns=['index'])
data.head()

In [None]:
# Clean every post, then split each cleaned string into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
content = list(map(clean_text, data['title_body']))
tokens = list(map(tokenizer.tokenize, content))

In [None]:
# LDA hyper-parameters
num_passes = 10
num_topics = 20
random_state = 7

# Build the vocabulary from the tokenized posts.
dictionary = corpora.Dictionary(tokens)
# Drop tokens that appear in fewer than 10 documents or in more than 70% of
# all documents.
dictionary.filter_extremes(no_below=10, no_above=0.7)
# Bag-of-words vectors. They are built from the same token lists the
# dictionary was built from, so both sides use one tokenization; splitting
# `content` on whitespace instead can produce words the dictionary never saw.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

In [None]:
# Train the LDA topic model with gensim's multi-process implementation.
# (An earlier, commented-out variant of this cell called models.LdaModel with
#  the same arguments except `workers`.)
lda = models.ldamulticore.LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=num_passes,
    random_state=random_state,
    workers=16,
)

In [None]:
# Top 100 entries for each topic, requested unformatted (formatted=False).
topics = lda.show_topics(
    num_topics=num_topics,
    num_words=100,
    formatted=False,
)
topics

In [None]:
# Per-topic coherence computed from the top 100 words of each topic; the
# summary cell below reads each item as (word entries, coherence score).
coherence = lda.top_topics(
    corpus,
    topn=100,
    dictionary=dictionary,
)

In [None]:
# Build a per-topic summary table: topic label, keyword set, coherence score.
topic2topkeywords = {}  # 1-based position in `coherence` -> keyword set
topic2csb = {}          # 1-based position in `coherence` -> coherence score
topic2keywords = {}     # topic id + 1 (from `topics`)    -> keyword set
topic2csa = {}          # topic id + 1                    -> matched coherence score
num_topics = lda.num_topics

# Each `coherence` item is (word entries, score); the word is element 1 of an entry.
for cnt, ws in enumerate(coherence, start=1):
    topic2topkeywords[cnt] = {w[1] for w in ws[0]}
    topic2csb[cnt] = ws[1]

# Each `topics` item is (topic id, word entries); the word is element 0 of an entry.
for ws in topics:
    topic2keywords[ws[0] + 1] = {w[0] for w in ws[1]}

# The two results are paired through their keyword sets rather than by
# position (NOTE(review): presumably because `coherence` is not ordered by
# topic id - confirm against the gensim top_topics documentation).
# A topic whose keywords match no coherence entry gets NaN, so the table
# columns always stay the same length and aligned.
for topic_key, keywords in topic2keywords.items():
    topic2csa[topic_key] = float('nan')
    for position, top_keywords in topic2topkeywords.items():
        if keywords <= top_keywords:
            # No break: if several entries match, the last one wins.
            topic2csa[topic_key] = topic2csb[position]

# Columns are passed as plain lists rather than dict views, which not every
# pandas version accepts as column data.
finalData = pd.DataFrame({
    'Topic': ['Topic' + str(topic_key) for topic_key in topic2keywords],
    'words': list(topic2keywords.values()),
    'cs': [topic2csa[topic_key] for topic_key in topic2keywords],
})
finalData = finalData.sort_values(by='cs', ascending=False)
finalData