In [34]:
import pandas as pd
from nltk.corpus import stopwords
from textblob import Word
from spacy.lang.en import English
import PyPDF2
from gensim import corpora
import gensim

In [35]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# Reading The Article
# nltk.download('stopwords')
# nltk.download('wordnet')

In [36]:
def read_article(article):
    pdfFileObj = open(article, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    print(f"Total number of pages: {pdfReader.numPages}")

    paper = ""
    for i in range(pdfReader.numPages):
        pageObj = pdfReader.getPage(i)
        paper = paper + " " + pageObj.extractText()

    pdfFileObj.close()

    start = "abstract"
    end = "references"
    paper = paper.lower()
    paper = paper[paper.index(start):]  # kicking the entrances (before 'abstract' word)
    paper = paper[:paper.index(end)]  # kicking the last parts (after 'references' word)
    text = paper.split(".")
    return pd.DataFrame(text, columns=["text"])

In [37]:
def preparetion(paper):
    df = read_article(paper)
    sw = stopwords.words('english')
    df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)  # harf, sayı ve boşluk dışındakileri attık
    df['text'] = df['text'].str.replace(r'\d', '', regex=True)  # sayıları attık
    df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    df['text'] = df['text'].apply(lambda x: " ".join(i for i in str(x).split() if i not in sw))

    words = pd.Series(' '.join(df['text']).split()).value_counts()
    drop = words[words == 1]
    df['text'] = df['text'].apply(lambda x: " ".join(i for i in str(x).split() if i not in drop)) # Rarewords

    df = df[df["text"].apply(lambda x: len(x.split()) > 2)].reset_index(drop=True)  # Some Correlations
    return df


In [47]:
def tokenize(text):
    parser = English()
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return [token for token in lda_tokens if len(token) > 1]


In [48]:
text_data = []
df = preparetion(r"F:\playground\makaleler\1.pdf")
for line in df["text"]:
    tokens = tokenize(line)
    text_data.append(tokens)

Total number of pages: 13


In [59]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [66]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=15)
for topic in topics:
    print(topic)

(0, '0.055*"professional" + 0.048*"competence" + 0.031*"maritime" + 0.031*"room" + 0.030*"education" + 0.030*"higher" + 0.029*"marine" + 0.016*"technology" + 0.016*"escape" + 0.015*"wa" + 0.015*"first" + 0.015*"new" + 0.015*"form" + 0.015*"general" + 0.015*"code"')
(1, '0.041*"drop" + 0.041*"task" + 0.035*"image" + 0.035*"drag" + 0.034*"type" + 0.029*"text" + 0.029*"question" + 0.024*"name" + 0.020*"escape" + 0.018*"allows" + 0.018*"ship" + 0.018*"cadet" + 0.018*"lm" + 0.018*"moodle" + 0.018*"multiple"')
(2, '0.034*"education" + 0.031*"maritime" + 0.031*"cadet" + 0.027*"professional" + 0.024*"result" + 0.024*"security" + 0.024*"form" + 0.024*"training" + 0.021*"competence" + 0.016*"increase" + 0.016*"topic" + 0.016*"modern" + 0.016*"level" + 0.016*"facility" + 0.016*"ksma"')
(3, '0.055*"professional" + 0.038*"educational" + 0.031*"game" + 0.029*"marine" + 0.029*"future" + 0.022*"wa" + 0.019*"student" + 0.019*"based" + 0.019*"result" + 0.019*"international" + 0.019*"method" + 0.019*"mod

In [67]:
get_document_topics = ldamodel.get_document_topics(corpus[0])
print(get_document_topics)

[(6, 0.9249913)]


# görselleştirme

In [68]:
lda_viz = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(lda_viz)

  default_term_info = default_term_info.sort_values(


In [42]:
def dominant_topic(ldamodel,corpus,content):
     #Function to find the dominant topic in each query
     sent_topics_df = pd.DataFrame()
     # Get main topic in each query
     for i, row in enumerate(ldamodel[corpus]):
         row = sorted(row, key=lambda x: (x[1]), reverse=True)
         # Get the Dominant topic, Perc Contribution and Keywords for each query
         for j, (topic_num, prop_topic) in enumerate(row):
             if j == 0:  # => dominant topic
                 wp = ldamodel.show_topic(topic_num,topn=30)
                 topic_keywords = ", ".join([word for word, prop in wp])
                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
             else:
                 break
     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
     contents = pd.Series(content)#noisy data
     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
     return(sent_topics_df)

In [43]:
df_dominant_topic = dominant_topic(ldamodel=ldamodel, corpus=corpus, content=df['text'])
df_dominant_topic.head(10)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,text
0,0,0.925,"environment, electronic, educational, use, inf...",paper gamification activity use educational el...
1,1,0.925,"gamification, material, escape, room, year, co...",gamification method example described gamifica...
2,3,0.9727,"educational, learning, process, technology, us...",characteristic traditional learning learning u...
3,7,0.9437,"professional, maritime, use, room, competence,...",paper also example gamificatio n activity base...
4,8,0.9357,"escape, educational, process, room, task, usin...",escape room activity article contains storytel...
5,0,0.9182,"environment, electronic, educational, use, inf...",question type paper drag drop text short answe...
6,1,0.925,"gamification, material, escape, room, year, co...",escape room activity wa done year cadet kherso...
7,2,0.85,"image, allow, type, question, knowledge, profe...",according received result knowledge quality
8,8,0.711,"escape, educational, process, room, task, usin...",gamification activity also done learning syste...
9,0,0.9308,"environment, electronic, educational, use, inf...",learning educational environment maritime high...


In [13]:
df["text"]

0      paper gamification activity use educational el...
1      gamification method example described gamifica...
2      characteristic traditional learning learning u...
3      paper also example gamificatio n activity base...
4      escape room activity article contains storytel...
                             ...                        
97          cadet would like perform gamification moodle
98                             comparison statistic year
99     gamification activity like digital escape room...
100    use gamification exercise form general profess...
101    research see educational process gamification ...
Name: text, Length: 102, dtype: object

In [69]:
pyLDAvis.save_html(lda_viz, 'output_filename.html')