# NLP Web App for Syllabi Data
resource: https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

https://stackoverflow.com/questions/32476336/how-to-abstract-bigram-topics-instead-of-unigrams-using-latent-dirichlet-allocat

https://www.analyticsvidhya.com/blog/2020/02/quick-introduction-bag-of-words-bow-tf-idf/

https://radimrehurek.com/gensim_3.8.3/auto_examples/tutorials/run_lda.html

In [2]:
# If an error occurs in this cell, it is likely because you need to pip install a package(s) 
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
import nltk
#nltk.download('wordnet')
from nltk.corpus import stopwords
#from nltk.stem.lancaster import LancasterStemmer
import gensim.corpora as corpora
from pprint import pprint 
import ast
from glob import glob
from collections import Counter
import re
import matplotlib.pyplot as plt
import csv
import ipywidgets as widgets 
from ipywidgets import interact, interact_manual, HBox, interactive
from wordcloud import WordCloud 
import seaborn as sns

In [3]:
# Set directory and get data - may need to modify this to wherever you have the data stored 
#os.chdir("/Users/jordan/Desktop/NLP_Syllabi_Project_copy/data_and_output")
# Load the cleaned syllabi table; records in the file are separated by a bare newline
data = pd.read_csv("cleaned_data_new.csv", lineterminator='\n')
# The first three columns are leftover index columns with no meaning for the analysis
data = data.drop(columns=data.columns[[0, 1, 2]])
# Keep each row's position as an explicit column so it survives later column selections
data['index'] = data.index
#print(data.head())
#len(data.index) #4889 documents 

In [4]:
# Create new data, just text and index 
# Lookup table from row index to syllabus title / file name (used to label topics later)
map_names = data.loc[:, ['title', 'File_name', 'index']]
# From here on `data` only carries the document text and its row index
data = data.loc[:, ['Corpus', 'index']]
#print(data.head())

In [5]:
# Further preprocessing: removes stop words, words of 3 characters or fewer,
# and non-English words
# Note: There was additional preprocessing done in previous steps (earlier scripts located in scripts folder)
# Whitelist of accepted tokens: the NLTK English word list plus two domain terms added by hand
words = {"african-american", "africana"}.union(nltk.corpus.words.words())
# NLTK's English stop-word list (kept as the list NLTK returns)
stop_words = stopwords.words('english')
def preprocess(text):
    """Split *text* on whitespace and keep only the informative tokens.

    A token is kept when it is longer than 3 characters, appears in the
    module-level ``words`` whitelist, and is not in the module-level
    ``stop_words`` list. Tokens are compared exactly as they appear in
    *text* (no case folding or punctuation stripping happens here).

    Returns the surviving tokens as a list, in their original order and
    with duplicates preserved.
    """
    # stop_words is a list, so `token not in stop_words` is a linear scan per
    # token. Building a set once per document makes each lookup O(1) and still
    # reflects any edits made to stop_words between calls.
    blocked = set(stop_words)
    return [
        token
        for token in text.split()
        if len(token) > 3 and token in words and token not in blocked
    ]

#doc_sample = data[data['index'] == 1].values[0][0]
#print('original document: ')
#words = []
#for word in doc_sample.split(' '):
#    words.append(word)
#print(words)
#print('\n\n tokenized and document: ')
#print(preprocess(doc_sample))

# One token list per syllabus; values are cast to str first so every entry supports .split()
processed_text = (
    data['Corpus']
    .astype('str')
    .map(preprocess)
)
#print(processed_text[:10])

In [6]:
# Create word lists for exploratory data analysis 

# Get all data row lists into one list 
# Every token from every document, duplicates included (corpus-wide word counts)
all_words_list = [token for doc_tokens in processed_text for token in doc_tokens]

# Each document contributes a word at most once (removes duplicates within a single
# document), so counting this list gives the number of documents containing each word
all_docs_list = [token for doc_tokens in processed_text for token in set(doc_tokens)]

## Word Frequency Table

**README:** The <mark style="background-color: lightblue">Number of Words</mark> parameter will control the number of words you want a frequency table for in descending order of frequency; for example, if you choose 25 for this parameter, then you will get a frequency table for the 25 most frequent words (in the order of 1-25, with 1 being most frequent)

**NOTE:** <span style='color:red'> Output is truncated for tables with more than 10k words </span>

In [7]:
# Explore word frequencies - for each word, this shows the proportion of the syllabi documents (4889 in this dataset) in which it appears at least once
pd.set_option('display.max_rows', 10000) # change this as needed

def word_interacter(num=(1,22369,10)):
    """Return a table of the *num* words that occur in the most documents.

    num: how many words to list. The tuple default is an ipywidgets slider
        abbreviation (min, max, step); when called directly, pass an int.

    Returns a DataFrame with columns "Word" and "Proportion", where
    Proportion is the share of documents containing the word, rounded to
    two decimals, ordered from most to least widespread.
    """
    doc_counts = Counter(all_docs_list)
    df = pd.DataFrame(doc_counts.most_common(num), columns=["Word", "Proportion"])
    # Divide by the actual number of documents instead of a hard-coded 4889,
    # so the proportions stay correct if the input data changes.
    df['Proportion'] = df['Proportion'].div(len(processed_text)).round(2)
    return df
# NOTE(review): 22369 is presumably the vocabulary size of this dataset - confirm if the data changes
uit = interact(word_interacter, num=(1, 22369, 10))
# Widen the label area so the full slider caption is visible
uit.widget.children[0].style= {'description_width': 'initial'}
uit.widget.children[0].description = "Number of Words" 
# To save
#df.to_csv("word_freq.csv")
# NOTE: Add a filter by proportion 

interactive(children=(IntSlider(value=11181, description='num', max=22369, min=1, step=10), Output()), _dom_cl…

In [8]:
## Word Cloud of Most Common Words

In [9]:
# Create word cloud to see a visual of most common words 
# Didn't make this interactive because it takes too long to load and refresh 
#long_string = ' '.join(all_words_list)
#wcloud = WordCloud(background_color="white", max_words = 5000, 
#                         contour_width = 3, contour_color='steelblue')
#wcloud.generate(long_string)
#wcloud.to_image()

## Histogram of Word Frequencies 

**README:** The <mark style="background-color: lightblue">Number of Words</mark> parameter will control the number of words you want a histogram of word frequencies for in descending order of frequency; so if you choose 25 for this parameter, then you will get a histogram for the 25 most frequent words (in the order of 1-25, with 1 being most frequent). In addition, the <mark style="background-color: lightblue">Bar Size</mark> parameter will control the appearance of the histogram; specifically, it changes the **size** of the bars so that the chart can be more easily viewed. With fewer words (i.e., when you make the Number of Words parameter smaller), you will likely need to decrease the size of the bars; to do this, you will scroll the Bar Size parameter **to the left**. To zoom in and make the bars bigger, scroll the Bar Size parameter **to the right**. 

In [10]:
# Histogram of words (gives up to 1000 most common words)
%matplotlib inline
def histo_fun(Num_words, Zoom_image):
    """Draw a horizontal bar chart of the most frequent words in the corpus.

    Num_words: how many of the most common entries of all_words_list to plot.
    Zoom_image: figure height handed to seaborn (the width is fixed at 8).
    """
    sns.set(rc={"figure.figsize": (8, Zoom_image)})
    top_words = Counter(all_words_list).most_common(Num_words)
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    axis = sns.barplot(x=counts, y=labels)
    axis.set_xlabel('Word Count')
    # Put the x-axis label and its ticks at the top of the chart
    axis.xaxis.set_label_position('top')
    axis.xaxis.labelpad = 20
    axis.xaxis.tick_top()
    plt.show()
uih = interact(histo_fun, Num_words=(1, 1000, 1), Zoom_image=(4, 500, 1))
# Give both sliders a readable caption and enough room to show it in full
for position, caption in enumerate(("Number of Words", "Bar Size")):
    uih.widget.children[position].style = {'description_width': 'initial'}
    uih.widget.children[position].description = caption
# Put x-axis on top 
# Also provide histogram of flip (least frequent)
# Add more sliders and make them more descriptive 

interactive(children=(IntSlider(value=500, description='Num_words', max=1000, min=1), IntSlider(value=252, des…

In [11]:
# Create a dictionary
# Map every distinct token in the processed documents to an integer id
dictionary = corpora.Dictionary(processed_text)
#count = 0 
#for k, v in dictionary.iteritems():
#    print(k, v)
#    count += 1
#    if count > 10:
#        break

## Topic Modeling via Bag-of-Words (Top 10 Words Per Topic)

**README:** The <mark style="background-color: lightblue">Lower Exclusion Filter</mark> parameter controls what words you want to exclude from the topic modeling analysis based on the number of documents words appear in (and it is a less than exclusion). For example, if you set this parameter to 25, then words appearing in fewer than 25 documents will be excluded from the analysis (the idea is that you do want to exclude words that are too rare and won't contribute much to analysis). In addition, the <mark style="background-color: lightblue">Upper Exclusion Filter</mark> parameter will control what words you want to exclude from the topic modeling analysis based on the proportion of documents words appear in (and is a more than exclusion). For example, if you set this parameter to 0.35, then words appearing in more than 35% of the documents will be excluded from the analysis (the idea is that you do want to exclude words that are too frequent and thus may be common words that aren't too informative for our purposes but could dominate the analysis due to their frequency). Lastly, the <mark style="background-color: lightblue">Topic Number</mark> parameter controls the number of topics that you want the topic model to produce and output. 

**NOTE:** <span style='color:red'> Changing model parameters will produce new output and may take a while to refresh </span>

In [12]:
# Interactive LDA modeling 
import threading
from IPython.display import display
import time

def _top_words_table(lda_model, id2word, words_per_topic=10):
    """Return a DataFrame with one row per topic: its id and its highest-weighted words."""
    rows = []
    for topic_id in range(lda_model.num_topics):
        terms = lda_model.get_topic_terms(topic_id, words_per_topic)
        rows.append([topic_id] + [id2word[term_id] for term_id, _ in terms])
    columns = ["Topic"] + ["Word" + str(rank) for rank in range(1, words_per_topic + 1)]
    return pd.DataFrame(rows, columns=columns)


def _best_document_per_topic(lda_model, bow_corpus):
    """Return (topic_id, doc_id, probability) for each topic's highest-probability document."""
    best = {}
    for doc_id, doc_bow in enumerate(bow_corpus):
        for topic_id, score in lda_model.get_document_topics(doc_bow):
            # Strict '>' keeps the earliest document on ties, matching a stable sort
            if topic_id not in best or score > best[topic_id][1]:
                best[topic_id] = (doc_id, score)
    return [(topic_id, doc_id, score) for topic_id, (doc_id, score) in sorted(best.items())]


def output(low_freq, high_freq, topic_num):
    """Fit an LDA model on the syllabi and summarise each topic.

    low_freq: words appearing in fewer than this many documents are dropped
        (passed to filter_extremes as no_below).
    high_freq: words appearing in more than this fraction of documents are
        dropped (passed to filter_extremes as no_above).
    topic_num: number of topics the model should produce.

    Returns a DataFrame with one row per topic holding the topic id, its ten
    highest-weighted words (Word1..Word10), and the title ('Associated
    Syllabus') and File_name of the document with the highest probability
    for that topic. Topics for which no document was reported get missing
    values in the syllabus columns.

    While the model is being fitted a progress bar is displayed; it is
    driven by a timer thread and is only an approximate indicator.
    """
    import copy

    progress = widgets.FloatProgress(value=0.0, min=0.0, max=1.0, description = "Loading:")
    done = threading.Event()
    outcome = {"ok": False}

    def work(progress):
        total = 200
        for step in range(total):
            # Wait up to 0.2 s per tick, but wake immediately once the model is done
            if done.wait(0.2):
                break
            progress.value = float(step + 1) / total
        # If the model outlasts the timer, keep waiting so the final label is still updated
        done.wait()
        if outcome["ok"]:
            progress.value = 1.0
            progress.description = "Completed:"
        else:
            progress.description = "Failed:"

    thread = threading.Thread(target=work, args=(progress,))
    display(progress)
    thread.start()
    try:
        # filter_extremes prunes the dictionary in place. Work on a copy so a
        # run never removes words from the shared dictionary; otherwise later
        # runs with looser filters could not get those words back.
        filtered_dictionary = copy.deepcopy(dictionary)
        filtered_dictionary.filter_extremes(no_below=low_freq, no_above=high_freq)
        bow_corpus = [filtered_dictionary.doc2bow(doc) for doc in processed_text]
        lda_model = gensim.models.LdaModel(bow_corpus, num_topics=topic_num, id2word=filtered_dictionary)

        out = _top_words_table(lda_model, filtered_dictionary)

        # Each row carries its own topic id, so a topic with no reported
        # documents cannot shift the labels of the topics after it.
        syllabi = pd.DataFrame(
            _best_document_per_topic(lda_model, bow_corpus),
            columns=['Topic', 'index', 'probability'],
        )
        syllabi = pd.merge(syllabi, map_names, how='left', on='index')
        syllabi = syllabi.drop(columns=['index', 'probability'])
        syllabi = syllabi.rename(columns={'title': 'Associated Syllabus'})
        out['Topic'] = out['Topic'].astype(int)
        syllabi['Topic'] = syllabi['Topic'].astype(int)
        newout = pd.merge(out, syllabi, how='left', on='Topic')
        outcome["ok"] = True
        return newout
    finally:
        # Always release the progress thread, even when model fitting raises
        done.set()
ui = interact_manual(output,low_freq=(1, 50, 1), high_freq=(0.05, 0.7, 0.05), topic_num = (10, 100, 10))
# Captions for the three sliders followed by the run button, in widget order
control_captions = (
    "Lower Exclusion Filter",
    "Upper Exclusion Filter",
    "Topic Number",
    "Run Model",
)
for position, caption in enumerate(control_captions):
    ui.widget.children[position].style = {'description_width': 'initial'}
    ui.widget.children[position].description = caption

# NOTES FOR UPDATING THIS: 
# Make threshold displays consistent 
# Give some kind of loading bar 
# IMPORTANT: Provide list of documents (course topics/number/title; link to syllabus) that topics are associated with 
# Upload drive links into dataframe 
# Print out the parameters for ease of interpretation 
# See if there is an export exact settings option/capability 
# Add in more descriptions for the sliders and graphs 

interactive(children=(IntSlider(value=25, description='low_freq', max=50, min=1), FloatSlider(value=0.35000000…

In [17]:
# Fixed-parameter run used by the exploratory cells below.
# Note: filter_extremes prunes the shared `dictionary` in place - modify the thresholds as needed.
dictionary.filter_extremes(no_below=10, no_above=0.5)
bow_corpus = list(map(dictionary.doc2bow, processed_text))
lda_model = gensim.models.LdaModel(corpus=bow_corpus, num_topics=10, id2word=dictionary)

In [18]:
# (word id, weight) pairs for the ten highest-weighted words of topic 1
lda_model.get_topic_terms(topicid=1, topn=10)

[(175, 0.0056847706),
 (118, 0.00557258),
 (2237, 0.00534765),
 (1169, 0.0051530767),
 (526, 0.004919075),
 (1068, 0.0044559077),
 (420, 0.003832775),
 (536, 0.003721289),
 (216, 0.0037066399),
 (1509, 0.0034964767)]

In [19]:
all_topics = lda_model.print_topics(10)
# One row per topic: the topic id (as text) followed by its ten highest-weighted words
top_words_per_topic = [
    [str(topic_id)]
    + [dictionary[term_id] for term_id, _ in lda_model.get_topic_terms(topic_id, 10)]
    for topic_id in range(lda_model.num_topics)
]
word_list = ["Topic"] + ["Word" + str(rank) for rank in range(1, 11)]
out = pd.DataFrame(top_words_per_topic, columns=word_list)
out # gives results without files 

Unnamed: 0,Topic,Word1,Word2,Word3,Word4,Word5,Word6,Word7,Word8,Word9,Word10
0,0,homework,exam,instructor,problem,language,material,textbook,written,part,attendance
1,1,history,essay,care,press,policy,health,york,read,literature,literary
2,2,political,chapter,theory,press,politics,science,march,read,social,policy
3,3,homework,exam,physics,language,unit,material,lecture,lesson,help,section
4,4,instructor,read,language,homework,make,note,assignment,grammar,conversation,well
5,5,assignment,workshop,read,homework,performance,group,project,written,three,scene
6,6,history,press,world,social,journal,modern,york,culture,international,politics
7,7,health,media,public,digital,project,section,game,journal,assignment,video
8,8,data,exam,project,analysis,chapter,health,group,social,presentation,homework
9,9,project,data,black,assignment,group,march,lecture,press,race,spring


In [21]:
# docs_per_topic[t] collects (document position, probability) for every document
# that the model associates with topic t
docs_per_topic = [list() for _ in range(len(all_topics))]
for doc_position, bag_of_words in enumerate(bow_corpus):
    for topic_id, probability in lda_model.get_document_topics(bag_of_words):
        docs_per_topic[topic_id].append((doc_position, probability))
# docs_per_topic #this gives you per topic, each documents prob associated with it 

[[(16, 0.014698016),
  (22, 0.12368267),
  (25, 0.017315991),
  (28, 0.19334552),
  (30, 0.52653784),
  (44, 0.69324607),
  (45, 0.6806296),
  (46, 0.19838734),
  (47, 0.1),
  (48, 0.16937852),
  (49, 0.38397455),
  (50, 0.23769383),
  (51, 0.520682),
  (54, 0.09862147),
  (57, 0.1),
  (58, 0.28004062),
  (62, 0.25817454),
  (63, 0.044584956),
  (65, 0.1),
  (68, 0.1),
  (69, 0.1),
  (73, 0.1),
  (76, 0.1),
  (78, 0.4097525),
  (81, 0.22731945),
  (83, 0.14545038),
  (84, 0.15949428),
  (85, 0.22948427),
  (87, 0.011768),
  (88, 0.1),
  (89, 0.1),
  (90, 0.1),
  (91, 0.1),
  (92, 0.1),
  (93, 0.1),
  (94, 0.1),
  (95, 0.1),
  (96, 0.1),
  (97, 0.1),
  (98, 0.1),
  (99, 0.11860191),
  (100, 0.12566507),
  (101, 0.19509496),
  (102, 0.11064395),
  (116, 0.03927571),
  (118, 0.13101995),
  (120, 0.9215487),
  (121, 0.9224043),
  (125, 0.08154353),
  (126, 0.010069244),
  (129, 0.26227626),
  (138, 0.1),
  (150, 0.37809655),
  (151, 0.40770537),
  (152, 0.35558334),
  (153, 0.6778551),
  (

In [None]:
 # NOTE(review): unexecuted scratch copy of the body of output(), pasted without its `def` line.
 # As written it cannot run on its own: the first statement is indented differently from the
 # rest, and topic_num is not defined anywhere at module level in the visible file.
 all_topics = lda_model.print_topics(topic_num)
    # Build one row per topic: the topic id (as text) followed by its ten highest-weighted words
    top_words_per_topic = []
    for t in range(lda_model.num_topics):
        y = [str(t)]
        x = lda_model.get_topic_terms(t,10)
        y.extend([dictionary[pair[0]] for pair in x])
        top_words_per_topic.append(y)
        y=[]
    top_words_per_topic
    # Column headers: "Topic", then "Word1" .. "Word10"
    word_list = ["Word" + str(i) for i in list(range(1,11))]
    word_list
    word_list.insert(0, "Topic")
    out = pd.DataFrame(top_words_per_topic, columns=word_list)
    # For every topic, collect (document position, probability) for each associated document
    docs_per_topic = [[] for _ in all_topics]
    for doc_id, doc_bow in enumerate(bow_corpus):
        doc_topics = lda_model.get_document_topics(doc_bow)
        for topic_id, score in doc_topics:
            docs_per_topic[topic_id].append((doc_id, score))
    # Sort each topic's documents by probability, highest first, and keep only the top one
    for doc_list in docs_per_topic:
        doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)
    docs_per_topic = [item[:1] for item in docs_per_topic]
    syllabi = pd.DataFrame([t for lst in docs_per_topic for t in lst], columns=['index', 'probability'])
    # Topic ids are assigned by position (0 .. topic_num-1); this assumes every topic
    # contributed exactly one row above
    numbers = list(range(0,topic_num))
    numbers = [y for x in numbers for y in (x,)*1]
    syllabi.insert(0, "Topic", numbers)
    # Attach title / file name via the row index, then drop the columns at positions 1 and 2
    # ('index' and 'probability')
    syllabi = pd.merge(syllabi, map_names, how='left', on='index')
    syllabi.drop(syllabi.columns[[1, 2]], axis = 1, inplace = True)
    syllabi = syllabi.rename(columns={'title': 'Associated Syllabus'})
    # Make the join key the same integer type on both sides before merging
    out['Topic']=out['Topic'].astype(int)
    syllabi['Topic']=syllabi['Topic'].astype(int)
    newout = pd.merge(out, syllabi, how='left', on='Topic')