# NLP Web App for Syllabi Data
resource: https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

https://stackoverflow.com/questions/32476336/how-to-abstract-bigram-topics-instead-of-unigrams-using-latent-dirichlet-allocat

https://www.analyticsvidhya.com/blog/2020/02/quick-introduction-bag-of-words-bow-tf-idf/

https://radimrehurek.com/gensim_3.8.3/auto_examples/tutorials/run_lda.html

# Put in Coherence value for each model as output and tell user what they are looking for 

In [1]:
# If an error occurs in this cell, it is likely because you need to pip install a package(s) 
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
import nltk
#nltk.download('wordnet')
from nltk.corpus import stopwords
#from nltk.stem.lancaster import LancasterStemmer
import gensim.corpora as corpora
from pprint import pprint 
import ast
from glob import glob
from collections import Counter
import re
import matplotlib.pyplot as plt
import csv
import ipywidgets as widgets 
from ipywidgets import interact, interact_manual, HBox, interactive
from wordcloud import WordCloud 
import seaborn as sns

In [2]:
# Set directory and get data - may need to modify this to wherever you have the data stored 
#os.chdir("/Users/jordan/Desktop/NLP_Syllabi_Project_copy/data_and_output")
data_orig = pd.read_csv("cleaned_data_new.csv", lineterminator='\n')
data_orig.drop(data_orig.columns[[0, 1, 2]], axis = 1, inplace = True) #drop meaningless indice columns 
data_orig['index'] = data_orig.index
# Create new data, just text and index 
map_names = data_orig[['title', 'File_name', 'index']] 
#print(data_orig.head())
#len(data_orig.index) #4889 documents 
data = data_orig[['Corpus', 'index']]

In [3]:
# This is where we set up the input approach (full syllabi vs. reduced syllabi (course descriptions) vs. bibliographic content)

# Further preprocessing (removes stop words and also words 3 characters or less
# and also non-english words) 
# Note: There was additional preprocessing done in previous steps (earlier scripts located in scripts folder)
words = set(nltk.corpus.words.words())
words.update(["africana"])
stop_words = stopwords.words('english')
def preprocess(text):
    result = []
    for token in text.split():
        if token not in stop_words and len(token) >3 and token in words:
            result.append(token)
    return result

#doc_sample = data[data['index'] == 1].values[0][0]
#print('original document: ')
#words = []
#for word in doc_sample.split(' '):
#    words.append(word)
#print(words)
#print('\n\n tokenized and document: ')
#print(preprocess(doc_sample))

processed_text = data['Corpus'].astype('str').map(preprocess)

In [4]:
# Create word lists for exploratory data analysis 

# Get all data row lists into one list 
all_words_list=[]
for index, row in processed_text.items():
    contri = row
    all_words_list.extend(contri)

# Get all data row lists with unique elements into one list (to see individual words by documents - removes duplicates within a single document)
all_docs_list=[]
for index, row in processed_text.items():
    contri = list(set(row))
    all_docs_list.extend(contri)

## Word Frequency Table

**README:** The <mark style="background-color: lightblue">Number of Words</mark> parameter will control the number of words you want a frequency table for in descending order of frequency; for example, if you choose 25 for this parameter, then you will get a frequency table for the 25 most frequent words (in the order of 1-25, with 1 being most frequent)

**NOTE:** <span style='color:red'> Output is truncated for tables with more than 10k words </span>

In [5]:
# Explore word frequencies - this shows the proportion of times a word appears across all 4889 syllabi documents 
pd.set_option('display.max_rows', 10000) # change this as needed

def word_interacter(num=(1,22369,10)):
    y = Counter(all_docs_list)
    freq = y.most_common(num)
    df = pd.DataFrame(freq, columns=["Word", "Proportion"])
    df['Proportion'] = df['Proportion'].div(4889).round(2) 
    #df['Percentage'] = (df['Percentage']*100).astype('str').str.strip(".0") + "%"
    return df
uit = interact(word_interacter, num=(1, 22369, 10))
uit.widget.children[0].style= {'description_width': 'initial'}
uit.widget.children[0].description = "Number of Words" 
# To save
#df.to_csv("word_freq.csv")
# NOTE: Add a filter by proportion 

interactive(children=(IntSlider(value=11181, description='num', max=22369, min=1, step=10), Output()), _dom_cl…

In [6]:
## Word Cloud of Most Common Words

In [7]:
# Create word cloud to see a visual of most common words 
# Didn't make this interactive because it takes too long to load and refresh 
#long_string = ' '.join(all_words_list)
#wcloud = WordCloud(background_color="white", max_words = 5000, 
#                         contour_width = 3, contour_color='steelblue')
#wcloud.generate(long_string)
#wcloud.to_image()

## Histogram of Word Frequencies 

**README:** The <mark style="background-color: lightblue">Number of Words</mark> parameter will control the number of words you want a histogram of word frequencies for in descending order of frequency; so if you choose 25 for this parameter, then you will get a histogram for the 25 most frequent words (in the order of 1-25, with 1 being most frequent). In addition, the <mark style="background-color: lightblue">Bar Size</mark> parameter will control the appearance of the histogram, specifically it changes the **size** of the bars so that the chart can be more easily viewed. With less words (i.e., when you make the Number of Words parameter smaller), you will likely need to decrease the size of the bars; to do this, you will scroll the Bar Size parameter **to the left**. To zoom in and make the bars bigger, scroll the Bar Size parameter **to the right**. 

In [8]:
# Histogram of words (gives up to 1000 most common words)
%matplotlib inline
def histo_fun(Num_words, Zoom_image):
    sns.set(rc={"figure.figsize": (8, Zoom_image)})
    counter=Counter(all_words_list)
    most=counter.most_common(Num_words)
    x, y= [], []
    for word,count in most[:]:
        x.append(word)
        y.append(count)
    p = sns.barplot(x=y,y=x) 
    p.set_xlabel('Word Count')    
    p.xaxis.set_label_position('top') 
    p.xaxis.labelpad = 20
    p.xaxis.tick_top()
    plt.show()
    #return p 
uih = interact(histo_fun, Num_words=(1, 1000, 1), Zoom_image=(4, 500, 1))
uih.widget.children[0].style= {'description_width': 'initial'}
uih.widget.children[0].description = "Number of Words" 
uih.widget.children[1].style= {'description_width': 'initial'}
uih.widget.children[1].description = "Bar Size"
# Put x-axis on top 
# Also provide histogram of flip (least frequent)
# Add more sliders and make them more descriptive 

interactive(children=(IntSlider(value=500, description='Num_words', max=1000, min=1), IntSlider(value=252, des…

In [9]:
# Create a dictionary
#dictionary = gensim.corpora.Dictionary(processed_text_new)
#count = 0 
#for k, v in dictionary.iteritems():
#    print(k, v)
#    count += 1
#    if count > 10:
#        break

## Topic Modeling via Latent Dirichlet Allocation (Top 10 Words Per Topic)

### This is a three step proccess that allows the user quite a bit of flexibility over how they analyze the data to come up with topics. Please follow the outlined steps and use the readme descriptions below! 

In [110]:
# Create option for text selection (Full Syllabi vs Reduced Syllabi vs. Bibliographic content (add that last one later))
#data = data_orig[['Corpus', 'index']] 
data = data_orig[['description', 'index']] 
data = data.rename(columns = {"description" : "Corpus"})
processed_text = data['Corpus'].astype('str').map(preprocess)

In [111]:
# Here we give options for different modeling approaches (BoW vs. N-grams)

# Main function that is interactive 

processed_text_new = processed_text

In [112]:
#processed_text_new

### Step 3: Running the Topic Model 

**README:** The <mark style="background-color: lightblue">Lower Exclusion Filter</mark> parameter controls what words you want to exclude from the topic modeling analysis based on the number of documents words appear in (and it is a less than exclusion). For example, if you set this parameter to 25, then words appearing in less than 25 documents will be excluded from the analaysis (the idea is that you do want to exclude words that are too rare and won't contribute much to analysis). In addition, the <mark style="background-color: lightblue">Upper Exclusion Filter</mark> parameter will control what words you want to exclude from the topic modeling analysis based on the proportion of documents words appear in (and is a more than exclusion). For example, if you set this parameter to 0.35, then words appearing in more than 35% of the documents will be excluded from the analysis (the idea is that you do want to exclude words that are too frequent and thus may be common words that aren't too informative for our purposes but could dominate the analysis due to their frequency). Lastly, the <mark style="background-color: lightblue">Topic Number</mark> parameter controls the number of topics that you want the topic model to produce and output. 

**NOTE:** <span style='color:red'> Changing model parameters will produce new output and may take awhile to refresh.  </span>
    
**IMPORTANT:** <span style='color:red'> MAKE SURE THE `SPECIFICATIONS` OUTPUT BELOW MATCHES WITH THE OPTIONS YOU SELECTED ABOVE FOR DATA REDUCTION AND ANALYSIS APPROACH!!  </span>

In [113]:
# Interactive LDA modeling 
import threading
from IPython.display import display
import time

# function 
dictionary = gensim.corpora.Dictionary(processed_text_new)

    
# NOTES FOR UPDATING THIS: 
# Make threshold displays consistent 
# Give some kind of loading bar 
# IMPORTANT: Provide list of documents (course topics/number/title; link to syllabus) that topics are associated with 
# Upload drive links into dataframe 
# Print out the parameters for ease of interpretation 
# See if there is an export exact settings option/capability 
# Add in more descriptions for the sliders and graphs 

In [114]:
dictionary.filter_extremes(no_below=25, no_above = 0.35) #modify these 

In [115]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_text_new]

In [116]:
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=50, id2word=dictionary) 
lda_model

<gensim.models.ldamodel.LdaModel at 0x143ca86a0>

In [117]:
#lda_model.print_topics(50)

In [118]:
all_topics = lda_model.print_topics(50)
top_words_per_topic = []
for t in range(lda_model.num_topics):
    y = [str(t)]
    x = lda_model.get_topic_terms(t,10)
    y.extend([dictionary[pair[0]] for pair in x])
    top_words_per_topic.append(y)
    y=[]
#top_words_per_topic

In [119]:
word_list = ["Word" + str(i) for i in list(range(1,11))]
word_list
word_list.insert(0, "Topic")

In [120]:
#word_list

In [121]:
#top_words_per_topic

In [122]:
out = pd.DataFrame(top_words_per_topic, columns=word_list)
#out

In [123]:
docs_per_topic = [[] for _ in all_topics]

In [124]:
#docs_per_topic

In [125]:
for doc_id, doc_bow in enumerate(bow_corpus):
    doc_topics = lda_model.get_document_topics(doc_bow)
    for topic_id, score in doc_topics:
        docs_per_topic[topic_id].append((doc_id, score))

In [126]:
#docs_per_topic

In [127]:
for doc_list in docs_per_topic:
    doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)

In [128]:
#doc_list

In [129]:
docs_per_topic = [item[:10] for item in docs_per_topic]

In [130]:
#docs_per_topic

In [132]:
data_list = []
for lst in docs_per_topic:
    x = pd.DataFrame(lst, columns=["index", "probability"])
    y = pd.merge(x, map_names, how='left', on='index')
    y.drop(y.columns[[0,1]], axis=1, inplace=True)
    y = pd.DataFrame(y.stack())
    y = y.transpose()
    y = pd.DataFrame(y.values, columns=["syllabus 1", "filename 1", "syllabus 2", "filename 2", "syllabus 3", "filename 3", "syllabus 4", 
                                     "filename 4", "syllabus 5", "filename 5", "syllabus 6", "filename 6", "syllabus 7", "filename 7",
                                     "syllabus 8", "filename 8", "syllabus 9", "filename 9", "syllabus 10", "filename 10"])
    data_list.append(y)

AssertionError: Number of manager items must equal union of block items
# manager items: 20, # tot_items: 18

In [205]:
x = pd.DataFrame(docs_per_topic[2], columns=["index", "probability"])
x

Unnamed: 0,index,probability
0,3380,0.959164
1,563,0.955453
2,564,0.955453
3,3005,0.955453
4,3006,0.955453
5,119,0.950997
6,2617,0.950997
7,713,0.94842
8,714,0.94842
9,1917,0.94842


In [206]:
y = pd.merge(x, map_names, how='left', on='index')
y

Unnamed: 0,index,probability,title,File_name
0,3380,0.959164,On the Dawn of Modernity,QYyHUht1513rdnq4bTOCN8Cisi5ARbJJBvLpYNhf.pdf
1,563,0.955453,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx
2,564,0.955453,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx
3,3005,0.955453,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf
4,3006,0.955453,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf
5,119,0.950997,The Importance of Intellectual Property in Bio...,3DoLdm6pwe3vJLHc6PHsMHOIB3zQQi1aP1wetkkp.pdf
6,2617,0.950997,The Importance of Intellectual Property in Bio...,snCVIf6jPAJkiWkng1XXqhEvM1xDdckFtWsD0djP.pdf
7,713,0.94842,Third-Year Arabic,QRBNZ32VcDxOjuZjzvbLFB4LNH29GoofMdWjX7iA.pdf
8,714,0.94842,Third-Year Arabic,pkC6ENPWiOq1sGVhSLCz45WBC5A52Gw3eUZkok2W.pdf
9,1917,0.94842,Third-Year Arabic,VLN1KpsxKFfstxDw9whgSNohdnk5LmS687ZN7ObK.pdf


In [207]:
#data_list
y.drop(y.columns[[0,1]], axis=1, inplace=True)
y

Unnamed: 0,title,File_name
0,On the Dawn of Modernity,QYyHUht1513rdnq4bTOCN8Cisi5ARbJJBvLpYNhf.pdf
1,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx
2,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx
3,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf
4,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf
5,The Importance of Intellectual Property in Bio...,3DoLdm6pwe3vJLHc6PHsMHOIB3zQQi1aP1wetkkp.pdf
6,The Importance of Intellectual Property in Bio...,snCVIf6jPAJkiWkng1XXqhEvM1xDdckFtWsD0djP.pdf
7,Third-Year Arabic,QRBNZ32VcDxOjuZjzvbLFB4LNH29GoofMdWjX7iA.pdf
8,Third-Year Arabic,pkC6ENPWiOq1sGVhSLCz45WBC5A52Gw3eUZkok2W.pdf
9,Third-Year Arabic,VLN1KpsxKFfstxDw9whgSNohdnk5LmS687ZN7ObK.pdf


In [208]:
y = pd.DataFrame(y.stack())
y

Unnamed: 0,Unnamed: 1,0
0,title,On the Dawn of Modernity
0,File_name,QYyHUht1513rdnq4bTOCN8Cisi5ARbJJBvLpYNhf.pdf
1,title,Intermediate German I
1,File_name,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx
2,title,Intermediate German I
2,File_name,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx
3,title,Intermediate German I
3,File_name,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf
4,title,Intermediate German I
4,File_name,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf


In [209]:
y = y.transpose()
y

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9
Unnamed: 0_level_1,title,File_name,title,File_name,title,File_name,title,File_name,title,File_name,title,File_name,title,File_name,title,File_name,title,File_name,title,File_name
0,On the Dawn of Modernity,QYyHUht1513rdnq4bTOCN8Cisi5ARbJJBvLpYNhf.pdf,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf,The Importance of Intellectual Property in Bio...,3DoLdm6pwe3vJLHc6PHsMHOIB3zQQi1aP1wetkkp.pdf,The Importance of Intellectual Property in Bio...,snCVIf6jPAJkiWkng1XXqhEvM1xDdckFtWsD0djP.pdf,Third-Year Arabic,QRBNZ32VcDxOjuZjzvbLFB4LNH29GoofMdWjX7iA.pdf,Third-Year Arabic,pkC6ENPWiOq1sGVhSLCz45WBC5A52Gw3eUZkok2W.pdf,Third-Year Arabic,VLN1KpsxKFfstxDw9whgSNohdnk5LmS687ZN7ObK.pdf


In [210]:
y = pd.DataFrame(y.values, columns=["syllabus 1", "filename 1", "syllabus 2", "filename 2", "syllabus 3", "filename 3", "syllabus 4", 
                                     "filename 4", "syllabus 5", "filename 5", "syllabus 6", "filename 6", "syllabus 7", "filename 7",
                                     "syllabus 8", "filename 8", "syllabus 9", "filename 9", "syllabus 10", "filename 10"])
y

Unnamed: 0,syllabus 1,filename 1,syllabus 2,filename 2,syllabus 3,filename 3,syllabus 4,filename 4,syllabus 5,filename 5,syllabus 6,filename 6,syllabus 7,filename 7,syllabus 8,filename 8,syllabus 9,filename 9,syllabus 10,filename 10
0,On the Dawn of Modernity,QYyHUht1513rdnq4bTOCN8Cisi5ARbJJBvLpYNhf.pdf,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf,The Importance of Intellectual Property in Bio...,3DoLdm6pwe3vJLHc6PHsMHOIB3zQQi1aP1wetkkp.pdf,The Importance of Intellectual Property in Bio...,snCVIf6jPAJkiWkng1XXqhEvM1xDdckFtWsD0djP.pdf,Third-Year Arabic,QRBNZ32VcDxOjuZjzvbLFB4LNH29GoofMdWjX7iA.pdf,Third-Year Arabic,pkC6ENPWiOq1sGVhSLCz45WBC5A52Gw3eUZkok2W.pdf,Third-Year Arabic,VLN1KpsxKFfstxDw9whgSNohdnk5LmS687ZN7ObK.pdf


In [211]:
data_list.append(y)
#data_list
df = pd.concat(data_list)
df

Unnamed: 0,syllabus 1,filename 1,syllabus 2,filename 2,syllabus 3,filename 3,syllabus 4,filename 4,syllabus 5,filename 5,syllabus 6,filename 6,syllabus 7,filename 7,syllabus 8,filename 8,syllabus 9,filename 9,syllabus 10,filename 10
0,Critical Reading and Writing I: The Academic E...,1Gkhk3tXpUjzgrauzUXfiZ5h4n9nxUkkbg3NFeCH.docx,Critical Reading and Writing I: The Academic E...,EQd5VpCQb0MKlAUpleoGpKyR8qvfJiSw3lcLzoIt.pdf,Critical Reading and Writing I: The Academic E...,7yQzttE8qJclUbUNgWusHDH5kgdkJFCkePKnbNoW.docx,Critical Reading and Writing I: The Academic E...,LHtQVAFYujAXDKiYVEOUZj2yfoqpnH0QdoPWtepe.pdf,Critical Reading and Writing I: The Academic E...,VvK0hcrGvvV7tx6Fwm9NstDCcbsE9C6tQ6ZNuA2b.pdf,Critical Reading and Writing I: The Academic E...,fyFtBceWhAE0mdMFOzZ0iDfwqVf5XksRlR8DaH3H.docx,Critical Reading and Writing I: The Academic E...,khbiYPKff3NMBB5t7zEoe9uVGOPcmHVAd73TNSRd.docx,Critical Reading and Writing I: The Academic E...,5LHj0qB2O2VezwSAwURbit73Qe1Jl21EBZfwW4q6.docx,Critical Reading and Writing I: The Academic E...,m39BPfzp4hYSuj5G8pkscrzWdhWvUOAiFOHg5RBo.docx,Critical Reading and Writing I: The Academic E...,cTbdrZRHZTZrnWtrBwrE4jcQeI7FysSV12NbTr8e.pdf
0,Theory and Practice of Engaged Scholarship (ES...,YvbOMP7OFx3LNH6RXOHigs4BImavhfUzKALuzh6s.pdf,Theory and Practice of Engaged Scholarship (ES...,Kt8H0RrJdtn5kA8lqLNQHyd1AJxWOhaGyFIYBR6y.pdf,Theory and Practice of Engaged Scholarship (ES...,r0WxR8UL3ElGN0XxXSTWaCYUEZGrRHeVJBHdcWoM.pdf,Theory and Practice of Engaged Scholarship (ES...,n7hHuvywhLXfVhqY01S9tolfZhvIENzPxigNJKrs.pdf,Introduction to Acting and Directing,9W57sMnKVCAhIwCbx0WxJRbgAobBVELCkzrrYE4b.pdf,Introduction to Acting and Directing,iItoHLTQJmvMgF8HBomo3qzAGsdQftRsW4l5dnu5.pdf,Introduction to Acting and Directing,FIhnspvVnVbh8BezkR3NIt7YzgSuC9ie4bRSvkjA.pdf,Introduction to Acting and Directing,YauhF3Br0wtySYfHJh3cGETEljPJjqnVUS8y4UNd.pdf,Introduction to Acting and Directing,HEWv9dSjm0ppwPrNYQAbpRbmV7ieYtJbgS41Ltyj.pdf,Introduction to Acting and Directing,i6qJ8Z5HVPE4t3h78lPQ37U8ZjGYxTbvXwwc9iSj.pdf
0,On the Dawn of Modernity,QYyHUht1513rdnq4bTOCN8Cisi5ARbJJBvLpYNhf.pdf,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx,Intermediate German I,ezhiG6p0IlOV0HyC36j0yS0U78MM4Jsl95KO7dxm.docx,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf,Intermediate German I,jDtJUA1OYzcaKC8ElLtXX6BEr8OnTM062LIgraVG.pdf,The Importance of Intellectual Property in Bio...,3DoLdm6pwe3vJLHc6PHsMHOIB3zQQi1aP1wetkkp.pdf,The Importance of Intellectual Property in Bio...,snCVIf6jPAJkiWkng1XXqhEvM1xDdckFtWsD0djP.pdf,Third-Year Arabic,QRBNZ32VcDxOjuZjzvbLFB4LNH29GoofMdWjX7iA.pdf,Third-Year Arabic,pkC6ENPWiOq1sGVhSLCz45WBC5A52Gw3eUZkok2W.pdf,Third-Year Arabic,VLN1KpsxKFfstxDw9whgSNohdnk5LmS687ZN7ObK.pdf


In [190]:
#data_list=[]