# <h1><center><span style='color:darkblue'>Welcome to the Interactive NLP Dashboard for Course Syllabi!</span></center></h1>

In [1]:
# If an error occurs in this cell, it is likely because you need to pip install a package(s) 
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
import nltk
#nltk.download('wordnet')
from nltk.corpus import stopwords
#from nltk.stem.lancaster import LancasterStemmer
import gensim.corpora as corpora
from pprint import pprint 
import ast
from glob import glob
from collections import Counter
import re
import matplotlib.pyplot as plt
import csv
import ipywidgets as widgets 
from ipywidgets import interact, interact_manual, HBox, interactive
#from wordcloud import WordCloud 
import seaborn as sns
from gensim.models.coherencemodel import CoherenceModel
from IPython.display import Markdown, display
import nltk
nltk.download('words')
nltk.download('stopwords')
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

[nltk_data] Downloading package words to /Users/jordan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jordan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the cleaned syllabi data. The CSV is read from the working directory;
# uncomment and adapt the chdir below if your copy is stored elsewhere.
#os.chdir("/Users/jordan/Desktop/NLP_Syllabi_Project_copy/data_and_output")
data_orig = pd.read_csv("cleaned_data_new.csv", lineterminator='\n')

# The first three columns are meaningless index columns; discard them.
leftover_index_columns = data_orig.columns[[0, 1, 2]]
data_orig.drop(columns=leftover_index_columns, inplace=True)

# Expose the row index as a regular column so it can be used as a join key.
data_orig['index'] = data_orig.index

# Lookup table from row index to syllabus title / file name.
map_names = data_orig[['title', 'File_name', 'index']]

# Working frame: just the text and the index (4889 documents at time of writing).
data = data_orig[['Corpus', 'index']]

In [3]:
# This is where we set up the input approach (full syllabi vs. reduced syllabi (course descriptions) vs. bibliographic content)

# Further preprocessing (removes stop words and also words 3 characters or less
# and also non-english words) 
# Note: There was additional preprocessing done in previous steps (earlier scripts located in scripts folder)
# Vocabulary of accepted words: the NLTK word list plus the extra term "africana".
words = set(nltk.corpus.words.words()) | {"africana"}
# English stop words from NLTK; preprocess() discards tokens found in this list.
stop_words = stopwords.words('english')
def preprocess(text):
    """Split ``text`` on whitespace and keep only the informative tokens.

    A token is kept when it is longer than 3 characters, is not in the
    module-level ``stop_words`` collection, and is in the module-level
    ``words`` vocabulary.

    Parameters
    ----------
    text : str
        The document text to filter.

    Returns
    -------
    list of str
        The kept tokens in their original order, duplicates included.
    """
    # stop_words is a list, so a direct membership test scans it for every
    # token. Hashing it once per document makes each lookup constant time.
    blocked = frozenset(stop_words)
    return [
        token
        for token in text.split()
        # cheapest test first; the result is the same as the original ordering
        if len(token) > 3 and token not in blocked and token in words
    ]

#doc_sample = data[data['index'] == 1].values[0][0]
#print('original document: ')
#words = []
#for word in doc_sample.split(' '):
#    words.append(word)
#print(words)
#print('\n\n tokenized and document: ')
#print(preprocess(doc_sample))

# Tokenise and filter every document. The cast to str guarantees preprocess()
# always receives a string it can call .split() on.
processed_text = data['Corpus'].astype('str').map(preprocess)

In [4]:
# Create word lists for exploratory data analysis 

# Get all data row lists into one list 
# Every token of every document, duplicates included (corpus-wide word counts)
all_words_list = [token for doc_tokens in processed_text for token in doc_tokens]

# Each document contributes a given word at most once, so counting this list
# tells you in how many documents a word appears
all_docs_list = [token for doc_tokens in processed_text for token in set(doc_tokens)]

## <span style='color:darkblue'>Word Frequency Table</span>

**README:** The <mark style="background-color: lightblue">Number of Words</mark> parameter will control the number of words you want a frequency table for in descending order of frequency; for example, if you choose 25 for this parameter, then you will get a frequency table for the 25 most frequent words (in the order of 1-25, with 1 being most frequent)

**NOTE:** <span style='color:red'> Output is truncated for tables with more than 10k words </span>

In [5]:
# Explore word frequencies - this shows the proportion of syllabi documents each word appears in
pd.set_option('display.max_rows', 10000) # change this as needed

# all_docs_list holds each word once per document it occurs in, so counting it
# gives document frequencies. It is counted once here instead of on every
# slider move; re-run this cell if all_docs_list is rebuilt.
word_doc_counts = Counter(all_docs_list)
# Derived from the data rather than hard-coded, so the proportions and the
# slider range stay correct when the input file changes.
n_documents = len(processed_text)
vocab_size = max(len(word_doc_counts), 1)

def word_interacter(num=(1,22369,10)):
    """Return a table of the ``num`` most widespread words.

    Parameters
    ----------
    num : int
        How many words to list, most frequent first. The tuple default is only
        an ipywidgets slider abbreviation and is overridden by the explicit
        ``num=...`` passed to ``interact`` below; direct callers must pass an int.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word`` and ``Proportion``, where ``Proportion`` is the share
        of documents containing the word, rounded to 2 decimals.
    """
    freq = word_doc_counts.most_common(num)
    df = pd.DataFrame(freq, columns=["Word", "Proportion"])
    df['Proportion'] = df['Proportion'].div(n_documents).round(2)
    return df
uit = interact(word_interacter, num=(1, vocab_size, 10))
uit.widget.children[0].style= {'description_width': 'initial'}
uit.widget.children[0].description = "Number of Words" 
# To save
#df.to_csv("word_freq.csv")
# NOTE: Add a filter by proportion 

interactive(children=(IntSlider(value=11181, description='num', max=22369, min=1, step=10), Output()), _dom_cl…

In [6]:
## Word Cloud of Most Common Words

In [7]:
# Create word cloud to see a visual of most common words 
# Didn't make this interactive because it takes too long to load and refresh 
#long_string = ' '.join(all_words_list)
#wcloud = WordCloud(background_color="white", max_words = 5000, 
#                         contour_width = 3, contour_color='steelblue')
#wcloud.generate(long_string)
#wcloud.to_image()

## <span style='color:darkblue'>Histogram of Word Frequencies</span>

**README:** The <mark style="background-color: lightblue">Number of Words</mark> parameter will control the number of words you want a histogram of word frequencies for in descending order of frequency; so if you choose 25 for this parameter, then you will get a histogram for the 25 most frequent words (in the order of 1-25, with 1 being most frequent). In addition, the <mark style="background-color: lightblue">Bar Size</mark> parameter will control the appearance of the histogram; specifically, it changes the **size** of the bars so that the chart can be more easily viewed. With fewer words (i.e., when you make the Number of Words parameter smaller), you will likely need to decrease the size of the bars; to do this, scroll the Bar Size parameter **to the left**. To zoom in and make the bars bigger, scroll the Bar Size parameter **to the right**. 

In [8]:
# Histogram of words (gives up to 1000 most common words)
%matplotlib inline
def histo_fun(Num_words=50, Zoom_image=20):
    """Draw a horizontal bar chart of the most frequent words in the corpus.

    Num_words  -- how many of the most common words to plot
    Zoom_image -- figure height passed to seaborn (the width is fixed at 8)
    """
    sns.set(rc={"figure.figsize": (8, Zoom_image)})
    top_words = Counter(all_words_list).most_common(Num_words)
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    ax = sns.barplot(x=counts, y=labels)
    ax.set_xlabel('Word Count')
    # Move the count axis (label and ticks) to the top of the chart
    x_axis = ax.xaxis
    x_axis.set_label_position('top')
    x_axis.labelpad = 20
    x_axis.tick_top()
    plt.show()
uih = interact(histo_fun, Num_words=(1, 1000, 1), Zoom_image=(4, 500, 1))
# Replace the auto-generated slider captions (the parameter names) with readable labels
for slider, caption in zip(uih.widget.children, ("Number of Words", "Bar Size")):
    slider.style = {'description_width': 'initial'}
    slider.description = caption
# Put x-axis on top 
# Also provide histogram of flip (least frequent)
# Add more sliders and make them more descriptive 

interactive(children=(IntSlider(value=50, description='Num_words', max=1000, min=1), IntSlider(value=20, descr…

In [9]:
# Create a dictionary
#dictionary = gensim.corpora.Dictionary(processed_text_new)
#count = 0 
#for k, v in dictionary.iteritems():
#    print(k, v)
#    count += 1
#    if count > 10:
#        break

## <span style='color:darkblue'>Topic Modeling via Latent Dirichlet Allocation (Top 10 Words Per Topic)</span> 

This is a three-step process that allows the user quite a bit of flexibility over how they analyze the data to come up with topics. Please follow the outlined steps and use the readme descriptions below.

### Step 1: Selecting Your Data

**README:** The 'Data' drop-down tab allows you to select the type of syllabi data you wish to work with. The two options are the full syllabi content (i.e., everything contained within syllabi) or just the course descriptions content. The default option is the full syllabi content. 

In [10]:
# Create option for text selection (Full Syllabi vs Reduced Syllabi vs. Bibliographic content (add that last one later))
def data_grab(column="Full Syllabi"):
    """Select which text of ``data_orig`` the rest of the dashboard analyses.

    Parameters
    ----------
    column : str
        ``"Full Syllabi"`` uses the ``Corpus`` column and
        ``"Course Descriptions Only"`` uses the ``description`` column.
        Any other value leaves the current selection untouched.

    Side effects
    ------------
    Rebinds the globals ``processed_text`` (the preprocessed token lists) and
    ``counter`` (the label shown in the Step 3 specifications line). It also
    discards any earlier Step 2 result (``processed_text_new``, ``counter2``),
    because that result was derived from the previously selected text.
    """
    global processed_text
    global counter
    # option -> (source column, specifications label, confirmation message)
    sources = {
        "Full Syllabi": (
            "Corpus",
            "full syllabi",
            "You are now analyzing the full syllabi content",
        ),
        "Course Descriptions Only": (
            "description",
            "course description",
            "You are now analyzing just course description content",
        ),
    }
    if column not in sources:
        return
    source_column, label, message = sources[column]
    # The series is always named "Corpus", whichever column it came from.
    processed_text = (
        data_orig[source_column].rename("Corpus").astype('str').map(preprocess)
    )
    counter = label
    # Without this, Step 3 would model the old text while reporting the new
    # data source in its SPECIFICATIONS line.
    module_state = globals()
    had_stale_approach = "processed_text_new" in module_state
    for stale_name in ("processed_text_new", "counter2"):
        module_state.pop(stale_name, None)
    printmd(message)
    if had_stale_approach:
        printmd("Please re-apply a modeling approach in Step 2 before running the topic model")
# Interactive piece 
# Render the 'Data' drop-down and bind it to data_grab; the selected option is
# passed to data_grab as its `column` argument.
ui_1 = interact(data_grab, column=widgets.Dropdown(options=["Full Syllabi","Course Descriptions Only"], description="Data")) 
#print(data.head())

interactive(children=(Dropdown(description='Data', options=('Full Syllabi', 'Course Descriptions Only'), value…

### Step 2: Selecting an Analytic Approach (Bag-of-Words vs. N-grams) 

**README:** The 'Options' drop-down box allows you to select from three options: Bag-of-Words, Bi-grams, and Tri-grams. Once you have selected an approach, hit the 'Apply Approach' button to apply this change. Note that you will need to select and apply an approach before you can run the topic model in Step 3. 


In [11]:
# Here we give options for different modeling approaches (BoW vs. N-grams)

# Main function that is interactive 
def data_chooser(approach="Bag-of-Words"):
    """Build the token lists used by the topic model for the chosen approach.

    approach -- "Bag-of-Words", "Bi-gram" or "Tri-gram"; any other value is a no-op.

    Stores the result in the global ``processed_text_new`` and a short
    description of the approach in the global ``counter2``.
    """
    global processed_text_new
    global counter2
    if approach == "Bag-of-Words":
        # Tokens are used as they are
        processed_text_new = processed_text
        counter2 = "Bag-of-Words model"
        printmd("You are now working with the Bag-of-Words data")
    elif approach == "Bi-gram":
        bigram_phrases = gensim.models.Phrases(processed_text, min_count=20)
        bigram_phraser = gensim.models.phrases.Phraser(bigram_phrases)
        processed_text_new = pd.Series(
            [bigram_phraser[doc_tokens] for doc_tokens in processed_text]
        )
        counter2 = "bi-gram model"
        printmd("You are now working with bi-gram data")
    elif approach == "Tri-gram":
        bigram_phrases = gensim.models.Phrases(processed_text, min_count=20)
        # The tri-gram model is trained on the bi-grammed text
        trigram_phrases = gensim.models.Phrases(bigram_phrases[processed_text], threshold=20)
        # Phraser objects are the faster way to apply the trained models
        bigram_phraser = gensim.models.phrases.Phraser(bigram_phrases)
        trigram_phraser = gensim.models.phrases.Phraser(trigram_phrases)
        processed_text_new = pd.Series(
            [trigram_phraser[bigram_phraser[doc_tokens]] for doc_tokens in processed_text]
        )
        counter2 = "tri-gram model"
        printmd("You are now working with tri-gram data")
# Interactive piece 
ui_2 = interact_manual(data_chooser, approach=widgets.Dropdown(options=["Bag-of-Words","Bi-gram","Tri-gram"], description="Options"))
# The second child of the manual widget is relabelled as the apply button
apply_button = ui_2.widget.children[1]
apply_button.style = {'description_width': 'initial'}
apply_button.description = "Apply Approach"

interactive(children=(Dropdown(description='Options', options=('Bag-of-Words', 'Bi-gram', 'Tri-gram'), value='…

### Step 3: Running the Topic Model 

**README:** The <mark style="background-color: lightblue">Lower Exclusion Filter</mark> parameter controls which words you want to exclude from the topic modeling analysis based on the number of documents the words appear in (it is a "fewer than" exclusion). For example, if you set this parameter to 25, then words appearing in fewer than 25 documents will be excluded from the analysis (the idea is that you do want to exclude words that are too rare and won't contribute much to the analysis). In addition, the <mark style="background-color: lightblue">Upper Exclusion Filter</mark> parameter controls which words you want to exclude from the topic modeling analysis based on the proportion of documents the words appear in (it is a "more than" exclusion). For example, if you set this parameter to 0.35, then words appearing in more than 35% of the documents will be excluded from the analysis (the idea is that you do want to exclude words that are too frequent and thus may be common words that aren't too informative for our purposes but could dominate the analysis due to their frequency). Lastly, the <mark style="background-color: lightblue">Topic Number</mark> parameter controls the number of topics that you want the topic model to produce and output. 

**NOTE:** <span style='color:red'> Changing model parameters will produce new output and may take a while to refresh.  </span>
    
**IMPORTANT:** <span style='color:red'> The specifications output reflects the options selected in Steps 1 and 2 </span>

In [14]:
# Interactive LDA modeling 
import threading
from IPython.display import display
import time

# function 
# How many words / syllabi are reported per topic. These are constants rather
# than parameters of output() because interact_manual builds one widget per
# parameter of the function it wraps.
_TOP_WORDS_PER_TOPIC = 10
_TOP_DOCS_PER_TOPIC = 10


def _start_progress_bar(done, outcome):
    """Show a progress bar and animate it on a background thread until ``done`` is set."""
    progress = widgets.FloatProgress(value=0.0, min=0.0, max=1.0, description = "Loading:")
    display(progress)

    def animate():
        total = 200
        for step in range(1, total + 1):
            # wait() is both the 0.2 s tick and the stop check: it returns True
            # as soon as the main thread signals that the work has ended
            if done.wait(0.2):
                break
            progress.value = float(step) / total
        # If the animation ran out before the work did, keep waiting so the
        # final state is still reported
        done.wait()
        if outcome["ok"]:
            progress.value = progress.max
            progress.description = "Completed:"
        else:
            progress.description = "Failed:"

    threading.Thread(target=animate).start()


def _top_words_table(lda_model, dictionary):
    """Return one row per topic holding that topic's highest-weighted words."""
    word_columns = ["Word" + str(i) for i in range(1, _TOP_WORDS_PER_TOPIC + 1)]
    rows = []
    for topic_id in range(lda_model.num_topics):
        terms = lda_model.get_topic_terms(topic_id, _TOP_WORDS_PER_TOPIC)
        row = [dictionary[term_id] for term_id, _weight in terms]
        # Pad when the filtered dictionary has fewer terms than requested
        row.extend([None] * (len(word_columns) - len(row)))
        rows.append(row)
    return pd.DataFrame(rows, columns=word_columns).rename_axis("Topic", axis="columns")


def _top_documents_table(lda_model, bow_corpus):
    """Return one row per topic with the titles / file names of its top-scoring documents."""
    docs_per_topic = [[] for _ in range(lda_model.num_topics)]
    for doc_id, doc_bow in enumerate(bow_corpus):
        for topic_id, score in lda_model.get_document_topics(doc_bow):
            docs_per_topic[topic_id].append((doc_id, score))

    columns = [
        kind + " " + str(rank)
        for rank in range(1, _TOP_DOCS_PER_TOPIC + 1)
        for kind in ("syllabus", "filename")
    ]
    rows = []
    for scored_docs in docs_per_topic:
        scored_docs.sort(key=lambda id_and_score: id_and_score[1], reverse=True)
        top_ids = [doc_id for doc_id, _score in scored_docs[:_TOP_DOCS_PER_TOPIC]]
        # Explicit dtype so a topic with no documents still merges cleanly
        ranked = pd.DataFrame({"index": pd.Series(top_ids, dtype="int64")})
        named = pd.merge(ranked, map_names, how='left', on='index')
        # Flatten row by row: title 1, file 1, title 2, file 2, ...
        # Missing titles stay in place as NaN so later cells keep their column.
        row = named[['title', 'File_name']].values.ravel().tolist()[:len(columns)]
        # Topics with fewer associated documents are padded instead of failing
        row.extend([float("nan")] * (len(columns) - len(row)))
        rows.append(row)
    return pd.DataFrame(rows, columns=columns).rename_axis("Topic", axis="columns")


def output(low_freq, high_freq, topic_num):
    """Fit an LDA topic model on the Step 2 data and report its topics.

    Parameters
    ----------
    low_freq : int
        Passed to ``Dictionary.filter_extremes`` as ``no_below``.
    high_freq : float
        Passed to ``Dictionary.filter_extremes`` as ``no_above``.
    topic_num : int
        Number of topics for the LDA model.

    Returns
    -------
    pandas.DataFrame or None
        One row per topic with the titles and file names of its top ten
        associated syllabi. ``None`` when Steps 1 and 2 have not been applied;
        an error message is displayed in that case.

    Displays the specifications line, the u_mass coherence score and the table
    of top ten words per topic before returning.

    NOTE: ``LdaModel`` is called without ``random_state``, so the topics may
    differ from one run to the next.
    """
    # Read the Step 1 / Step 2 state up front, so only a missing selection (and
    # not some unrelated NameError further down) produces the Step 2 message
    try:
        texts, data_label, approach_label = processed_text_new, counter, counter2
    except NameError:
        printmd("**<span style='color:red'>ERROR! You have not selected a modeling approach!</span>**")
        printmd("**<span style='color: red'>Please go back and apply a modeling approach in Step 2</span>**")
        return

    done = threading.Event()
    outcome = {"ok": False}
    _start_progress_bar(done, outcome)
    try:
        dictionary = gensim.corpora.Dictionary(texts)
        dictionary.filter_extremes(no_below=low_freq, no_above = high_freq)
        bow_corpus = [dictionary.doc2bow(doc) for doc in texts]
        lda_model = gensim.models.LdaModel(bow_corpus, num_topics=topic_num, id2word=dictionary)
        out = _top_words_table(lda_model, dictionary)
        df = _top_documents_table(lda_model, bow_corpus)
        # Coherence
        cm = CoherenceModel(model=lda_model, corpus=bow_corpus, coherence='u_mass')
        coherence = round(cm.get_coherence(), 2)
        outcome["ok"] = True
    finally:
        # Always release the progress thread, whatever happened above
        done.set()

    printmd("**SPECIFICATIONS:** You are analyzing the " + data_label + " content with a " + approach_label)
    print(" ")
    printmd("**Model Coherence:** " + str(coherence) + " (the higher the value, the better the model fit)")
    display(out) # use this to return other output
    print('')
    printmd("Now the top ten most associated syllabi for each topic with file names...")
    return df
# Make it interactive 
ui_3 = interact_manual(output,low_freq=(1, 50, 1), high_freq=(0.05, 0.7, 0.05), topic_num = (10, 100, 10))
# Readable captions for the three sliders and the run button, in widget order
control_captions = (
    "Lower Exclusion Filter",
    "Upper Exclusion Filter",
    "Topic Number",
    "Run Model",
)
for control, caption in zip(ui_3.widget.children, control_captions):
    control.style = {'description_width': 'initial'}
    control.description = caption

# NOTES FOR UPDATING THIS: 
# Make threshold displays consistent 
# Give some kind of loading bar 
# IMPORTANT: Provide list of documents (course topics/number/title; link to syllabus) that topics are associated with 
# Upload drive links into dataframe 
# Print out the parameters for ease of interpretation 
# See if there is an export exact settings option/capability 
# Add in more descriptions for the sliders and graphs 

interactive(children=(IntSlider(value=25, description='low_freq', max=50, min=1), FloatSlider(value=0.35000000…