In [None]:
from datetime import date
import pandas as pd
import re
import requests

## Pull data

In [None]:
# As of Feb 4, 2021, the endpoint returns workshop data from the last two years - ~1300 nodes
url = "https://www.lib.ncsu.edu/api/workshops/all"

In [None]:
user_activity_url = "https://www.lib.ncsu.edu/api/user_activities/all"

In [None]:
r = requests.get(url)
r

In [None]:
data = r.json()

In [None]:
data[0]

In [None]:
df = pd.DataFrame(data)
df.head()

Strip leading and trailing whitespace from the `field_non_library_instructor` column.

In [None]:
df["field_non_library_instructor"] = df["field_non_library_instructor"].str.strip()
df.head()

Remove HTML tags from `body`.

In [None]:
# Compiled once when the cell runs. `[^>]*` (unlike `.*?`) also matches
# newlines, so a tag whose attributes wrap onto a second line is removed too.
_HTML_TAG = re.compile(r"<[^>]*>")


def strip_html_tags(text: str) -> str:
    """Return ``text`` with every ``<...>`` span removed.

    Only the tags themselves are dropped; the text between them is kept
    unchanged (HTML entities such as ``&amp;`` are not decoded).

    Args:
        text: Raw HTML fragment. A non-string value (for example ``None`` or
            ``NaN`` standing in for a missing description) is treated as an
            empty description.

    Returns:
        The text without tags, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise TypeError inside Series.apply.
        return ""
    return _HTML_TAG.sub("", text)


In [None]:
df["body"] = df["body"].apply(strip_html_tags)
df.head()

Write the data to a CSV file with today's date in the filename.

In [None]:
df.to_csv(f"all-workshops-{date.today()}.csv")

## Explore text

TODO:
- Consistency across workshops
    - Length of description
    - Consistent keywords?
    - Does title reflect body paragraph? (check keywords)
        - Does body contain words from the title? Does title contain words from body?
    - Consistency of workshop descriptions across related titles
        - If we have multiple introductions to python, are the descriptions similar?
        - Are they similar across intro to R vs. intro to Python?
- Lexical variety?


In [None]:
from nltk import word_tokenize
import numpy as np
import spacy

Read in the most recent CSV of workshop data. The filename is hardcoded for now, but we could instead parse the filenames and pick the most recent one.

In [None]:
df = pd.read_csv("current-workshops-2021-01-20.csv")

In [None]:
def naive_count_words(text: str) -> int:
    """Count the tokens of ``text`` that are not one of a few punctuation marks.

    The text is split with NLTK's ``word_tokenize``; tokens that are exactly
    one of ``. , ? ' " : ;`` are left out of the count.
    """
    skipped = (".", ",", "?", "'", "\"", ":", ";")
    total = 0
    for tok in word_tokenize(text):
        if tok in skipped:
            continue
        total += 1
    return total

In [None]:
naive_count_words(df["body"][0])

We've done a naive pass to check word count minus punctuation. Let's compare it to spaCy's parsing, removing punctuation and spaces/newlines.

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
def spacy_count_words(text: str) -> int:
    """Count the tokens of ``text`` that are neither punctuation nor whitespace.

    The text is parsed with the notebook-level ``nlp`` object; a token is
    counted when both its ``is_punct`` and ``is_space`` flags are false.
    """
    return sum(1 for tok in nlp(text) if not (tok.is_punct or tok.is_space))

In [None]:
spacy_count_words(df["body"][0])

Create a new column in the dataframe that contains the number of words in the description (`body`).

In [None]:
df["body_word_count"] = df["body"].apply(spacy_count_words)

In [None]:
df.head()

In [None]:
avg_word_count = np.mean(df["body_word_count"])
avg_word_count

In [None]:
lowest_word_count = df["body_word_count"].min()
lowest_word_count

In [None]:
highest_word_count = df["body_word_count"].max()
highest_word_count

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram of word counts to see distribution
df["body_word_count"].hist()

Let's figure out the most common words across the current future workshops. 

In [None]:
from collections import Counter
import textacy

In [None]:
corpus = textacy.Corpus(nlp, df["body"].to_list())
corpus

In [None]:
corpus.n_docs

In [None]:
corpus.n_tokens

In [None]:
counts = corpus.word_counts(by="lower_")

In [None]:
sorted(counts.items(), key=lambda x: x[1], reverse=True)[:20]

In [None]:
counts["python"]

In [None]:
counts["git"]

How much of this is a result of some descriptions being substantially longer than others?

TODO: Check how many workshops each of these commonly used terms appears in.

In [None]:
def check_inclusion_nums(token, corpus):
    """Count the documents in ``corpus`` that contain ``token``.

    A document is counted once, however often the term occurs in it. The
    comparison is an exact match against the ``.text`` of each item in the
    document.
    """
    return sum(
        1
        for doc in corpus
        if any(token == tok.text for tok in doc)
    )

In [None]:
check_inclusion_nums("IRB", corpus)

TODO: Write a function that takes a term and a corpus, and returns all the docs that have that term

Check to see if textacy already has this...

## Generate keywords for each description

In [None]:
from textacy import extract
from itertools import chain

In [None]:
textacy.extract.keyterms.sgrank(corpus[20])

In [None]:
def get_keywords(text, spacy_model):
    """Return the key terms that textacy's SGRank finds in ``text``.

    ``text`` is parsed by calling ``spacy_model`` on it and the parsed
    document is passed to ``textacy.extract.keyterms.sgrank``. Each result is
    unpacked as a pair; only its first element (the term) is kept, in the
    order sgrank returned it.
    """
    ranked = textacy.extract.keyterms.sgrank(spacy_model(text))
    terms = []
    for term, _score in ranked:
        terms.append(term)
    return terms

In [None]:
df["keywords"] = df["body"].apply(get_keywords, spacy_model=nlp)

In [None]:
df.head()

In [None]:
df.to_csv("current-workshops-anno.csv")

In [None]:
# Show each workshop title next to the keywords extracted for its description.
for title, keywords in zip(df["title"], df["keywords"]):
    print(title, "--", keywords)

## Titles and descriptions

Let's find out whether the descriptions use the same words as the title. 

In [None]:
def check_title_body(title, body):
    """Return the words of ``title`` that also occur as words in ``body``.

    Both texts are split on whitespace. Words are compared ignoring case and
    any punctuation attached to either end, and a title word has to equal a
    whole body word. A plain substring test on the raw body would report "R"
    as present in any description containing "Research".

    Args:
        title: Workshop title.
        body: Workshop description.

    Returns:
        The matching title words, spelled as they appear in the title and in
        title order. An empty list when either argument is not a string
        (for example ``NaN`` for a missing description).
    """
    if not isinstance(title, str) or not isinstance(body, str):
        return []

    def normalize(word):
        # Drop punctuation stuck to either end and ignore case, so that
        # "Python:" in a title matches "python," in a description.
        return re.sub(r"^\W+|\W+$", "", word).lower()

    body_words = {normalize(word) for word in body.split()}
    # Tokens made only of punctuation normalise to "", which is not a word.
    body_words.discard("")
    return [word for word in title.split() if normalize(word) in body_words]

In [None]:
check_title_body(df["title"][0], df["body"][0])

In [None]:
# Compare every title with its own description and keep the overlap per workshop.
title_words_in_body = [
    check_title_body(title, body)
    for title, body in zip(df["title"], df["body"])
]
df["title_words_in_body"] = title_words_in_body

In [None]:
df.head()

In [None]:
df.to_csv("current-workshops-anno.csv")

## Text summarization with transformers

Many of the descriptions are already short enough that they don't need summarization, but what can we see?

https://huggingface.co/transformers/main_classes/pipelines.html
https://huggingface.co/transformers/main_classes/pipelines.html#transformers.SummarizationPipeline

In [None]:
from transformers import pipeline
from transformers import AutoModelWithLMHead, AutoTokenizer

In [None]:
summarizer = pipeline("summarization")

In [None]:
summarizer(df["body"][63])

In [None]:
df["body"][63]

In [None]:
summarizert5 = pipeline("summarization", model="t5-base", tokenizer="t5-base")

In [None]:
summarizert5(df["body"][63], max_length=40)

In [None]:
model = AutoModelWithLMHead.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [None]:
inputs = tokenizer.encode("summarize: " + df["body"][63], return_tensors="pt", max_length=512)
outputs = model.generate(inputs, max_length=120, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

In [None]:
tokenizer.decode(outputs[0])

In [None]:
for desc in df["body"]:
    if len(desc.split()) > 200:
        print(summarizert5(desc))

## Question answering

Can we treat the whole of the descriptions (and titles?) as the context for a question answering task?

https://huggingface.co/transformers/task_summary.html#extractive-question-answering
https://huggingface.co/transformers/main_classes/pipelines.html#questionansweringpipeline

In [None]:
all_descriptions = " ".join(df["body"])
len(all_descriptions.split())

In [None]:
nlp = pipeline("question-answering")

In [None]:
result = nlp(question="python?", context=all_descriptions)
print(f"Answer: {result['answer']}")

In [None]:
result = nlp(question="Do you teach Python", context=all_descriptions)
print(f"Answer: {result['answer']}")