In [1]:
from datetime import date
import pandas as pd
import re
import requests

## Pull data

In [2]:
# Endpoint requested in the next cell; its response is parsed as JSON.
url = "https://www.lib.ncsu.edu/api/workshops/all"

In [3]:
# Fetch the workshop listing. The timeout keeps this cell from hanging
# indefinitely if the server never answers, and raise_for_status() turns an
# HTTP error (4xx/5xx) into an exception here instead of a confusing failure
# later when the body is parsed.
r = requests.get(url, timeout=30)
r.raise_for_status()
r

<Response [200]>

In [4]:
# Decode the response body as JSON.
data = r.json()

In [5]:
# Look at the first record to see which fields are available.
data[0]

{'title': 'IRB Basics: eIRB Application Workshop',
 'nid': '57007',
 'field_time_d8': '01-20-2021 10:00AM to 01-20-2021 1:00PM',
 'body': '<p><span><span>This workshop provides an overview of how to write your eIRB application and is designed for first-time IRB application filers and NC State students who want to participate in the </span><a href="https://research.ncsu.edu/administration/compliance/research-compliance/irb/irb-for-researchers/priority-review-process/">Priority Review Process</a><span> (PRP) program. Faculty and staff are welcome to attend this workshop provided that the prerequisites are completed; participation in the PRP program is, however, limited to students only at this time. All workshop attendees must complete the below online training, which has no cost to NC State University affiliates with valid Unity IDs and passwords. To access CITI training, please consult the </span><a href="https://drive.google.com/file/d/15On0UevTEaZz9cXk9pVPdSstZPFgjnbO/view">CITI trai

In [6]:
# Load the records into a DataFrame and preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,title,nid,field_time_d8,body,field_registration_url,field_non_library_instructor,field_workshop_leads_export,field_workshop_series,field_workshop_user_activities,field_non_libraries_space_1,field_space
0,IRB Basics: eIRB Application Workshop,57007,01-20-2021 10:00AM to 01-20-2021 1:00PM,<p><span><span>This workshop provides an overv...,"<a href=""https://reporter.ncsu.edu/link/instan...",\nYael Allen\n\n,[],Research Strategy,,Online only,
1,Campus Conversations Project: Inauguration Day...,57129,01-20-2021 7:00PM to 01-20-2021 8:30PM,<p>Trained student volunteers from the <a href...,"<a href=""https://reporter.ncsu.edu/link/instan...",\nMorgan DiCarlo\n\n,"[{'id': '279', 'url': 'https://www.lib.ncsu.ed...",,,Online only,
2,IRB Basics Part II: Handling Your Data and Ass...,57017,01-21-2021 1:00PM to 01-21-2021 4:00PM,<p><span>This workshop builds upon the IRB Bas...,"<a href=""https://reporter.ncsu.edu/link/instan...",\nYael Allen\n\n,[],,,Online only,
3,IRB Topical Workshop: IRB Process and IT Profe...,57103,01-21-2021 3:00PM to 01-21-2021 5:00PM,<p>This session is specifically for IT profess...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,
4,IRB Topical Workshop: Using “Apps” and other S...,56995,01-25-2021 10:00AM to 01-25-2021 12:00PM,<p>This session will discuss the use of “off t...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,


Strip leading and trailing whitespace from the `field_non_library_instructor` column.

In [7]:
# The raw values are padded with newlines (e.g. "\nYael Allen\n\n");
# .str.strip() drops the leading and trailing whitespace.
df["field_non_library_instructor"] = df["field_non_library_instructor"].str.strip()
df.head()

Unnamed: 0,title,nid,field_time_d8,body,field_registration_url,field_non_library_instructor,field_workshop_leads_export,field_workshop_series,field_workshop_user_activities,field_non_libraries_space_1,field_space
0,IRB Basics: eIRB Application Workshop,57007,01-20-2021 10:00AM to 01-20-2021 1:00PM,<p><span><span>This workshop provides an overv...,"<a href=""https://reporter.ncsu.edu/link/instan...",Yael Allen,[],Research Strategy,,Online only,
1,Campus Conversations Project: Inauguration Day...,57129,01-20-2021 7:00PM to 01-20-2021 8:30PM,<p>Trained student volunteers from the <a href...,"<a href=""https://reporter.ncsu.edu/link/instan...",Morgan DiCarlo,"[{'id': '279', 'url': 'https://www.lib.ncsu.ed...",,,Online only,
2,IRB Basics Part II: Handling Your Data and Ass...,57017,01-21-2021 1:00PM to 01-21-2021 4:00PM,<p><span>This workshop builds upon the IRB Bas...,"<a href=""https://reporter.ncsu.edu/link/instan...",Yael Allen,[],,,Online only,
3,IRB Topical Workshop: IRB Process and IT Profe...,57103,01-21-2021 3:00PM to 01-21-2021 5:00PM,<p>This session is specifically for IT profess...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,
4,IRB Topical Workshop: Using “Apps” and other S...,56995,01-25-2021 10:00AM to 01-25-2021 12:00PM,<p>This session will discuss the use of “off t...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,


Remove HTML tags from `body`.

In [8]:
def strip_html_tags(text: str) -> str:
    """Remove HTML tags from ``text`` and return what is left.

    A tag is anything from ``<`` up to the next ``>``. The pattern
    ``<[^>]*>`` is used instead of ``<.*?>`` so that tags broken across
    lines are removed too (``.`` does not match a newline by default).
    HTML entities such as ``&amp;`` are left untouched.

    Args:
        text: The HTML fragment to clean. Missing values (``None``/NaN)
            are accepted.

    Returns:
        The text with every tag removed, or ``""`` for a missing value.
    """
    # A missing body would otherwise make re.sub raise a TypeError.
    if pd.isna(text):
        return ""
    return re.sub(r"<[^>]*>", "", text)


In [9]:
# Run strip_html_tags over every description, overwriting the `body` column.
df["body"] = df["body"].apply(strip_html_tags)
df.head()

Unnamed: 0,title,nid,field_time_d8,body,field_registration_url,field_non_library_instructor,field_workshop_leads_export,field_workshop_series,field_workshop_user_activities,field_non_libraries_space_1,field_space
0,IRB Basics: eIRB Application Workshop,57007,01-20-2021 10:00AM to 01-20-2021 1:00PM,This workshop provides an overview of how to w...,"<a href=""https://reporter.ncsu.edu/link/instan...",Yael Allen,[],Research Strategy,,Online only,
1,Campus Conversations Project: Inauguration Day...,57129,01-20-2021 7:00PM to 01-20-2021 8:30PM,Trained student volunteers from the Campus Con...,"<a href=""https://reporter.ncsu.edu/link/instan...",Morgan DiCarlo,"[{'id': '279', 'url': 'https://www.lib.ncsu.ed...",,,Online only,
2,IRB Basics Part II: Handling Your Data and Ass...,57017,01-21-2021 1:00PM to 01-21-2021 4:00PM,This workshop builds upon the IRB Basics: Part...,"<a href=""https://reporter.ncsu.edu/link/instan...",Yael Allen,[],,,Online only,
3,IRB Topical Workshop: IRB Process and IT Profe...,57103,01-21-2021 3:00PM to 01-21-2021 5:00PM,This session is specifically for IT profession...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,
4,IRB Topical Workshop: Using “Apps” and other S...,56995,01-25-2021 10:00AM to 01-25-2021 12:00PM,This session will discuss the use of “off the ...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,


Write the cleaned data to a CSV file whose name includes today's date.

In [10]:
# index=False: the default integer index carries no information, and writing
# it makes it come back as a stray "Unnamed: 0" column when the file is
# re-read with pd.read_csv.
df.to_csv(f"current-workshops-{date.today()}.csv", index=False)

## Explore text

TODO:
- Consistency across workshops
    - Length of description
    - Consistent keywords?
    - Does title reflect body paragraph? (check keywords)
        - Does body contain words from the title? Does title contain words from body?
    - Consistency of workshop descriptions across related titles
        - If we have multiple introductions to Python, are the descriptions similar?
        - Are they similar across intro to R vs. intro to Python?
- Lexical variety?


In [62]:
from nltk import word_tokenize
import numpy as np
import spacy

Read in the most recent CSV of workshop data. The filename is hardcoded for now, but we could instead parse the filenames and pick the most recent one.

In [55]:
# NOTE: the filename is hardcoded to the 2021-01-20 export, so this does not
# automatically pick up a file written on a different day.
df = pd.read_csv("current-workshops-2021-01-20.csv")

In [56]:
def naive_count_words(text: str) -> int:
    """Count the tokens in ``text`` that are not basic punctuation marks.

    The text is split with NLTK's ``word_tokenize`` and every token that is
    not exactly one of ``. , ? ' " : ;`` is counted as a word.

    Args:
        text: The text to count words in. Missing values (``None``/NaN,
            e.g. an empty CSV cell read back by pandas) are accepted.

    Returns:
        The number of non-punctuation tokens, or ``0`` for a missing value.
    """
    # A missing value would otherwise make the tokenizer raise.
    if pd.isna(text):
        return 0
    # NOTE(review): any other punctuation token (e.g. "!", parentheses, or
    # whatever the tokenizer emits for quotation marks) is still counted as a
    # word -- confirm that is acceptable for this "naive" pass.
    punctuation = {".", ",", "?", "'", "\"", ":", ";"}
    return sum(1 for word in word_tokenize(text) if word not in punctuation)

In [57]:
# Try the naive counter on the first workshop description.
naive_count_words(df["body"][0])

296

We've done a naive pass to check word count minus punctuation. Let's compare it to spaCy's parsing, removing punctuation and spaces/newlines.

In [50]:
# Load the spaCy pipeline used by spacy_count_words below
# (the "en_core_web_lg" model has to be installed for this to succeed).
nlp = spacy.load("en_core_web_lg")

In [58]:
def spacy_count_words(text: str) -> int:
    """Count the spaCy tokens in ``text`` that are neither punctuation nor whitespace.

    The text is parsed with the module-level ``nlp`` pipeline; a token is
    counted as a word unless spaCy flags it as ``is_punct`` or ``is_space``.

    Args:
        text: The text to count words in. Missing values (``None``/NaN,
            e.g. an empty CSV cell read back by pandas) are accepted.

    Returns:
        The number of word tokens, or ``0`` for a missing value.
    """
    # A missing value would otherwise make nlp() raise.
    if pd.isna(text):
        return 0
    # Single pass; no intermediate lists are built just to be counted.
    return sum(1 for token in nlp(text) if not (token.is_punct or token.is_space))

In [59]:
# Same description as the naive check above, for comparison.
spacy_count_words(df["body"][0])

289

Create a new column in the dataframe that contains the number of words in the description (`body`).

In [60]:
# Runs the spaCy pipeline once per row and stores the resulting word count.
df["body_word_count"] = df["body"].apply(spacy_count_words)

In [61]:
# Preview the frame, including the new body_word_count column.
df.head()

Unnamed: 0.1,Unnamed: 0,title,nid,field_time_d8,body,field_registration_url,field_non_library_instructor,field_workshop_leads_export,field_workshop_series,field_workshop_user_activities,field_non_libraries_space_1,field_space,body_word_count
0,0,IRB Basics: eIRB Application Workshop,57007,01-20-2021 10:00AM to 01-20-2021 1:00PM,This workshop provides an overview of how to w...,"<a href=""https://reporter.ncsu.edu/link/instan...",Yael Allen,[],Research Strategy,,Online only,,289
1,1,Campus Conversations Project: Inauguration Day...,57129,01-20-2021 7:00PM to 01-20-2021 8:30PM,Trained student volunteers from the Campus Con...,"<a href=""https://reporter.ncsu.edu/link/instan...",Morgan DiCarlo,"[{'id': '279', 'url': 'https://www.lib.ncsu.ed...",,,Online only,,115
2,2,IRB Basics Part II: Handling Your Data and Ass...,57017,01-21-2021 1:00PM to 01-21-2021 4:00PM,This workshop builds upon the IRB Basics: Part...,"<a href=""https://reporter.ncsu.edu/link/instan...",Yael Allen,[],,,Online only,,71
3,3,IRB Topical Workshop: IRB Process and IT Profe...,57103,01-21-2021 3:00PM to 01-21-2021 5:00PM,This session is specifically for IT profession...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,,144
4,4,IRB Topical Workshop: Using “Apps” and other S...,56995,01-25-2021 10:00AM to 01-25-2021 12:00PM,This session will discuss the use of “off the ...,"<a href=""https://reporter.ncsu.edu/link/instan...",,[],,Grants &amp; Funding,Online only,,172


In [63]:
# Mean description length, in words.
avg_word_count = df["body_word_count"].mean()
avg_word_count

123.44318181818181

In [65]:
# Shortest description, in words.
lowest_word_count = df["body_word_count"].min()
lowest_word_count

36

In [67]:
# Longest description, in words.
highest_word_count = df["body_word_count"].max()
highest_word_count

289