In [1]:
from experiments import utils
from bs4 import BeautifulSoup
import requests
import trafilatura
from trafilatura import fetch_url, extract
from trafilatura.settings import use_config

In [2]:
headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

### testing links
tab_links = ['https://cs.illinois.edu/', 
             'https://cs.illinois.edu/about/people/department-faculty', 
             'https://cs.illinois.edu/academics/undergraduate/degree-program-options'
             'https://cs.illinois.edu/academics/courses',
             'https://cs.illinois.edu/research'
            ]
text_links = ['https://cs.illinois.edu/research/areas/artificial-intelligence', 
              'https://cs.illinois.edu/research/areas/data-and-information-systems',
              'https://cs.illinois.edu/academics/undergraduate/registration',
              'https://cs.illinois.edu/student-life/student-organizations',
              'https://cs.illinois.edu/academics/graduate',
              'http://catalog.illinois.edu/courses-of-instruction/cs/'
             ]

### Previous methods

In [3]:
test = 'https://cs.illinois.edu/research/areas/artificial-intelligence'
utils.extract_paragraphs(test) # <p> tags only

['The study of systems that behave intelligently, artificial intelligence includes several key areas where our faculty\xa0are recognized leaders: computer vision, machine listening, natural language processing, machine learning and robotics.',
 'The AI group at Illinois is strong, diverse, and growing. It combines expertise in core strengths with promising new research directions.',
 'In machine learning, AI group faculty are studying theoretical foundations of deep and reinforcement learning; developing novel models and algorithms for deep neural networks, federated and distributed learning; as well as investigating issues related to scalability, security, privacy, and fairness of learning systems. Computer vision faculty are developing novel approaches for 2D and 3D scene understanding from still images and video; joint understanding of images and language; low-shot learning (recognition of rare or previously unseen categories); transfer learning and domain adaptation (adapting pre-t

In [5]:
utils.extract_paragraphs_trafilatura(test) # trafilatura baseline function



In [6]:
downloaded = fetch_url(test)
extract(downloaded) # trafilatura



In [7]:
extract(downloaded, favor_precision=True, deduplicate=True)



### Paragraphs + Lists + Clustering

In [8]:
page = requests.get(test, headers=headers)
soup = BeautifulSoup(page.text, 'lxml')
paragraphs = []
for p in soup.find_all('p'):
    text = p.get_text(strip=True, separator='\n')
    if '.' in text:
        paragraphs.append(text)

In [9]:
elements = []
uls = soup.find_all('ul')
for ul in uls:
    for li in ul.findAll('li'):
        text = li.get_text(strip=True, separator='\n')
        arr = text.split('\n')
        for a in arr:
            if '.' in a:
                elements.append(a)

In [11]:
total = paragraphs + elements
''.join(total)



In [12]:
total = list(set(total))
total

['AI group research has led to a number of startups. Derek Hoiem is co-founder and Chief Science Officer of Reconstruct, which visually documents construction sites, matching images to plans and analyzing productivity and risk for delay. Girish Chowdhary \xa0is co-founder and CTO of EarthSense, a startup creating machine learning and robotics solutions for agriculture, whose work was featured in a 2020 New York Times article. David Forsyth advises a number of startups focusing on augmented reality and image synthesis, including Lightform, Revery, and Depix.',
 'academic@cs.illinois.edu',
 'undergrad@cs.illinois.edu',
 'My.CS',
 'B.S. in Mathematics & Computer Science',
 'The AI group at Illinois is strong, diverse, and growing. It combines expertise in core strengths with promising new research directions.',
 ': brings prominent leaders and experts to campus to share their ideas and promote conversations about important challenges and topics in the discipline.',
 'Guidelines for Formin

### Keep html format as if copy pasting

In [3]:
utils.extract_all_text(text_links[0])



### Clustering

In [51]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [52]:
total = paragraphs + elements
total = list(set(total))

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(total)

In [53]:
k = 4
kmeans = KMeans(n_clusters=k, n_init=5, random_state=42)
kmeans.fit(tfidf_matrix)

In [54]:
silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)
silhouette_avg

0.06621329110330104

In [56]:
clusters = {}
for cluster_num in range(k):
    clusters[cluster_num] = []
    cluster_texts = [total[i] for i in range(len(total)) if kmeans.labels_[i] == cluster_num]
    for t in cluster_texts:
        clusters[cluster_num].append(t)

In [57]:
for k in clusters.keys():
    print(f'Cluster {k}:')
    print(clusters[k])
    print()

Cluster 0:
['- An ISUR affiliated program to support women undergraduates in research and teaching in science, mathematics, and engineering. Eight scholars are selected and funded each year.', 'Donald B. Gillies Memorial Lecture', 'Your path begins here.', '3038 Campus Instructional Facility, 1405 W. Springfield Avenue, Urbana, IL', '100 Materials Science and Engineering Building, 1304 W. Green Street', '- An ISUR-affliated program which offers undergraduate women funding for a 10 week summer research program. The program uses a learning-by-apprenticeship model for high impact work in computer science, aerospace, engineering, physics, or astronomy.', 'CHBE 565 Seminar, Prof. Matthew Gebbie, University of Wisconsin-Madison (host: Kenis), "Exploring How Ionic Correlations Influence Ion Transport and Electron Transfer in Electrochemical Systems"', '- An IBM-ILLINOIS and ISUR partnership which funds undergraduate research on AI and cognitive computing, from theory to practical application.

### Comparing clustering with different scraping methods

In [82]:
def cluster_texts(lst, k=4):
    lst = list(set(lst))
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(lst)
    kmeans = KMeans(n_clusters=k, n_init=5, random_state=42)
    kmeans.fit(tfidf_matrix)
    silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)

    clusters = {}
    for cluster_num in range(k):
        clusters[cluster_num] = []
        cluster_texts = [lst[i] for i in range(len(lst)) if kmeans.labels_[i] == cluster_num]
        for t in cluster_texts:
            clusters[cluster_num].append(t)
    
    return clusters, silhouette_avg

In [83]:
text = utils.extract_paragraphs_lists(test)
clusters, score = cluster_texts(text)
print(score)
for k in clusters.keys():
    print(f'Cluster {k+1}:\n {clusters[k]}\n')

0.06621329110330104
Cluster 1:
 ['- An ISUR affiliated program to support women undergraduates in research and teaching in science, mathematics, and engineering. Eight scholars are selected and funded each year.', 'Donald B. Gillies Memorial Lecture', 'Your path begins here.', '3038 Campus Instructional Facility, 1405 W. Springfield Avenue, Urbana, IL', '100 Materials Science and Engineering Building, 1304 W. Green Street', '- An ISUR-affliated program which offers undergraduate women funding for a 10 week summer research program. The program uses a learning-by-apprenticeship model for high impact work in computer science, aerospace, engineering, physics, or astronomy.', 'CHBE 565 Seminar, Prof. Matthew Gebbie, University of Wisconsin-Madison (host: Kenis), "Exploring How Ionic Correlations Influence Ion Transport and Electron Transfer in Electrochemical Systems"', '- An IBM-ILLINOIS and ISUR partnership which funds undergraduate research on AI and cognitive computing, from theory to p

In [84]:
text = utils.extract_paragraphs(test) # <p> tags only
clusters, score = cluster_texts(text)
print(score)
for k in clusters.keys():
    print(f'Cluster {k+1}:\n {clusters[k]}\n')

0.030369897018214645
Cluster 1:
 ['CHBE 565 Seminar, Prof. Matthew Gebbie, University of Wisconsin-Madison (host: Kenis), "Exploring How Ionic Correlations Influence Ion Transport and Electron Transfer in Electrochemical Systems"']

Cluster 2:
 ['Thomas M. Siebel Center for Computer Science', 'Undergraduates at Illinois Computer Science are an important part of our world-renowned research. From summer programs to paid research positions with faculty, there are multiple ways for our students to contribute to high impact research early in their careers.']

Cluster 3:
 ['3038 Campus Instructional Facility, 1405 W. Springfield Avenue, Urbana, IL']

Cluster 4:
 ['100 Materials Science and Engineering Building, 1304 W. Green Street', 'Lectures and discussions on current work in research and development in nuclear engineering and related fields by staff, advanced students, and visiting speakers.']



In [86]:
text = utils.extract_paragraphs_trafilatura(test) # trafilatura baseline function
clusters, score = cluster_texts(text.split('.'))
print(score)
for k in clusters.keys():
    print(f'Cluster {k+1}:\n {clusters[k]}\n')

0.053204351475669445
Cluster 1:
 [' Matthew Gebbie, University of Wisconsin-Madison (host: Kenis), "Exploring How Ionic Correlations Influence Ion Transport and Electron Transfer in Electrochemical Systems" 116 Roger Adams Laboratory Hard Materials Seminar - "Isolating the Effects of Thixotropy in Geopolymer Pastes" Ally Brandvold (Kriven) 100 Materials Science and Engineering Building, 1304 W', ' No events found Physics Colloquium: "Efficient Programmable Quantum Simulation of Correlated Bosons and Lattice Gauge Theories" Steven Girvin (Yale) Loomis Lab 141 and via Zoom NPRE Special Seminar - Bo Feng Lectures and discussions on current work in research and development in nuclear engineering and related fields by staff, advanced students, and visiting speakers', ' Springfield Avenue, Urbana, IL SE 290 - Rob Cotner 151 Loomis Mathematical and Theoretical Physics Seminar: Aspect of Symmetry and Branes in Holography Ibrahima Bah, Johns Hopkins University Loomis Room 464 CHBE 565 Seminar, 

In [87]:
text = extract(downloaded, favor_precision=True, deduplicate=True) # trafilatura
clusters, score = cluster_texts(text.split('.'))
print(score)
for k in clusters.keys():
    print(f'Cluster {k+1}:\n {clusters[k]}\n')

0.014361945275324587
Cluster 1:
 [' Springfield Avenue, Urbana, IL\nIbrahima Bah, Johns Hopkins University\nLoomis Room 464']

Cluster 2:
 ['\nNov 1, 2023 - Nov 1, 2024\nNo events found\nGrainger Engineering Seminars and Speakers\nNov 1, 2023 - Nov 1, 2024\nPhysics Colloquium: "Efficient Programmable Quantum Simulation of Correlated Bosons and Lattice Gauge Theories"\nSteven Girvin (Yale)\nLoomis Lab 141 and via Zoom\nLectures and discussions on current work in research and development in nuclear engineering and related fields by staff, advanced students, and visiting speakers', ' ISUR includes multiple components, from a research focused seminar to a number of paid undergraduate research programs', ' The program uses a learning-by-apprenticeship model for high impact work in computer science, aerospace, engineering, physics, or astronomy', '\nBo Feng, National Technical Director for DOE-NE Fast Reactor R&D Program; Reactor and Fuel Cycle Analysis Manager at Argonne National Laboratory

### Filter clusters

In [88]:
text = utils.extract_paragraphs_lists(test)
clusters, score = cluster_texts(text)
print(score)
for k in clusters.keys():
    print(f'Cluster {k+1}:\n {clusters[k]}\n')

0.04537374758275532
Cluster 1:
 ['- An ISUR affiliated program to support women undergraduates in research and teaching in science, mathematics, and engineering. Eight scholars are selected and funded each year.', '- A college of engineering wide program. ISUR includes multiple components, from a research focused seminar to a number of paid undergraduate research programs. ISUR programs follow an apprenticeship model, where students work closely with faculty mentors. Some distinct programs supported by\xa0ISUR include:', '- An ISUR-affliated program which offers undergraduate women funding for a 10 week summer research program. The program uses a learning-by-apprenticeship model for high impact work in computer science, aerospace, engineering, physics, or astronomy.', '- An IBM-ILLINOIS and ISUR partnership which funds undergraduate research on AI and cognitive computing, from theory to practical application. Students additionally worked with a C3SR faculty mentor.', 'Guidelines for Fo

In [97]:
avgs = {}
for k in clusters.keys():
    total = 0
    for s in clusters[k]:
        total += len(s)
    avgs[k] = (total / len(clusters[k]))

In [98]:
avgs

{0: 204.375, 1: 219.1, 2: 32.55555555555556, 3: 42.2}

In [103]:
threshold = sum(avgs.values()) / 4
threshold

124.55763888888889

In [105]:
result = {}
for k in clusters.keys():
    if avgs[k] > threshold:
        result[k] = clusters[k]

In [107]:
for k in result.keys():
    print(f'Cluster {k+1}:\n {result[k]}\n')

Cluster 1:
 ['- An ISUR affiliated program to support women undergraduates in research and teaching in science, mathematics, and engineering. Eight scholars are selected and funded each year.', '- A college of engineering wide program. ISUR includes multiple components, from a research focused seminar to a number of paid undergraduate research programs. ISUR programs follow an apprenticeship model, where students work closely with faculty mentors. Some distinct programs supported by\xa0ISUR include:', '- An ISUR-affliated program which offers undergraduate women funding for a 10 week summer research program. The program uses a learning-by-apprenticeship model for high impact work in computer science, aerospace, engineering, physics, or astronomy.', '- An IBM-ILLINOIS and ISUR partnership which funds undergraduate research on AI and cognitive computing, from theory to practical application. Students additionally worked with a C3SR faculty mentor.', 'Guidelines for Forming Ph.D. Committe