In [1]:
import utils
from bs4 import BeautifulSoup
import requests
from trafilatura import fetch_url, extract, baseline
from trafilatura.settings import use_config

In [29]:
### testing links
tab_links = ['https://cs.illinois.edu/', 
             'https://cs.illinois.edu/about/people/department-faculty', 
             'https://cs.illinois.edu/academics/undergraduate/degree-program-options'
             'https://cs.illinois.edu/academics/courses',
             'https://cs.illinois.edu/research'
            ]
text_links = ['https://cs.illinois.edu/research/areas/artificial-intelligence', 
              'https://cs.illinois.edu/research/areas/data-and-information-systems',
              'https://cs.illinois.edu/academics/undergraduate/registration',
              'https://cs.illinois.edu/student-life/student-organizations',
              'https://cs.illinois.edu/academics/graduate',
              'http://catalog.illinois.edu/courses-of-instruction/cs/'
             ]

## Webpage categorization

### By word count

In [3]:
def text_density_by_word_count(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    
    #all_text = utils.extract_all_text(url)
    all_text = soup.body.get_text(' ', strip=True)
    all_text = ''.join(set(all_text.split('.')))
    paragraph_text = ' '.join(utils.extract_paragraphs(url))
    paragraph_text = ''.join(set(paragraph_text.split('.')))
    
    a = len(all_text.split())
    p = len(paragraph_text.split())
    return round(p / a, 2), a, p, all_text, paragraph_text

In [4]:
res = text_density_by_word_count(tab_links[1])

In [5]:
res[:3]

(0.01, 1612, 17)

In [6]:
res[3]

' Amato she/her/hers Abel Bliss Professor of Engineering and Department Head Lawrence Angrave he/him/his Teaching Professor, Gies RC Evans Innovation Fellow, CITL Fellow Tal August he/him/his Assistant Professor (starting August 2024) Brian P Bailey Professor Arindam  Banerjee Founder Professor in Engineering Adam Bates Associate Professor Mattox Alan Beckman Teaching Associate Professor Matthew Caesar Professor George  Chacko Research Associate Professor Geoffrey Werner Challen Teaching Associate Professor Timothy Moon-Yew Chan Founder Professor in Engineering Eshwar  Chandrasekharan he/him/his Assistant Professor Kevin Chenchuan Chang Professor Chandra Sekhar Chekuri Paul and Cynthia Saylor Professor Girish  Chowdhary Associate Professor Camille Cobb Assistant Professor Benjamin Cosman Teaching Assistant Professor Katie Cunningham Assistant Professor Ryan Matthew Cunningham Lecturer David M Hoiem Professor Reyhaneh  Jabbarvand Assistant Professor Sheldon Howard Jacobson Founder Profe

In [7]:
res[4]

' Gillies Chair in Computer Science Gillies Professor in Computer Science Donald BRichard T Cheng Professor Donald B'

In [8]:
res = text_density_by_word_count(text_links[0])
res[:3]

(0.29, 2330, 681)

In [9]:
res[3]



In [10]:
res[4]



In [11]:
tab_wc = []
for l in tab_links:
    n = text_density_by_word_count(l)
    tab_wc.append(n[0])
    print(n[:3])

(0.03, 1545, 44)
(0.01, 1612, 17)
(0.0, 107, 0)
(0.11, 1116, 127)


In [12]:
text_wc = []
for l in text_links:
    n = text_density_by_word_count(l)
    text_wc.append(n[0])
    print(n[:3])

(0.29, 2330, 681)
(0.17, 1810, 316)
(0.43, 3606, 1535)
(0.11, 1236, 140)
(0.16, 1051, 167)
(0.96, 10516, 10119)


### By average text block length

In [13]:
def get_avg_text_block_length(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    config = use_config()
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    downloaded = fetch_url(url)
    try:  
        result = extract(downloaded, config=config, output_format='xml', include_links=True, include_formatting=True)
    except:
        return 0
    if result is None:
        return 0

    soup = BeautifulSoup(result, 'lxml')
    lengths = []
    for p in soup.find_all('p'):
        text = p.get_text(strip=True, separator='\n')
        if text:
            lengths.append(len(text.split()))
                           
    return sum(lengths) / len(lengths)

In [14]:
tab_avg = []
for l in tab_links:
    n = get_avg_text_block_length(l)
    tab_avg.append(n)
    print(n)

4.538461538461538
2.9625
0
18.25


In [15]:
text_avg = []
for l in text_links:
    n = get_avg_text_block_length(l)
    text_avg.append(n)
    print(n)

17.875
17.157894736842106
44.10526315789474
29.0
16.09090909090909
39.71134020618557


### Multiple regression model

In [16]:
import pandas as pd

In [17]:
data = []
for i in range(len(tab_avg)):
    row = [tab_wc[i], tab_avg[i], 'tabular']
    data.append(row)

for i in range(len(text_avg)):
    row = [text_wc[i], text_avg[i], 'textual']
    data.append(row)

In [18]:
df = pd.DataFrame(data, columns=['density_wc', 'density_avg', 'label'])
df

Unnamed: 0,density_wc,density_avg,label
0,0.03,4.538462,tabular
1,0.01,2.9625,tabular
2,0.0,0.0,tabular
3,0.11,18.25,tabular
4,0.29,17.875,textual
5,0.17,17.157895,textual
6,0.43,44.105263,textual
7,0.11,29.0,textual
8,0.16,16.090909,textual
9,0.96,39.71134,textual


### HTML

In [19]:
utils.extract_all_text(tab_links[1])

'                  Alumni  Corporate  People  My.CS         University of Illinois at Urbana-Champaign   The Grainger College of Engineering  Computer Science          Search         Menu             Search        About    About  Rankings & Statistics  Contact Us & Office Locations  History Timeline  Accreditation  Values & Code of Conduct  CS CARES Committee  Contact CS CARES  Governance  Members  Resources    People  All Faculty  Department Faculty  Affiliate Faculty  Adjunct Faculty  Emeritus Faculty  Postdoctoral Researchers  Staff  Office of the Department Head  Communications & Engagement Team  Undergraduate Advising Office  Graduate Advising Office  Instructional Development Team  Business Office  Faculty Support Contacts  Facilities, Shipping and Receiving    Graduating PhD Students    Open Positions  Faculty Positions  Postdoctoral Positions  Future Faculty Fellows    Staff Positions  Choose Illinois Computer Science    Awards  2021 Celebration of Excellence  2022 Celebration 