In [None]:
import pycld2 as cld2
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import re
from tqdm import tqdm
import time
import pickle
import glob

import signal

import pandas as pd

# URLs whose requests exhausted every retry. send_request() appends to this
# list and consults it first, so a URL that already failed is not requested again.
GLOBAL_FAILED_URLS = []

In [None]:
def extract_number(filename):
    """Return the first run of digits in ``filename`` as an int, or 0 if there is none."""
    digit_runs = re.findall(r'\d+', filename)
    return int(digit_runs[0]) if digit_runs else 0

def read_in_last_store():
    """Load the highest-numbered ``pages_*`` checkpoint from the working directory.

    Checkpoints are ranked by ``extract_number`` applied to their file name and
    the top-ranked one is read with ``pd.read_csv``. When no checkpoint exists,
    an empty frame with the columns ``url``, ``scrape_data`` and ``nces_id`` is
    returned instead.
    """
    checkpoints = glob.glob('pages_*')
    if not checkpoints:
        empty_columns = {'url': [], 'scrape_data': [], 'nces_id': []}
        return pd.DataFrame(empty_columns)
    # In-place sort is stable: files with equal rank keep glob order, last one wins.
    checkpoints.sort(key=extract_number)
    return pd.read_csv(checkpoints[-1])

In [None]:
def get_urls(csv_path="elsi-district-2021-2022.csv",
             url_column='Web Site URL [District] 2021-22'):
    """Return the distinct website URLs listed in an ELSI export.

    Parameters
    ----------
    csv_path : str
        CSV file to read; its first 6 lines are skipped before the header row.
        Defaults to the district-level export. For the school-level export pass
        ``"elsi-school-2021-2022.csv"``.
    url_column : str
        Column holding the website URLs. For the school-level export pass
        ``'Web Site URL [Public School] 2021-22'``.

    Returns
    -------
    list of str
        Unique values of ``url_column`` in first-seen order, keeping only
        strings that contain ``'http'``. Missing values and non-string cells
        are dropped instead of raising.
    """
    table = pd.read_csv(csv_path, skiprows=6)
    unique_values = pd.unique(table[url_column])
    # isinstance() also rejects NaN (a float), so no separate isna check is needed.
    return [value for value in unique_values if isinstance(value, str) and 'http' in value]

def get_url_map(csv_path="elsi-school-2021-2022.csv",
                url_column='Web Site URL [Public School] 2021-22',
                id_column='School ID - NCES Assigned [Public School] Latest available year'):
    """Build a ``{website URL: NCES id}`` lookup from an ELSI export.

    Parameters
    ----------
    csv_path : str
        CSV file to read; its first 6 lines are skipped before the header row.
        Defaults to the school-level export. For districts pass
        ``"elsi-district-2021-2022.csv"``.
    url_column : str
        Column used for the dictionary keys. District equivalent:
        ``'Web Site URL [District] 2021-22'``.
    id_column : str
        Column used for the dictionary values. District equivalent:
        ``'Agency ID - NCES Assigned [District] Latest available year'``.

    Returns
    -------
    dict
        One entry per distinct value of ``url_column``. When several rows share
        a URL, the id of the last such row is kept.
    """
    table = pd.read_csv(csv_path, skiprows=6)
    # dict(zip(...)) keeps the last value per key, same as assigning row by row,
    # without the per-row overhead of iterrows().
    return dict(zip(table[url_column], table[id_column]))

def send_request(url, retries=1, delay=1, timeout=10):
    """Fetch ``url`` and return its body parsed with BeautifulSoup (lxml parser).

    Parameters
    ----------
    url : str
        Address to request.
    retries : int
        Number of attempts before giving up.
    delay : float
        Seconds to wait between two attempts.
    timeout : float or tuple
        Passed to ``requests.get``. It bounds the connect phase and each wait
        for data from the server, so an unresponsive host can no longer block
        the scrape indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when every attempt raised a
        ``requests`` exception or when ``url`` already failed earlier in this
        session (it is then listed in ``GLOBAL_FAILED_URLS`` and is skipped
        without a new request).

    Notes
    -----
    An earlier version armed ``signal.alarm(10)`` with a handler that simply
    returned. A handler that does not raise cannot abort a blocking call, so
    that limit had no effect; it also only works on Unix in the main thread.
    The ``timeout`` argument replaces it.
    """
    if url in GLOBAL_FAILED_URLS:
        return None
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            return BeautifulSoup(response.text, 'lxml')
        except requests.exceptions.RequestException:
            print(f"Request to {url} failed. Retrying... ({attempt+1}/{retries})")
            # Only pause when another attempt actually follows.
            if attempt + 1 < retries:
                time.sleep(delay)
    print("Max retries exceeded. Request failed.")
    GLOBAL_FAILED_URLS.append(url)
    return None

def scrape_data_with_subpages(url, max_links=10):
    """Fetch ``url`` plus the pages linked from its first anchors.

    Parameters
    ----------
    url : str
        Address of the main page.
    max_links : int
        How many anchors (in document order) are inspected. Anchors that are
        skipped still count towards this limit.

    Returns
    -------
    dict
        ``'soup_main'``: result of ``send_request(url)``.
        ``'soup_subs'``: list with one ``send_request`` result per followed
        link (entries are ``None`` for links that could not be fetched). The
        list is empty when the main page could not be fetched.

    Links are resolved against ``url`` with ``urllib.parse.urljoin`` so that
    root-relative (``/about``), document-relative (``news.html``) and
    protocol-relative (``//host/path``) hrefs all become valid absolute URLs.
    Empty hrefs, pure fragments, ``tel:``/``mailto`` links and anything that
    does not resolve to http(s) are skipped.
    """
    from urllib.parse import urljoin, urlparse  # local import: only this function needs it

    soup_main = send_request(url)
    soup_subs = []
    if soup_main:
        anchors = soup_main.find_all('a', href=True)
        for anchor in anchors[:max_links]:
            href = anchor['href'].strip()
            if not href or href.startswith(('#', 'tel:', 'mailto')):
                continue
            subpage_url = urljoin(url, href)
            # e.g. javascript: links are returned unchanged by urljoin and cannot be requested
            if urlparse(subpage_url).scheme not in ('http', 'https'):
                continue
            soup_subs.append(send_request(subpage_url))
    return {'soup_main': soup_main, 'soup_subs': soup_subs}

def scrape_data_without_subpages(url):
    """Fetch only ``url``; the ``send_request`` result is returned under the key 'soup_main'."""
    return {'soup_main': send_request(url)}

def save_progress(i='all'):
    """Write everything scraped so far to ``pages_{i}.csv``.

    Reads the module-level ``processed_urls`` and ``responses`` lists (paired
    by position) and the URL-to-NCES-id map from ``get_url_map()``.

    Parameters
    ----------
    i : int or str
        Label inserted into the output file name.

    The CSV has three columns: ``url``; ``scrape_data`` (the ``'soup_main'``
    entry when the response is a dict, otherwise the response itself, e.g. a
    failure marker); and ``nces_id`` (left empty for URLs that are not in the
    map, so an unknown URL cannot abort the checkpoint).
    """
    dmap = get_url_map()
    pairs = list(tqdm(zip(processed_urls, responses)))
    ddd = pd.DataFrame({
        'url': [url for url, _ in pairs],
        'scrape_data': [dat.get('soup_main') if isinstance(dat, dict) else dat for _, dat in pairs],
        # .get(): scraped URLs need not appear in the NCES export
        'nces_id': [dmap.get(url) for url, _ in pairs],
    })
    ddd.to_csv(f'pages_{i}.csv', index=False)
    return

## Scraping of subpages (optional)

In [None]:
# Path to a CSV with the sample of webpages whose subpages should be scraped.
# Expected column: Name {str}; the scraping loop reads the URLs from df.Name.
csv_file_path = 'tmp.csv'

df = pd.read_csv(csv_file_path)

print(df)

In [None]:
# Accumulators filled by the scraping loop, one entry per URL and paired by
# position: responses[k] is the scrape result for processed_urls[k].
responses = []
processed_urls = []

In [None]:
# Scrape every URL of the sample (main page plus subpages) and checkpoint to
# disk after every 1000 URLs. tqdm wraps the array itself so the bar knows the total.
for i, u in enumerate(tqdm(df.Name.values), 1):
    try:
        responses.append(scrape_data_with_subpages(u))
    except Exception:
        # Keep going on per-site errors, but let KeyboardInterrupt through so
        # a long run can still be stopped from the notebook.
        responses.append('SCRAPING FAILED')
    processed_urls.append(u)
    if i % 1000 == 0:
        save_progress(i)

In [None]:
# Phrases (one or two words) that count as a dual-language mention.
DL_TERMS = {
    'dual language',
    'dl',
    'dual-language',
    'two-way',
    'duallanguage',
    'twoway',
    'two way',
    'language immersion',
}

In [None]:
# Tag dual language mentions
def has_dl_string_list(tokens):
    """Tell whether a token sequence mentions any phrase from ``DL_TERMS``.

    ASCII punctuation (``string.punctuation``) is removed from every token,
    then all single tokens and all pairs of adjacent tokens (joined by one
    space) are compared with ``DL_TERMS``.

    Parameters
    ----------
    tokens : sequence of str
        Words in document order.

    Returns
    -------
    bool or float
        ``True``/``False`` for a usable token list; ``np.nan`` when the input
        cannot be processed (for example ``None`` or non-string tokens).
    """
    try:
        # Build the translation table once rather than once per token.
        strip_punct = str.maketrans('', '', string.punctuation)
        cleaned = [token.translate(strip_punct) for token in tokens]
        candidates = set(cleaned)
        candidates.update(f'{first} {second}' for first, second in zip(cleaned, cleaned[1:]))
        return not DL_TERMS.isdisjoint(candidates)
    except Exception:
        # Best-effort contract: unusable input yields NaN. Exception (not a bare
        # except) so that KeyboardInterrupt still stops a long run.
        return np.nan

In [None]:
import ast
import numpy as np

In [None]:
import bs4

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from langdetect import detect
import tldextract
import bs4
import string

In [None]:
def process_subpage_result(response):
    """Return True when the main page or any subpage mentions a dual-language term.

    Parameters
    ----------
    response : dict or other
        A scrape result with ``'soup_main'`` and optionally ``'soup_subs'``
        (a list of pages). Anything that is not a dict, such as a failure
        marker string, counts as "no mention".

    Returns
    -------
    bool

    Each page is checked on its own: a page that is ``None`` or that raises
    ``TypeError`` during word extraction is skipped without discarding the
    pages after it, and an undetermined (NaN) result for one page does not
    mask a positive result from another.
    """
    if not isinstance(response, dict):
        return False

    pages = [response.get('soup_main')]
    subpages = response.get('soup_subs')
    if isinstance(subpages, (list, tuple)):
        pages.extend(subpages)

    flags = []
    for page in pages:
        if page is None:  # failed request
            continue
        try:
            flags.append(has_dl_string_list(extract_words(page)))
        except TypeError:
            continue
    # bool(nan) is True, so NaN flags must be filtered out explicitly.
    return any(pd.notna(flag) and bool(flag) for flag in flags)

In [None]:
# Assemble the scraped pages into one table (ddd): url, main-page content, NCES id.
data = dict()  # not used below in this notebook; kept in case other cells rely on it
dmap = get_url_map()
durls, ddat, dnces = [], [], []
dfs = []  # not used below in this notebook; kept in case other cells rely on it
for url, dat in tqdm(zip(processed_urls, responses)):
    durls.append(url)
    ddat.append(dat)
    # .get(): a scraped URL that is absent from the NCES map gets None instead
    # of raising KeyError and aborting the whole cell.
    dnces.append(dmap.get(url))
ddd = pd.DataFrame({
    'url': durls,
    # dict responses contribute their main page; anything else (e.g. a failure
    # marker) is stored as is
    'scrape_data': [d.get('soup_main') if isinstance(d, dict) else d for d in ddat],
    'nces_id': dnces
})

## Postprocessing (optional)

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from langdetect import detect
import tldextract
import bs4

In [None]:
# Function to strip parameters and standardize link format
def standardize_links(text):
    """List the ``domain.suffix`` form of every link target found in ``text``.

    ``text`` may be an already parsed ``BeautifulSoup`` object or raw markup,
    which is parsed with ``html.parser``. Every anchor with a non-empty
    ``href`` contributes one entry, built from the ``domain`` and ``suffix``
    fields of ``tldextract.extract``; entries are returned in document order
    and are not de-duplicated.
    """
    soup = text if isinstance(text, bs4.BeautifulSoup) else BeautifulSoup(text, 'html.parser')
    targets = (anchor.get('href') for anchor in soup.find_all('a'))
    split_targets = (tldextract.extract(target) for target in targets if target)
    return [f"{parts.domain}.{parts.suffix}" for parts in split_targets]

# Function to extract words from HTML text elements
def extract_words(text, remove_sw=True):
    """Return the lower-cased words of the text nodes in an HTML document.

    Parameters
    ----------
    text : bs4.BeautifulSoup or str
        Parsed document, or raw markup that is parsed with ``html.parser``.
    remove_sw : bool
        When true, words found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        Whitespace-separated tokens in document order. Text whose direct
        parent is a ``style``, ``script``, ``head`` or ``title`` element is
        ignored, and ``http(s)://`` URLs are removed before splitting.
    """
    soup = text if isinstance(text, bs4.BeautifulSoup) else BeautifulSoup(text, 'html.parser')
    skipped_parents = {'style', 'script', 'head', 'title'}
    words = []

    # `string=` is the current name of the argument formerly called `text=`,
    # which Beautiful Soup has deprecated.
    for node in soup.find_all(string=True):
        if node.parent.name in skipped_parents:
            continue
        # Remove non-words (such as links)
        visible = re.sub(r'https?://\S+', '', node.strip())
        words.extend(visible.lower().split())

    if remove_sw:
        stop_words = set(stopwords.words('english'))
        # words are already lower-cased above
        words = [word for word in words if word not in stop_words]

    return words

# Function to detect social media platforms
def detect_social_media(text):
    """Find Twitter, Facebook, Instagram and LinkedIn profile links in a page.

    ``text`` may be a parsed ``BeautifulSoup`` object or raw markup (parsed
    with ``html.parser``). The document is converted back to a string and
    searched with one regular expression per platform.

    Returns a list of ``(platform, handle)`` tuples, grouped by platform in
    the order listed above and, within a platform, in order of appearance.
    ``handle`` is the first path segment after the host name.
    """
    soup = text if isinstance(text, bs4.BeautifulSoup) else BeautifulSoup(text, 'html.parser')
    markup = str(soup)

    # Add social media platform names and patterns here
    platform_patterns = (
        ("Twitter", r"(?:http|https):\/\/(?:www\.)?twitter\.com\/([a-zA-Z0-9_]+)"),
        ("Facebook", r"(?:http|https):\/\/(?:www\.)?facebook\.com\/([a-zA-Z0-9.]+)"),
        ("Instagram", r"(?:http|https):\/\/(?:www\.)?instagram\.com\/([a-zA-Z0-9_]+)"),
        ("LinkedIn", r"(?:http|https):\/\/(?:www\.)?linkedin\.com\/([a-zA-Z0-9\-]+)"),
    )

    return [
        (platform, handle)
        for platform, pattern in platform_patterns
        for handle in re.findall(pattern, markup)
    ]

# Function to detect the languages used on a page
def get_langs(s):
    """Run ``cld2.detect`` (with ``returnVectors=True``) on the words of page ``s``.

    The words come from ``extract_words`` with stop words kept and are joined
    with single spaces. The value returned by ``cld2.detect`` is passed
    through unchanged.
    """
    page_text = " ".join(extract_words(s, remove_sw=False))
    return cld2.detect(page_text, returnVectors=True)

def process_html_string(html_string):
    """Derive link, word, social-media and language features from one page.

    Parameters
    ----------
    html_string : bs4.BeautifulSoup, str or list
        The page to analyse. For a list only the first element is used; an
        empty list is treated like a missing page.

    Returns
    -------
    dict
        ``standardized_links``, ``words_without_stopwords``,
        ``social_media_platforms`` and ``langs``. Each extractor runs
        independently; when one fails, its entry falls back to ``[]``
        (``''`` for ``langs``) and the others are still computed.
    """
    if isinstance(html_string, list):
        # Guard against [] so an empty result does not raise IndexError.
        html_string = html_string[0] if html_string else None

    def _best_effort(extractor, fallback):
        # Run one extractor; Exception (not a bare except) keeps
        # KeyboardInterrupt able to stop a long progress_map run.
        try:
            return extractor(html_string)
        except Exception:
            return fallback

    return {
        "standardized_links": _best_effort(standardize_links, []),
        "words_without_stopwords": _best_effort(extract_words, []),
        "social_media_platforms": _best_effort(detect_social_media, []),
        "langs": _best_effort(get_langs, ''),
    }

In [None]:
# Register tqdm with pandas so that Series offer progress_map.
tqdm.pandas()

# Run process_html_string on every scraped page; tmp holds one result per row of ddd.
tmp = ddd['scrape_data'].progress_map(process_html_string)

In [None]:
# Collect the keys of all result dictionaries in first-seen order. A list is
# used rather than a set so the column order of the resulting frame (and of
# the CSV written from it) is the same on every run.
all_keys = list(dict.fromkeys(key for record in tmp for key in record))

# One column per key; a dictionary lacking a key contributes None
data_dict = {key: [record.get(key) for record in tmp] for key in all_keys}

# Convert the dictionary to a DataFrame
df = pd.DataFrame(data_dict)

In [None]:
# Place the scraped-page table (ddd) and the derived feature columns (df) side by side;
# with axis=1 pandas matches rows by index label.
out = pd.concat([ddd, df], axis=1)

In [None]:
# Write the combined table to disk without the index column.
out.to_csv('1-scraped-data-final-schools.csv', index=False)