In [6]:
!pip install fake_useragent 
!pip install openai

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/19/92/d0fe64512dba89aa68f4eb432fb9d7b2d2a471a011dfea389d9d42e000d2/openai-1.33.0-py3-none-any.whl.metadata
  Downloading openai-1.33.0-py3-none-any.whl.metadata (21 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Obtaining dependency information for distro<2,>=1.7.0 from https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl.metadata
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Downloading openai-1.33.0-py3-none-any.whl (325 kB)
   ---------------------------------------- 0.0/325.5 kB ? eta -:--:--
   --- ------------------------------------ 30.7/325.5 kB 1.3 MB/s eta 0:00:01
   --- ------------------------------------ 30.7/325.



In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from fake_useragent import UserAgent
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
combined_df = pd.DataFrame()
ua = UserAgent()

def fetch_data(url, max_retries=3):
    retries = 0
    session = requests.Session()
    headers = {
        'User-Agent': ua.chrome,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    }

    while retries < max_retries:
        try:
            response = session.get(url, headers=headers, timeout=10)
            if response.status_code == 403:
                print(f"Access denied: {response.status_code} for {url}")
            return response
        except requests.RequestException as e:
            print(f"Error: {e}")
            retries += 1
            time.sleep(1)  # Wait for a second before retrying

    print(f"Failed to fetch data from {url}")
    return None

def extract_table_data(url):
    response = fetch_data(url)
    if response is None or not response.ok:
        return pd.DataFrame()

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    table_data = []

    for row in table.find_all("tr"):
        row_data = [cell.text.strip() for cell in row.find_all(["td", "th"])]
        row_links = [urljoin(url, cell.find("a")["href"]) if cell.find("a") else None for cell in row.find_all(["td", "th"])]
        table_data.append(row_data + row_links)

    df = pd.DataFrame(table_data)
    return df

def fetch_description(link):
    response = fetch_data(link)
    if response is not None and response.ok:
        desc_soup = BeautifulSoup(response.text, "html.parser")
        return desc_soup.get_text().strip()
    return ""

def process_urls(urls):
    combined_df = pd.DataFrame()

    for url in urls:
        df = extract_table_data(url)
        #print(df)
        if not df.empty:
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)

            df['Description'] = df['Scholarship Title'].apply(lambda title: fetch_description(title) if 'http' in title else "")

            combined_df = pd.concat([combined_df, df], ignore_index=True)
            #print(combined_df)

    return combined_df

def clean_and_export(combined_df):
    combined_df.drop_duplicates(subset='Scholarship Title', inplace=True)
    
    combined_df[['Amount1', 'Amount2']] = combined_df['Amount'].str.split(':', n=1, expand=True)
    combined_df['Amount'] = combined_df['Amount2']
    
    combined_df.drop(columns=['Amount1', 'Amount2'], inplace=True)
    
    combined_df[['Due Date1', 'Due Date2']] = combined_df['Due Date'].str.split(':', n=1, expand=True)
    combined_df['Due Date'] = combined_df['Due Date2']
    combined_df.drop(columns=['Due Date1', 'Due Date2'], inplace=True)
    combined_df['Amount'] =  combined_df['Amount'].str.replace(' ', '')
    combined_df['Due Date'] =  combined_df['Due Date'].str.replace(' ', '')
    combined_df.replace({'\r': '', '\n': ''}, regex=True, inplace=True)
    print(combined_df.columns)
    keep_columns = combined_df.columns.drop(combined_df.columns[4])
    combined_df = combined_df[keep_columns]
    combined_df.to_csv("scholarships_data.csv", index=False)
    
    return combined_df

if __name__ == "__main__":
    # Example usage:
    urls = [
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/accounting',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/actuarial-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/advertising-and-public-relations',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/aerospace-technologies-and-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/agriculture-agribusiness',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/agronomy-and-soils',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/aircraft-maintenance',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/airline-flight-attendant',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/animal-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/anthropology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/applied-science-and-technology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/archaeology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/architecture',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/art',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/art-history',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/asian-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/astrophysics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/atmospheric-and-oceanic-sciences',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/audiology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/automotive',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/aviation',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/biblical-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/biochemistry',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/biology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/botany',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/broadcasting-visual-and-interactive-media',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/business',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/business-administration',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/business-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/carpentry',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/cartography-and-geographic-information-systems',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/chemical-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/chemistry',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/child-and-adolescent-development',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/chiropractic',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/civil-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/communication-networks-and-security',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/communications',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/comparative-literature',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/computer-aided-drafting-and-design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/computer-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/computer-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/construction-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/cosmetology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/creative-writing',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/criminal-justice',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/culinary-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/cybersecurity',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/dance',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/dental-hygiene',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/dentistry',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/earth-sciences-and-natural-resources',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/ecology-and-evolutionary-biology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/economics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/education',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/electrical-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/electronics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/emergency-health-services-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/english',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/entomology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/entrepreneurship',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/environmental-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/environmental-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/equine-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/family-and-consumer-sciences',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/fashion-and-retail-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/film-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/film-television-and-interactive-media',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/finance',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/fine-arts',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/fire-protection-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/firefighting',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/food-science-and-human-nutrition',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/foreign-languages-cultures',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/forensic-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/forestry-fisheries-and-wildlife',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/funeral-services-mortuary',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/game-design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/genetics-genomics-and-development',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/geography-area-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/geology-and-geophysics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/government',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/graphic-design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/health-care-administration',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/health-education-and-promotion',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/history',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/horticulture',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/hotel-and-restaurant-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/human-resources',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/human-services',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/humanities',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/hvac',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/information-systems',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/insurance',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/interior-design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/international-affairs',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/international-business',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/jewish-studies-judaism',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/journalism-and-public-relations',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/kinesiology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/labor-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/landscape-architecture',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/law-school-legal-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/liberal-arts',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/library-sciences',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/lighting-design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/linguistics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/logistics-supply-chain-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/marine-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/marketing',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/materials-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/mathematics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/mechanical-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/medical-technology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/medicine',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/microbiology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/molecular-and-cell-biology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/museum-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/music',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/native-american-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/neuroscience-and-behavioral-biology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/new-media',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/nuclear-science-and-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/nursing-nurse-practitioner',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/nutrition',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/occupational-therapy',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/oncology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/optometry-ophthalmology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/orthotics-prosthetics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/osteopathic',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/paralegal',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/paramedic-emt',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/park-and-recreation-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/pediatrics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/pharmaceutical-sciences',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/philosophy',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/photography',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/physical-education-sport-and-physical-activity',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/physical-therapy-rehabilitation',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/physician-associate',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/physics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/physiology-and-neurobiology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/plumbing',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/police-law-enforcement',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/political-science',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/polymer-and-fiber-engineering',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/project-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/psychiatry',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/psychology-counseling',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/public-health',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/public-policy',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/radiologic-sciences',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/real-estate',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/religious-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/science-general',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/science-health',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/science-social',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/social-work',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/sociology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/spanish',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/special-education',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/speech-language-and-hearing-sciences',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/sport-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/sports-medicine',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/statistics',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/theater-and-performance-studies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/theology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/tourism-management',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/transportation-technologies',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/veterinary-medicine',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/viticulture-and-enology',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/vocational-careers',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/web-design',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/welding',
        'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major/womens-and-gender-studies',
    ]

    data = process_urls(urls)
    #print(data)
    clean_and_export(data)
    
import pandas as pd
from datetime import datetime
import re
from fuzzywuzzy import fuzz, process

# Function to clean and format dates
def clean_date(date_str):
    # List of month names
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]
    
    # Regular expression to capture the month, day, and year
    date_pattern = re.compile(r'([A-Za-z]+)\s?(\d{1,2}),?\s?(\d{4})')
    
    # Find matches using regex
    match = date_pattern.match(date_str)
    if match:
        month, day, year = match.groups()
        # Ensure proper formatting
        cleaned_date_str = f"{month} {int(day):02d}, {year}"
        return cleaned_date_str
    else:
        # Attempt fuzzy matching for months if regex fails
        best_match, score = process.extractOne(date_str, months)
        if score > 80:
            # Reconstruct date string with best match
            return best_match
        return date_str

def process_scholarships_data():
    # Use combined_df from previous steps
    combined_df = pd.read_csv("processed_scholarships_data.csv", delimiter=",", encoding="utf-8", low_memory=False)

    # Ensure the columns are of type text
    combined_df = combined_df.astype({"Scholarship Title": str, "Amount": str, "Due Date": str})

    # Apply the cleaning function to the "Due Date" column
    combined_df["Due Date"] = combined_df["Due Date"].apply(clean_date)

    # Convert Due Date to datetime
    combined_df["Due Date"] = pd.to_datetime(combined_df["Due Date"], format="%B %d, %Y", errors='coerce')

    # Drop rows where "Due Date" is NaT (null)
    combined_df = combined_df.dropna(subset=["Due Date"])

    # Add Custom column
    current_date = datetime.now().date()
    combined_df["Days til due"] = combined_df["Due Date"].apply(lambda x: (x.date() - current_date).days if pd.notnull(x) and (x.date() - current_date).days > 0 else 0)

    # Reformat the Due Date to month-day-year
    combined_df["Due Date"] = combined_df["Due Date"].dt.strftime("%m-%d-%Y")

    # Reorder columns
    combined_df = combined_df[["Days til due", "Amount", "Scholarship Title", "Due Date"]]

    # Sort rows by "Days til due"
    combined_df = combined_df.sort_values(by="Days til due")

    # Reset index
    combined_df = combined_df.reset_index(drop=True)

    # Save the updated CSV file
    combined_df.to_csv("processed_scholarships_data.csv", index=False)

    # Convert DataFrame to HTML and save to file
    html_table = combined_df.to_html(index=False)
    with open("scholarships_data.html", "w") as f:
        f.write(html_table)

    return combined_df

if __name__ == "__main__":
    processed_df = process_scholarships_data()
    print("Processed data has been updated and saved to scholarships_data.html")


Index([                                                                                                                                                                    '',
                                                                                                                                                          'Scholarship Title',
                                                                                                                                                                     'Amount',
                                                                                                                                                                   'Due Date',
                                                                                                                                                                         None,
                          'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-maj

In [16]:
import pandas as pd
from datetime import datetime
import re
from fuzzywuzzy import fuzz, process

# Use combined_df from previous steps
combined_df = pd.read_csv("scholarships_data.csv", delimiter=",", encoding="utf-8", low_memory=False)

# Ensure the columns are of type text
combined_df = combined_df.astype({"Scholarship Title": str, "Amount": str, "Due Date": str})

# Define a function to clean and format dates
def clean_date(date_str):
    # List of month names
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]
    
    # Regular expression to capture the month, day, and year
    date_pattern = re.compile(r'([A-Za-z]+)\s?(\d{1,2}),?\s?(\d{4})')
    
    # Find matches using regex
    match = date_pattern.match(date_str)
    if match:
        month, day, year = match.groups()
        # Ensure proper formatting
        cleaned_date_str = f"{month} {int(day):02d}, {year}"
        return cleaned_date_str
    else:
        # Attempt fuzzy matching for months if regex fails
        best_match, score = process.extractOne(date_str, months)
        if score > 80:
            # Reconstruct date string with best match
            return best_match
        return date_str

# Apply the cleaning function to the "Due Date" column
combined_df["Due Date"] = combined_df["Due Date"].apply(clean_date)

# Convert Due Date to datetime
combined_df["Due Date"] = pd.to_datetime(combined_df["Due Date"], format="%B %d, %Y", errors='coerce')

# Drop rows where "Due Date" is NaT (null)
combined_df = combined_df.dropna(subset=["Due Date"])

# Add Custom column
current_date = datetime.now().date()
combined_df["Days til due"] = combined_df["Due Date"].apply(lambda x: (x.date() - current_date).days if pd.notnull(x) and (x.date() - current_date).days > 0 else 0)

# Reformat the Due Date to month-day-year
combined_df["Due Date"] = combined_df["Due Date"].dt.strftime("%m-%d-%Y")

# Reorder columns
combined_df = combined_df[["Days til due", "Amount", "Scholarship Title", "Due Date"]]

# Sort rows by "Days til due"
combined_df = combined_df.sort_values(by="Days til due")

# Reset index
combined_df = combined_df.reset_index(drop=True)

print(combined_df)
html_table = combined_df.to_html(index=False)
with open("scholarships_data.html", "w") as f:
    f.write(html_table)
# Save the processed data to a new CSV file
combined_df.to_csv("processed_scholarships_data.csv", index=False)

      Days til due   Amount  \
0                0   $5,000   
1                0   $1,500   
2                0   $3,000   
3                1   $5,000   
4                3  $10,000   
...            ...      ...   
2314           363   $5,000   
2315           363   $1,000   
2316           363   $1,000   
2317           363   $5,000   
2318           448   $4,000   

                                      Scholarship Title    Due Date  
0         Reason One Mentorship and Scholarship Program  06-09-2024  
1                         Desire to Inspire Scholarship  06-09-2024  
2           Kentucky Nursing Incentive Scholarship Fund  06-08-2024  
3     Joseph Zukin Jr. Scholarship For Entrepreneurship  06-10-2024  
4     Anarcha, Betsy, and Lucy Memorial Scholarship ...  06-12-2024  
...                                                 ...         ...  
2314            ACAA Educational Foundation Scholarship  06-07-2025  
2315             The Ryan J. Gibbs Business Scholarship  06-07-2025

In [4]:
script_content = """
import pandas as pd
from datetime import datetime
import re
from fuzzywuzzy import fuzz, process

# Use combined_df from previous steps
combined_df = pd.read_csv("scholarships_data.csv", delimiter=",", encoding="utf-8", low_memory=False)

# Ensure the columns are of type text
combined_df = combined_df.astype({"Scholarship Title": str, "Amount": str, "Due Date": str})

# Define a function to clean and format dates
def clean_date(date_str):
    # List of month names
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]
    
    # Regular expression to capture the month, day, and year
    date_pattern = re.compile(r'([A-Za-z]+)\s?(\d{1,2}),?\s?(\d{4})')
    
    # Find matches using regex
    match = date_pattern.match(date_str)
    if match:
        month, day, year = match.groups()
        # Ensure proper formatting
        cleaned_date_str = f"{month} {int(day):02d}, {year}"
        return cleaned_date_str
    else:
        # Attempt fuzzy matching for months if regex fails
        best_match, score = process.extractOne(date_str, months)
        if score > 80:
            # Reconstruct date string with best match
            return best_match
        return date_str

# Apply the cleaning function to the "Due Date" column
combined_df["Due Date"] = combined_df["Due Date"].apply(clean_date)

# Convert Due Date to datetime
combined_df["Due Date"] = pd.to_datetime(combined_df["Due Date"], format="%B %d, %Y", errors='coerce')

# Drop rows where "Due Date" is NaT (null)
combined_df = combined_df.dropna(subset=["Due Date"])

# Add Custom column
current_date = datetime.now().date()
combined_df["Days til due"] = combined_df["Due Date"].apply(lambda x: (x.date() - current_date).days if pd.notnull(x) and (x.date() - current_date).days > 0 else 0)

# Reformat the Due Date to month-day-year
combined_df["Due Date"] = combined_df["Due Date"].dt.strftime("%m-%d-%Y")

# Reorder columns
combined_df = combined_df[["Days til due", "Amount", "Scholarship Title", "Due Date"]]

# Sort rows by "Days til due"
combined_df = combined_df.sort_values(by="Days til due")

# Reset index
combined_df = combined_df.reset_index(drop=True)

print(combined_df)
html_table = combined_df.to_html(index=False)
with open("scholarships_data.html", "w") as f:
    f.write(html_table)
# Save the processed data to a new CSV file
combined_df.to_csv("processed_scholarships_data.csv", index=False)
"""

with open("process_scholarships.py", "w") as file:
    file.write(script_content)


In [5]:
from flask import Flask, send_file

app = Flask(__name__)

@app.route('/')
def index():
    return send_file('scholarships_data.html')

if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
!pip install schedule

In [None]:
import schedule
import time
import subprocess

def job():
    subprocess.run(["python", "process_scholarships.py"])

# Schedule the job every day at midnight
schedule.every().day.at("00:00").do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
