# KEC Educational Content Analysis with Machine Learning
## Web Scraping and Classification System for Form 1-4 Content

This notebook scrapes Form 1-4 educational content from the Kenya Education Cloud (KEC) website, https://kec.ac.ke/, and builds ML models for content classification and question answering.

In [1]:
# Install required packages
!pip install requests beautifulsoup4 selenium pandas numpy scikit-learn nltk transformers torch
!pip install webdriver-manager lxml matplotlib seaborn wordcloud

Defaulting to user installation because normal site-packages is not writeable
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.5-py3-none-any.whl.metadata (3.8 kB)
Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Do


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting lxml
  Downloading lxml-6.0.1-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.6-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp312-cp312-win_amd64.whl.metadata (3.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.2-cp312-cp312-win_amd64.whl.metadata (111 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp312-cp312-win_amd64.whl.metada


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import re
from urllib.parse import urljoin, urlparse
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import warnings
warnings.filterwarnings('ignore')

In [3]:
# ML and NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Download NLTK data.
# 'punkt_tab' is the tokenizer resource loaded by word_tokenize/sent_tokenize on
# NLTK >= 3.9; 'punkt' is kept for older releases. On an NLTK version whose index
# does not know a name, nltk.download() reports the error and returns False
# instead of raising, so requesting both is safe.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to C:\Users\Denno/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Denno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Denno/nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\Denno/nltk_data...


True

## 1. Web Scraping KEC Website

In [5]:
class KECWebScraper:
    """Scrape Form 1-4 content pages linked from the KEC landing page.

    Typical use is ``KECWebScraper().scrape_all_content()``, which returns a
    list of per-page dicts (also kept on ``self.scraped_data``).
    """

    # Lower-cased substrings that mark a link as secondary-school (Form 1-4) content.
    FORM_KEYWORDS = ('form 1', 'form 2', 'form 3', 'form 4')

    def __init__(self, timeout=30, request_delay=1):
        """Create the HTTP session used for all requests.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled server would block the cell forever.
            request_delay: Seconds to pause after each content-page request.
        """
        self.base_url = "https://kec.ac.ke/"
        self.timeout = timeout
        self.request_delay = request_delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.scraped_data = []

    def setup_selenium_driver(self):
        """Build and return a headless Chrome WebDriver for dynamic content.

        Not called by the other methods of this class; the caller owns the
        returned driver and is responsible for calling ``quit()`` on it.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)

    @staticmethod
    def _clean_text(text):
        """Collapse raw page text into a single space-separated string.

        Each line is stripped, split on double spaces, and the non-empty
        pieces are joined with single spaces.
        """
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return ' '.join(chunk for chunk in chunks if chunk)

    def scrape_main_page(self):
        """Scrape the main KEC page to find educational content links.

        Returns:
            A dict mapping section name to a list of link dicts with keys
            ``url``, ``title`` and ``level``. Only the ``secondary`` section is
            populated (links whose text mentions Form 1-4); each URL is listed
            once even if it is linked several times on the page. On any error
            the error is printed and an empty dict is returned.
        """
        try:
            response = self.session.get(self.base_url, timeout=self.timeout)
            # Treat HTTP error responses as failures instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find education level sections
            education_sections = {
                'pre_primary': [],
                'primary': [],
                'junior_secondary': [],
                'secondary': []
            }

            # Look for grade/form links, keeping the first occurrence of each URL.
            seen_urls = set()
            for link in soup.find_all('a', href=True):
                title = link.get_text().strip()
                text = title.lower()
                if not any(keyword in text for keyword in self.FORM_KEYWORDS):
                    continue

                url = urljoin(self.base_url, link.get('href'))
                if url in seen_urls:
                    continue
                seen_urls.add(url)

                education_sections['secondary'].append({
                    'url': url,
                    'title': title,
                    'level': 'secondary'
                })

            return education_sections
        except Exception as e:
            # Best-effort: report and let the caller continue with no sections.
            print(f"Error scraping main page: {e}")
            return {}

    def scrape_content_page(self, url, title, level):
        """Scrape one content page and return its visible text.

        Args:
            url: Page to fetch.
            title: Link title, copied into the result.
            level: Education level label, copied into the result.

        Returns:
            A dict with keys ``url``, ``title``, ``level``, ``content``,
            ``word_count`` and ``scraped_at`` (local time,
            ``YYYY-MM-DD HH:MM:SS``), or ``None`` if the request or parsing
            failed (the error is printed).
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            # Skip HTTP error responses rather than storing an error page as content.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements so only visible text remains
            for tag in soup(["script", "style"]):
                tag.decompose()

            content_text = self._clean_text(soup.get_text())

            return {
                'url': url,
                'title': title,
                'level': level,
                'content': content_text,
                'word_count': len(content_text.split()),
                'scraped_at': time.strftime('%Y-%m-%d %H:%M:%S')
            }
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return None

    def scrape_all_content(self):
        """Main method to scrape all educational content.

        Discovers links via ``scrape_main_page`` and scrapes each one, pausing
        ``request_delay`` seconds after every request. URLs already present in
        ``self.scraped_data`` are skipped, so calling this method again does
        not append duplicate records.

        Returns:
            ``self.scraped_data``, the accumulated list of page dicts.
        """
        print("Starting KEC website scraping...")

        # Get main page structure
        sections = self.scrape_main_page()

        scraped_urls = {page['url'] for page in self.scraped_data}

        # Scrape each section
        for section_name, links in sections.items():
            print(f"Scraping {section_name} section...")
            for link_info in links:
                url = link_info['url']
                if url in scraped_urls:
                    continue

                content = self.scrape_content_page(
                    url,
                    link_info['title'],
                    link_info['level']
                )
                if content:
                    self.scraped_data.append(content)
                    scraped_urls.add(url)
                if self.request_delay > 0:
                    time.sleep(self.request_delay)  # Be respectful to the server

        return self.scraped_data

In [6]:
# Initialize and run scraper
scraper = KECWebScraper()
scraped_content = scraper.scrape_all_content()

print(f"Scraped {len(scraped_content)} pages of content")

# Always define df_content (with the expected columns), so any later cell that
# uses it does not fail with a NameError when nothing was scraped.
df_content = pd.DataFrame(
    scraped_content,
    columns=['url', 'title', 'level', 'content', 'word_count', 'scraped_at'],
)

if df_content.empty:
    print("No content was scraped; check network access and the link filter.")
else:
    # Identical text under different URLs suggests the site served the same page
    # for each link (e.g. a redirect), which would make per-form labels meaningless.
    distinct_texts = df_content['content'].nunique()
    if distinct_texts < len(df_content):
        print(
            f"Warning: only {distinct_texts} distinct page text(s) across "
            f"{len(df_content)} URLs - the pages may be serving the same content."
        )
    print(df_content.head())

Starting KEC website scraping...
Scraping pre_primary section...
Scraping primary section...
Scraping junior_secondary section...
Scraping secondary section...
Scraped 4 pages of content
                                                 url   title      level  \
0  https://lms.kec.ac.ke/course/index.php?categor...  Form 1  secondary   
1  https://lms.kec.ac.ke/course/index.php?categor...  Form 2  secondary   
2  https://lms.kec.ac.ke/course/index.php?categor...  Form 3  secondary   
3  https://lms.kec.ac.ke/course/index.php?categor...  Form 4  secondary   

                                             content  word_count  \
0  Digital Content | Kenya Education Cloud | Unli...        5375   
1  Digital Content | Kenya Education Cloud | Unli...        5375   
2  Digital Content | Kenya Education Cloud | Unli...        5375   
3  Digital Content | Kenya Education Cloud | Unli...        5375   

            scraped_at  
0  2025-09-07 23:50:27  
1  2025-09-07 23:50:28  
2  2025-09-07 23:50:3