In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from goose3 import Goose
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import hstack
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
import pickle
import os

In [None]:
# Load the labelled URL list. The scraping loop in this cell reads the
# `website_url` and `Category` columns from each row of this frame.
file_path = 'data/website_classification.csv'
website_data = pd.read_csv(file_path)

# Initialize one Goose extractor; fetch_website_data uses it through the
# module-level name `g`.
g = Goose()

# Function to fetch HTML content and metadata
def fetch_website_data(url):
    """Fetch Goose metadata and the raw HTML for ``url``.

    Uses the module-level Goose instance ``g`` for article extraction and a
    separate ``requests.get`` (10 s timeout) for the raw page source.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys ``title``, ``description``, ``lang``,
        ``keywords``, ``favicon``, ``canonical``, ``encoding``, ``domain``,
        ``image``, ``cleaned_text``, ``authors``, ``publish_date`` and
        ``html_content``; or ``None`` when Goose returns nothing or any step
        raises. Failures are best-effort: they are reported and skipped, never
        propagated to the caller.
    """
    try:
        # Use Goose to extract additional information
        article = g.extract(url=url)
        if not article:
            return None

        # Gather metadata and other info
        info = article.infos
        # Look the nested mapping up once; `or {}` also covers a present-but-None value.
        meta = info.get('meta') or {}
        metadata = {
            'title': info.get('title'),
            'description': meta.get('description'),
            'lang': meta.get('lang'),
            'keywords': meta.get('keywords'),
            'favicon': meta.get('favicon'),
            'canonical': meta.get('canonical'),
            'encoding': meta.get('encoding'),
            'domain': info.get('domain'),
            'image': info.get('image'),
            'cleaned_text': info.get('cleaned_text'),
            'authors': info.get('authors'),
            'publish_date': info.get('publish_date')
        }

        # Additionally, fetch raw HTML content
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        metadata['html_content'] = response.text

        return metadata
    except Exception as exc:
        # Still best-effort, but say why the site was dropped instead of
        # discarding the error; tqdm.write keeps active progress bars intact.
        tqdm.write(f"Skipping {url}: {type(exc).__name__}: {exc}")
        return None

# Create a new list to store data with successfully fetched HTML content and layout categories
html_data = []

# Iterate through the DataFrame with a progress bar
for _, row in tqdm(website_data.iterrows(), total=len(website_data), desc="Processing websites"):
    url = row['website_url']
    category = row['Category']

    # Keep the per-site result under its own name. Rebinding `website_data`
    # here would replace the DataFrame being iterated, so the cell would only
    # work once per kernel session.
    site_record = fetch_website_data(url)

    if site_record:
        # Append only successfully fetched data; `layout_type` mirrors the
        # category label.
        site_record.update({
            'website_url': url,
            'category': category,
            'layout_type': category
        })
        html_data.append(site_record)

    # Pause to avoid overloading the server
    time.sleep(0.5)

# Convert the results into a DataFrame
html_df = pd.DataFrame(html_data)


In [None]:
# Preview the first rows of the scraped frame.
html_df.head()

In [None]:
# Persist the scraped records without the index column. A backslash is set as
# the CSV escape character (presumably because raw HTML fields contain
# characters the writer must escape -- TODO confirm it round-trips on reload).
html_df.to_csv('data/website_metadata_content.csv', index=False, escapechar='\\')

In [None]:
# Read the scraped metadata and HTML back from disk
html_df = pd.read_csv('data/website_metadata_content.csv')

# HTML tags whose occurrences are counted as structural features
tags = [
    'div', 'p', 'a', 'button', 'input', 'form', 'img',
    'li', 'ul', 'ol',
    'h1', 'h2', 'h3',
    'section', 'article', 'header', 'footer', 'nav',
    'video', 'audio', 'canvas',
]

# Function to extract HTML tag features
def extract_html_features(html_content):
    """Turn one HTML document into a flat list of structural features.

    Args:
        html_content: Markup of the page as a string.

    Returns:
        A list holding one occurrence count per entry of the module-level
        ``tags`` list (in that order), followed by the ratio of visible-text
        length to markup length (0 for empty markup).
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # One count per tag name, keyed so the order follows `tags`.
    counts_by_tag = {}
    for tag_name in tags:
        counts_by_tag[tag_name] = len(parsed.find_all(tag_name))

    markup_size = len(html_content)
    visible_size = len(parsed.get_text())
    if markup_size:
        ratio = visible_size / markup_size
    else:
        ratio = 0

    return [*counts_by_tag.values(), ratio]

# Apply feature extraction with a progress bar
print("Extracting HTML features...")
html_features = [
    extract_html_features(x) if pd.notnull(x) else [0] * (len(tags) + 1)
    for x in tqdm(html_df['html_content'], desc="Processing HTML content")
]

# Convert extracted features to a DataFrame
html_feature_columns = [f'{tag}_count' for tag in tags] + ['text_html_ratio']
html_features_df = pd.DataFrame(html_features, columns=html_feature_columns)
processed_df = pd.concat([html_df, html_features_df], axis=1)

# TF-IDF Vectorization for Text Fields (one vectorizer per field, kept for inference)
# NOTE(review): the vocabularies are still fitted on all rows, before the
# train/test split below. This uses no labels, but fitting on the training
# rows only would be stricter.
text_fields = ['title', 'description', 'cleaned_text', 'keywords']
tfidf_vectorizers = {}
text_vectors = []

print("Vectorizing text fields with TF-IDF...")
for field in text_fields:
    vectorizer = TfidfVectorizer(max_features=500)
    tfidf_matrix = vectorizer.fit_transform(processed_df[field].fillna(''))
    tfidf_vectorizers[field] = vectorizer
    text_vectors.append(tfidf_matrix)

# Combine TF-IDF vectors horizontally
combined_text_vectors = hstack(text_vectors)

# Domain-specific NLP embeddings with Sentence-BERT for titles and descriptions
print("Generating embeddings with Sentence-BERT...")
# Named `sentence_model` rather than `model`: a later cell binds `model` to the
# unpickled random forest, and the two must not share a name.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
# Pass plain lists so encode() never has to index into a pandas Series.
title_embeddings = sentence_model.encode(processed_df['title'].fillna('').tolist(), show_progress_bar=True)
description_embeddings = sentence_model.encode(processed_df['description'].fillna('').tolist(), show_progress_bar=True)

# Stack features in a fixed column order: TF-IDF (title, description,
# cleaned_text, keywords), HTML features, title embedding, description
# embedding. Inference must rebuild exactly this layout.
html_numerical_features = processed_df[html_feature_columns].values
X = hstack([combined_text_vectors, html_numerical_features, title_embeddings, description_embeddings]).tocsr()

# Encode target labels
y = processed_df['category']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split FIRST, stratified so every category keeps its share.
# Oversampling before the split would put synthetic points interpolated from
# test rows into the training set and synthetic rows into the test set,
# inflating every reported metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Use SMOTE for class balancing -- on the training portion only
print("Applying SMOTE for balancing classes...")
# SMOTE needs more samples per class than k_neighbors; shrink k (default 5)
# when the rarest training class is too small.
_, train_class_counts = np.unique(y_train_raw, return_counts=True)
smote_neighbors = max(1, min(5, int(train_class_counts.min()) - 1))
smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize RandomForest model and perform grid search
# NOTE(review): the CV folds inside the grid search are cut from already
# oversampled data, so CV scores remain optimistic; the held-out test set is
# the unbiased estimate.
print("Hyperparameter tuning with GridSearchCV for Random Forest...")
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [20, 30, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Use the best model found
best_rf_model = grid_search.best_estimator_

In [None]:
# Re-score the tuned forest with 5-fold cross-validation on the training data
print("\nCross-validation for the best Random Forest model...")
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
cv_results = cross_validate(
    best_rf_model,
    X_train,
    y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True,
)

# Report mean and spread of the validation-fold scores for each metric
print("\nCross-Validation Results:")
for metric in scoring:
    fold_scores = cv_results['test_' + metric]
    print(f"{metric.capitalize()}:")
    print(f" - Mean: {np.mean(fold_scores):.4f}")
    print(f" - Std Dev: {np.std(fold_scores):.4f}")


In [None]:
# Bar chart of the mean cross-validation score per metric, with std-dev error bars
mean_scores = [np.mean(cv_results['test_' + metric]) for metric in scoring]
score_spread = [np.std(cv_results['test_' + metric]) for metric in scoring]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(scoring, mean_scores, yerr=score_spread, capsize=5)
ax.set_title("Cross-Validation Performance by Metric")
ax.set_ylabel("Score")
plt.show()


In [None]:
# Fit the tuned forest on the whole training split, then predict the held-out rows
print("\nTraining the final model on the full training set...")
y_pred = best_rf_model.fit(X_train, y_train).predict(X_test)

# Overall accuracy first, then the per-category precision/recall/F1 table
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report_text)


In [None]:
# Confusion matrix of the test-set predictions, labelled with the category names
confusion_display = ConfusionMatrixDisplay.from_estimator(
    best_rf_model,
    X_test,
    y_test,
    display_labels=label_encoder.classes_,
    cmap="Blues",
)
confusion_display.ax_.set_title("Confusion Matrix of Test Set Predictions - Optimized Random Forest with SMOTE")
plt.show()


In [None]:
os.makedirs('models', exist_ok=True)

# Destination files for the fitted artifacts
model_path = 'models/random_forest_model.pkl'
label_encoder_path = 'models/label_encoder.pkl'
tfidf_vectorizers_path = 'models/tfidf_vectorizers.pkl'
sentence_bert_model_path = 'models/sentence_bert_model.pkl'

# Pickle the random forest, the label encoder and the TF-IDF vectorizers,
# each to its own file and in that order.
for artifact_path, artifact in (
    (model_path, best_rf_model),
    (label_encoder_path, label_encoder),
    (tfidf_vectorizers_path, tfidf_vectorizers),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

# Sentence-BERT is not pickled here (nothing is written to
# `sentence_bert_model_path`); it is reloaded later by its model name.
sentence_bert_model_name = 'all-MiniLM-L6-v2'
print(f"Model and artifacts saved successfully. For Sentence-BERT, use model name '{sentence_bert_model_name}' to load it.")


In [None]:
# Restore the classifier, label encoder and TF-IDF vectorizers from disk
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


model = _load_pickle(model_path)
label_encoder = _load_pickle(label_encoder_path)
tfidf_vectorizers = _load_pickle(tfidf_vectorizers_path)


In [None]:
# Sentence-BERT encoder, loaded by model name
sentence_model = SentenceTransformer(sentence_bert_model_name)

# Tag names counted in the HTML when building structural features
tags = [
    'div', 'p', 'a', 'button', 'input', 'form', 'img',
    'li', 'ul', 'ol',
    'h1', 'h2', 'h3',
    'section', 'article', 'header', 'footer', 'nav',
    'video', 'audio', 'canvas',
]

# Function to fetch and preprocess data from URL
def fetch_and_predict(url):
    """Fetch a page, rebuild the training feature layout and predict its category.

    Relies on the module-level ``model``, ``label_encoder``,
    ``tfidf_vectorizers``, ``sentence_model`` and ``tags``.

    Args:
        url: Address of the page to classify.

    Returns:
        ``{'goose_data': {...}, 'predicted_category': <label>}`` on success, or
        ``{'error': <message>}`` when the article or its raw HTML cannot be
        retrieved.

    Raises:
        ValueError: If the assembled feature vector does not have the number of
            columns the loaded model was trained on (mismatched artifacts).
    """
    # Column order must mirror the training matrix: TF-IDF blocks in this
    # field order, then HTML features, then title and description embeddings.
    tfidf_field_order = ('title', 'description', 'cleaned_text', 'keywords')

    # Context manager so the extractor's resources are released after the call.
    with Goose() as g:
        article = g.extract(url=url)

        if not article:
            return {"error": "Failed to retrieve article content"}

        # Extract Goose data
        goose_data = {
            'title': article.title,
            'description': article.meta_description,
            'keywords': article.meta_keywords,
            'cleaned_text': article.cleaned_text
        }

    # Fetch raw HTML for structural features; bounded wait so a stalled server
    # cannot hang the call.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return {"error": "Failed to retrieve HTML content"}
    if response.status_code != 200:
        return {"error": "Failed to retrieve HTML content"}
    html_content = response.text

    # HTML structural feature extraction
    soup = BeautifulSoup(html_content, 'html.parser')
    tag_counts = [len(soup.find_all(tag)) for tag in tags]
    text_length = len(soup.get_text())
    html_length = len(html_content)
    text_html_ratio = text_length / html_length if html_length else 0
    html_features = tag_counts + [text_html_ratio]

    # Vectorize every text field used in training. Leaving `keywords` out
    # shifts all later columns and produces silently wrong predictions.
    tfidf_blocks = [
        tfidf_vectorizers[field].transform([goose_data[field] or ''])
        for field in tfidf_field_order
    ]

    # Embed title and description using Sentence-BERT
    title_embedding = sentence_model.encode(goose_data['title'] or '', show_progress_bar=False).reshape(1, -1)
    description_embedding = sentence_model.encode(goose_data['description'] or '', show_progress_bar=False).reshape(1, -1)

    # Combine features
    combined_features = hstack(
        tfidf_blocks
        + [np.array(html_features).reshape(1, -1), title_embedding, description_embedding]
    ).tocsr()

    # A width mismatch means the artifacts do not belong together; zero-padding
    # would only hide the misalignment, so fail loudly instead.
    expected_features = model.n_features_in_
    if combined_features.shape[1] != expected_features:
        raise ValueError(
            f"Feature vector has {combined_features.shape[1]} columns but the model "
            f"expects {expected_features}; check that the saved vectorizers match the model."
        )

    # Predict category
    predicted_label = model.predict(combined_features)
    predicted_category = label_encoder.inverse_transform(predicted_label)[0]

    # Return Goose data and prediction
    return {
        'goose_data': goose_data,
        'predicted_category': predicted_category
    }


In [None]:
# Classify one sample page and show either the prediction or the failure reason
url = "http://www.scholarpedia.org/article/Bayesian_statistics"
result = fetch_and_predict(url)
if 'error' not in result:
    print("Goose Data:", result['goose_data'])
    print("Predicted Category:", result['predicted_category'])
else:
    print(result['error'])
