# WEB MINING - Diversity and inclusion

## DATA COLLECTION

General imports

In [None]:
import networkx as nx
from collections import deque, defaultdict
import random
import string

Configuration de NLTK et de Wikipedia

In [None]:
import wikipedia 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Query the English-language edition of Wikipedia.
wikipedia.set_lang("en")

# Resources needed by word_tokenize and stopwords.words below.
# NLTK >= 3.9 makes word_tokenize load 'punkt_tab' instead of the pickled
# 'punkt' models, so both are requested to keep the notebook runnable on
# either side of that change.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

Préparer les stopwords et le stemmer

In [None]:
# Deduplicated English stopwords plus the possessive clitic "'s"
# (presumably because the tokenizer emits it as a separate token — TODO confirm).
stop_words = [*set(stopwords.words('english')), "'s"]

# Snowball stemmer configured for English; used by extract_tokens.
stem = nltk.stem.SnowballStemmer(language="english")

Extraction des tokens

In [None]:
def extract_tokens(text):
    """Turn raw text into a list of stemmed, lower-cased content tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens made
    up solely of punctuation characters and tokens listed in ``stop_words``
    are dropped; the survivors are stemmed with ``stem``.

    Args:
        text: The string to process.

    Returns:
        A list of stemmed tokens in their order of appearance (duplicates kept).
    """
    punctuation_chars = set(string.punctuation)
    # Set lookup instead of scanning the stop_words list for every token.
    ignored = set(stop_words)

    tokens = []
    for token in word_tokenize(text.lower()):
        # Checking every character (rather than `token in string.punctuation`,
        # which is a substring test) also removes multi-character punctuation
        # tokens such as "``", "''", "..." or "--".
        if all(char in punctuation_chars for char in token):
            continue
        if token in ignored:
            continue
        tokens.append(stem.stem(token))
    return tokens

Obtention des n tokens les plus fréquents

In [7]:
def get_top_tokens(content, n=20):
    """Return the ``n`` most frequent tokens of ``content`` as a set.

    Tokens come from ``extract_tokens``. Ties are broken by first appearance
    in the text, because the sort is stable over an insertion-ordered dict.

    Args:
        content: Text to analyse.
        n: Number of top-ranked tokens to keep (default 20).

    Returns:
        A set containing at most ``n`` tokens.
    """
    frequencies = {}
    for token in extract_tokens(content):
        frequencies[token] = frequencies.get(token, 0) + 1

    # Rank the tokens themselves by their count, most frequent first.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    return set(ranked[:n])

Vérification de pertinence via les tokens principaux

In [None]:
def is_relevant_based_on_top_tokens(top_tokens, linked_summary, threshold=5):
    """Decide whether a summary overlaps enough with a reference vocabulary.

    Args:
        top_tokens: Collection of reference tokens (membership is tested with ``in``).
        linked_summary: Text of the candidate page summary.
        threshold: Minimum number of matching token occurrences (default 5).

    Returns:
        True when at least ``threshold`` tokens of the summary belong to
        ``top_tokens``. Repeated occurrences of the same token each count.
    """
    overlap = sum(1 for token in extract_tokens(linked_summary) if token in top_tokens)
    return overlap >= threshold

Sauvegarde et chargement des données (au format JSON)

In [None]:
import os
import json

def load_data(file_path):
    """Read a JSON document from ``file_path``.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or an empty dict when the file does not exist.
    """
    # Missing file: behave as if an empty store had been saved.
    if not os.path.exists(file_path):
        return {}

    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def save_data(file_path, data):
    """Serialise ``data`` as pretty-printed UTF-8 JSON at ``file_path``.

    The document is first written to ``<file_path>.tmp`` and then moved over
    the destination with ``os.replace``. If serialisation fails or the run is
    interrupted mid-write, the previous file is left untouched instead of
    being truncated, so a later ``load_data`` still reads valid JSON.

    Args:
        file_path: Destination path of the JSON file.
        data: JSON-serialisable value to store.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialise.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, file_path)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, keep the old data.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

Sauvegarde des liens (au format GML)

In [None]:
def save_links_as_gml(link_storage, gml_file):
    """Export a source -> targets mapping as a directed graph in GML format.

    Args:
        link_storage: Mapping whose keys are source nodes and whose values are
            iterables of target nodes.
        gml_file: Path of the GML file to write.
    """
    # Flatten the adjacency mapping into (source, target) pairs.
    edges = [
        (source, target)
        for source, targets in link_storage.items()
        for target in targets
    ]
    graph = nx.DiGraph()
    graph.add_edges_from(edges)
    nx.write_gml(graph, gml_file)
    print(f"Graph saved to {gml_file}")

Main function - scraping and saving

In [None]:
def scrape_and_store_with_recovery(start_page_title, max_depth=5, content_file='content.json', link_file='links.json',
                                   visited_file='visited.json', queue_file='queue.json'):
    """Crawl Wikipedia breadth-first from ``start_page_title`` with resumable state.

    A linked page is followed only when ``is_relevant_based_on_top_tokens``
    accepts its summary against the 20 most frequent tokens of the start page
    (threshold 5). Progress is persisted to four JSON files so that calling the
    function again after an interruption continues the crawl:

    * ``content_file``: page title -> page content
    * ``link_file``: page title -> list of relevant linked page titles
    * ``visited_file``: list of page titles already processed
    * ``queue_file``: pending ``[title, depth]`` entries

    Recovery contract: the queue is saved before a page is popped, and the
    page is recorded as visited on disk only after all of its links have been
    examined. A page interrupted half-way is therefore still at the head of
    the saved queue and not yet marked visited, so a resumed run crawls it
    again instead of skipping it and losing its children.

    Edges are stored under the title reported by the fetched page (the same
    title that is queued), so a target node and the later source node for
    that page share one name.

    Args:
        start_page_title: Title of the page the crawl starts from.
        max_depth: Queue entries whose depth is >= this value are not processed.
        content_file: Path of the JSON file holding page contents.
        link_file: Path of the JSON file holding the link mapping.
        visited_file: Path of the JSON file holding visited titles.
        queue_file: Path of the JSON file holding the pending queue.

    Returns:
        None. Results are written to the files above.
    """
    content_storage = load_data(content_file)
    link_storage = load_data(link_file)
    visited_pages = set(load_data(visited_file))
    queue = deque(load_data(queue_file) if os.path.exists(queue_file) else [(start_page_title, 0)])

    # Reference vocabulary: every candidate page is compared against it.
    main_page = wikipedia.WikipediaPage(start_page_title)
    top_tokens = get_top_tokens(main_page.content, n=20)

    # Pages that cannot be resolved to a single article are skipped silently.
    skippable_errors = (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError)

    while queue:
        # Snapshot BEFORE popping: the page about to be processed stays at the
        # head of the saved queue until the next snapshot.
        save_data(queue_file, list(queue))
        current_page_title, current_depth = queue.popleft()

        if current_depth >= max_depth or current_page_title in visited_pages:
            continue

        try:
            current_page = wikipedia.WikipediaPage(current_page_title)
            # In-memory only for now; written to disk once the page is complete.
            visited_pages.add(current_page_title)
            content_storage[current_page_title] = current_page.content
            save_data(content_file, content_storage)

            relevant_titles = []
            for link in current_page.links:
                if link in visited_pages:
                    continue
                try:
                    linked_page = wikipedia.WikipediaPage(link)
                    linked_title = linked_page.title
                    # Same "skip visited" rule as above, applied to the title the
                    # fetched page reports; also avoids duplicate edges.
                    if linked_title in visited_pages or linked_title in relevant_titles:
                        continue
                    if is_relevant_based_on_top_tokens(top_tokens, linked_page.summary, threshold=5):
                        relevant_titles.append(linked_title)
                        queue.append((linked_title, current_depth + 1))
                except skippable_errors:
                    continue
        except skippable_errors:
            continue

        # Commit the finished page. Assigning (not appending) replaces whatever
        # an interrupted earlier attempt may have stored for this page.
        if relevant_titles:
            link_storage[current_page_title] = relevant_titles
        else:
            link_storage.pop(current_page_title, None)
        save_data(link_file, link_storage)
        # Written last: its presence on disk marks the page as fully processed.
        save_data(visited_file, list(visited_pages))

    save_data(queue_file, [])
    print("Scraping completed.")

Lancement du scraping et sauvegarde du graphe

In [None]:
# Crawl from the seed article; state files use the function's default paths.
scrape_and_store_with_recovery(start_page_title="Diversity (business)", max_depth=5)

# Reload the persisted link mapping and export it as a directed graph.
link_storage = load_data(file_path='links.json')
save_links_as_gml(link_storage=link_storage, gml_file='graph.gml')

## TEXT MINING

## LINKS ANALYSIS