# Web Scraping

In [None]:
import csv
import time
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

# Function to check the robots.txt file and get crawl delay (if available)
def get_crawl_delay(url):
    """Return the ``Crawl-delay`` value advertised by the site's robots.txt.

    The file is fetched from ``/robots.txt`` resolved against *url*.

    Args:
        url: Any URL on the target site.

    Returns:
        The first parseable ``Crawl-delay`` value as a float, or ``None``
        when the response status is not 200 or no valid directive exists.
        User-agent groups are not taken into account.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    robots_url = urljoin(url, "/robots.txt")
    # Without a timeout a stalled server would block the crawl forever.
    response = requests.get(robots_url, timeout=10)
    if response.status_code != 200:
        return None

    for line in response.text.splitlines():
        # Ignore trailing comments and surrounding whitespace.
        directive = line.split("#", 1)[0].strip()
        name, separator, value = directive.partition(":")
        # robots.txt field names are matched case-insensitively.
        if not separator or name.strip().lower() != "crawl-delay":
            continue
        try:
            return float(value.strip())
        except ValueError:
            # Malformed value: keep looking instead of aborting the crawl.
            continue
    return None

# Function to scrape publication details from a given URL with delay
def scrape_publications(url, crawl_delay):
    """Scrape one listing page and return the publications found on it.

    Args:
        url: Address of the listing page to download.
        crawl_delay: Seconds to sleep after parsing the page; a falsy value
            (``None`` or ``0``) disables the pause.

    Returns:
        A list of ``(title, author, publication_year, publication_url,
        author_url)`` tuples, one per ``li.list-result-item`` element.
        Missing elements are replaced by ``"... Not Found"`` placeholder
        strings. Links are returned exactly as they appear in the ``href``
        attribute.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # requests never times out by default; bound the wait so one stalled
    # response cannot hang the whole crawl.
    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    publications = []

    for publication in soup.select('li.list-result-item'):
        title_element = publication.select_one('h3.title > a')
        if title_element:
            title = title_element.get_text(strip=True)
            publication_url = title_element['href']
        else:
            title = "Title Not Found"
            publication_url = "Publication URL Not Found"

        # Only the first author link of the entry is recorded.
        author_element = publication.select_one('a.link.person')
        if author_element:
            author = author_element.get_text(strip=True)
            author_url = author_element['href']
        else:
            author = "Author Not Found"
            author_url = "Author URL Not Found"

        publication_year_element = publication.select_one('span.date')
        if publication_year_element:
            publication_year = publication_year_element.get_text(strip=True)
        else:
            publication_year = "Publication Year Not Found"

        publications.append((title, author, publication_year, publication_url, author_url))

    # Introduce a delay to be polite to the website
    if crawl_delay:
        time.sleep(crawl_delay)

    return publications

# Function to save the results to a CSV file
def save_to_csv(publications, csv_file):
    """Write the scraped publications to *csv_file* as UTF-8 CSV.

    Args:
        publications: Iterable of 5-item records in the order title, author,
            publication year, publication URL, author URL.
        csv_file: Path of the file to create or overwrite.
    """
    column_names = ['Title', 'Author', 'Publication_Year', 'Publication_URL', 'Author_URL']
    with open(csv_file, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(column_names)
        # Unpacking keeps the "exactly five fields per record" check.
        out.writerows(
            [title, author, year, pub_url, auth_url]
            for title, author, year, pub_url, auth_url in publications
        )

# Main function to initiate crawling and saving data to CSV
def main():
    """Crawl every listing page, print summary statistics and save a CSV.

    Pages are requested as ``?page=0``, ``?page=1``, ... until a page yields
    no results, or until the first record of a page consists only of
    "Not Found" placeholders.
    """
    base_url = "https://pureportal.coventry.ac.uk"
    start_url = urljoin(base_url, "/en/organisations/centre-global-learning/publications/")
    csv_file = 'publication_titles_and_hyperlinks.csv'

    # Honour the crawl delay from robots.txt when one is advertised
    crawl_delay = get_crawl_delay(base_url)
    if crawl_delay:
        print("Crawl delay:", crawl_delay, "seconds")

    all_publications = []
    placeholder_record = ("Title Not Found", "Author Not Found", "Publication Year Not Found")

    # The site numbers its pages from zero
    page_index = 0
    while True:
        page_results = scrape_publications(start_url + f"?page={page_index}", crawl_delay)

        if not page_results:
            # An empty page marks the end of the listing
            print('End Of Pages!!!')
            break

        if tuple(page_results[0][:3]) == placeholder_record:
            # A page whose first entry has no usable fields also ends the crawl
            break

        all_publications.extend(page_results)
        page_index += 1

    print("Total number of publications:", len(all_publications))

    # Count publications per author name
    publications_per_author = {}
    for _title, author, _year, _publication_url, _author_url in all_publications:
        publications_per_author[author] = publications_per_author.get(author, 0) + 1
    max_publications = max(publications_per_author.values(), default=0)

    print("Total number of distinct authors:", len(publications_per_author))
    print("Maximum publications per author:", max_publications)

    # Save the results to a CSV file
    save_to_csv(all_publications, csv_file)
    print("Results saved to", csv_file)


if __name__ == "__main__":
    main()

End Of Pages!!!
Total number of publications: 285
Total number of distinct authors: 51
Maximum publications per author: 47
Results saved to publication_titles_and_hyperlinks.csv


# Pre-processing

In [None]:
from bs4 import BeautifulSoup
import requests
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Fetch the NLTK resources used by this cell (tokenizer models, stop-word
# list and the WordNet corpus); the same three downloads, in the same order.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Function to apply pre-processing tasks to text
def preprocess_text(text):
    """Normalise *text* for indexing and return it as a space-joined string.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only tokens for
    which ``str.isalnum`` is true, drop English stop words, then lemmatize
    every remaining token with ``WordNetLemmatizer``.
    """
    tokens = word_tokenize(text.lower())

    english_stop_words = set(stopwords.words('english'))
    # One pass: discard punctuation-bearing tokens and stop words together
    content_tokens = [
        token for token in tokens
        if token.isalnum() and token not in english_stop_words
    ]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in content_tokens)


url = "https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)

soup = BeautifulSoup(page.text, "html.parser")

# Create a list to store the data
data = []

# Walk the page one result entry at a time so that title, author and year
# always come from the same publication. Zipping three independent page-wide
# lists pairs titles with the wrong author as soon as one entry has no author
# link or has several of them.
for item in soup.select("li.list-result-item"):
    title = item.find("h3", class_="title")
    if title is None or title.a is None:
        # Not a publication entry (no linked title); nothing to index.
        continue
    author = item.find("a", class_="link person")  # first linked author, if any
    pub_year = item.find("span", class_="date")

    # Extract the required information, falling back to empty strings when
    # an element is missing so the row stays aligned with the header.
    title_span = title.span
    title_text = (title_span if title_span is not None else title).get_text().strip()
    author_text = author.get_text() if author is not None else ""
    pub_year_text = pub_year.text if pub_year is not None else ""
    author_link_text = author.get('href') if author is not None else ""
    publication_link = title.a.get('href')

    # Clean and preprocess the title and author text
    cleaned_title = re.sub(r'[^\w\s]', '', title_text)  # Remove special characters
    cleaned_author = re.sub(r'[^\w\s]', '', author_text)  # Remove special characters
    processed_title = preprocess_text(cleaned_title)
    processed_author = preprocess_text(cleaned_author)

    # Append the data to the list; column order matches the header below
    data.append([title_text, author_text, pub_year_text, author_link_text, publication_link,
                 processed_title, processed_author])

# Define the CSV file name
csv_file = "publication_data_preprocessed.csv"

# Write the data to the CSV file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Title", "Author", "Publication Years", "Authors Profile", "Publication Link",
                     "Processed Title", "Processed Author"])
    # Write the data rows
    writer.writerows(data)

print("Preprocessed data saved to CSV file:", csv_file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Preprocessed data saved to CSV file: publication_data_preprocessed.csv


# Inverted Indexing

In [None]:
import csv
from collections import defaultdict

# Function to load preprocessed data from CSV file
def load_preprocessed_data(csv_file):
    """Load ``(title, processed_title)`` pairs from the preprocessed CSV.

    Columns are located through the header names ``"Title"`` and
    ``"Processed Title"``, so the seven-column file written by the
    pre-processing step loads correctly. When a name is missing from the
    header, the first column is used as the title and the second as the
    processed title (the two-column layout).

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A list of ``(title, processed_title)`` tuples in file order. An empty
        file yields an empty list; blank rows are skipped.

    Raises:
        ValueError: If a non-blank row is too short to contain both columns.
    """
    data = []
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            return data

        title_col = header.index("Title") if "Title" in header else 0
        processed_col = header.index("Processed Title") if "Processed Title" in header else 1

        for row in reader:
            if not row:
                continue
            try:
                data.append((row[title_col], row[processed_col]))
            except IndexError:
                raise ValueError(
                    f"{csv_file}: line {reader.line_num} has only {len(row)} column(s)"
                ) from None
    return data

# Function to build the inverted index
def build_inverted_index(docs):
    """Build a term -> document-id posting list from processed titles.

    Args:
        docs: Sequence of ``(title, processed_title)`` pairs; the processed
            title is a whitespace-separated string of terms.

    Returns:
        A ``defaultdict(list)`` mapping each term to the 1-based positions of
        the documents containing it. Every document appears at most once per
        term, so the length of a posting list is the document frequency.
    """
    index = defaultdict(list)
    for doc_id, (_, processed_title) in enumerate(docs, start=1):
        # dict.fromkeys removes repeated terms while keeping first-seen order;
        # a term used twice in one title must not list the document twice.
        for term in dict.fromkeys(processed_title.split()):
            index[term].append(doc_id)
    return index

# Read the preprocessed publications and index their processed titles
preprocessed_csv_file = "publication_data_preprocessed.csv"
docs = load_preprocessed_data(preprocessed_csv_file)
index = build_inverted_index(docs)

# Persist the inverted index: one row per term, with the posting list stored
# in its Python list representation (e.g. "[1, 18]")
indexed_csv_file = "manual_inverted_index.csv"
with open(indexed_csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Term", "Document IDs"])
    for term, doc_ids in index.items():
        writer.writerow([term, doc_ids])

# Show the first ten entries as a quick sanity check
print("Inverted Index:")
preview = list(index.items())[:10]
for term, doc_ids in preview:
    print(f"{term}: {doc_ids}")

print("Indexed data saved to CSV file:", indexed_csv_file)


Inverted Index:
revisit: [1]
role: [1, 18]
gender: [1, 47]
moderating: [1]
effect: [1]
emotional: [1, 40]
intelligence: [1, 40]
leadership: [1, 20]
effectiveness: [1]
study: [1, 39, 43]
Indexed data saved to CSV file: manual_inverted_index.csv


# Incremental Indexing

In [None]:
import csv
from collections import defaultdict

# Function to load preprocessed data from CSV file
def load_preprocessed_data(csv_file):
    """Load ``(title, processed_title)`` pairs from the preprocessed CSV.

    Columns are located through the header names ``"Title"`` and
    ``"Processed Title"``, so the seven-column file written by the
    pre-processing step loads correctly. When a name is missing from the
    header, the first column is used as the title and the second as the
    processed title (the two-column layout).

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A list of ``(title, processed_title)`` tuples in file order. An empty
        file yields an empty list; blank rows are skipped.

    Raises:
        ValueError: If a non-blank row is too short to contain both columns.
    """
    data = []
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            return data

        title_col = header.index("Title") if "Title" in header else 0
        processed_col = header.index("Processed Title") if "Processed Title" in header else 1

        for row in reader:
            if not row:
                continue
            try:
                data.append((row[title_col], row[processed_col]))
            except IndexError:
                raise ValueError(
                    f"{csv_file}: line {reader.line_num} has only {len(row)} column(s)"
                ) from None
    return data

# Function to build the inverted index
def build_inverted_index(docs):
    """Build a term -> document-id posting list from processed titles.

    Args:
        docs: Sequence of ``(title, processed_title)`` pairs; the processed
            title is a whitespace-separated string of terms.

    Returns:
        A ``defaultdict(list)`` mapping each term to the 1-based positions of
        the documents containing it. Every document appears at most once per
        term, so the length of a posting list is the document frequency.
    """
    index = defaultdict(list)
    for doc_id, (_, processed_title) in enumerate(docs, start=1):
        # dict.fromkeys removes repeated terms while keeping first-seen order;
        # a term used twice in one title must not list the document twice.
        for term in dict.fromkeys(processed_title.split()):
            index[term].append(doc_id)
    return index

# Load preprocessed data from CSV file (only its length is needed here, to
# continue the document numbering)
preprocessed_csv_file = "publication_data_preprocessed.csv"
docs = load_preprocessed_data(preprocessed_csv_file)

# Define the CSV file name for the indexed data (inverted index)
indexed_csv_file = "manual_inverted_index.csv"

# Start from an empty index and fill it from the saved file when one exists.
# Only the open() call is guarded, so parsing errors are not mistaken for a
# missing file.
existing_index = defaultdict(list)
try:
    index_file = open(indexed_csv_file, mode='r', newline='', encoding='utf-8')
except FileNotFoundError:
    pass
else:
    with index_file:
        reader = csv.reader(index_file)
        next(reader, None)  # Skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            term, doc_ids = row
            # Accepts both "[1, 18]" and "1,18" encodings of the posting list
            existing_index[term] = [int(doc_id) for doc_id in doc_ids.strip('[]').split(',') if doc_id.strip()]

# Update the index with new data from the crawler component
new_data = [
    ("New Publication Title 1", "new publication title 1 processed"),
    ("New Publication Title 2", "new publication title 2 processed"),
    # Add new data in the same format as the preprocessed data (title, processed_title)
    # Replace the above dummy data with the actual data received from the crawler
]

# NOTE(review): the new documents are numbered after the existing ones but
# are not appended to the preprocessed CSV, so any code resolving ids against
# that file will not find them - confirm whether that is intended.
for doc_id, (_, processed_title) in enumerate(new_data, start=len(docs) + 1):
    for term in dict.fromkeys(processed_title.split()):
        postings = existing_index[term]
        # Re-running this cell must not list the same document twice.
        if doc_id not in postings:
            postings.append(doc_id)

# Save the updated index back to the CSV file
with open(indexed_csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Term", "Document IDs"])
    # Write the data rows
    for term, doc_ids in existing_index.items():
        writer.writerow([term, ','.join(map(str, doc_ids))])

# Print some part of the constructed inverted index
print("Inverted Index:")
for term, doc_ids in list(existing_index.items())[:10]:
    print(f"{term}: {doc_ids}")

print("Indexed data saved to CSV file:", indexed_csv_file)


Inverted Index:
revisit: [1]
role: [1, 18]
gender: [1, 47]
moderating: [1]
effect: [1]
emotional: [1, 40]
intelligence: [1, 40]
leadership: [1, 20]
effectiveness: [1]
study: [1, 39, 43]
Indexed data saved to CSV file: manual_inverted_index.csv


# Query

In [None]:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from collections import defaultdict
import math

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
# WordNetLemmatizer (used by preprocess_text below) needs the WordNet corpus;
# without it this cell fails when run in a fresh environment.
nltk.download('wordnet')

# Function to apply pre-processing tasks to text
def preprocess_text(text):
    """Normalise a query string and return it as space-joined terms.

    Steps: lowercase, split into ``\\w+`` runs with ``RegexpTokenizer`` (which
    drops punctuation), remove English stop words, then lemmatize each
    remaining token with ``WordNetLemmatizer``.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())

    english_stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in tokens if token not in english_stop_words]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in content_tokens)

# Function to load the indexed data from CSV file
def load_indexed_data(csv_file):
    """Read the inverted index back from *csv_file*.

    Each data row holds a term and its posting list, written either as
    ``[1, 18]`` or as ``1,18``; surrounding brackets are stripped and empty
    pieces ignored.

    Returns:
        A ``defaultdict(list)`` mapping each term to a list of int ids.
    """
    index = defaultdict(list)
    with open(csv_file, mode='r', encoding='utf-8') as file:
        rows = csv.reader(file)
        next(rows)  # discard the header row
        for term, raw_ids in rows:
            pieces = raw_ids.strip('[]').split(',')
            index[term] = [int(piece) for piece in pieces if piece.strip()]
    return index

# Function to perform ranked retrieval using vector space model
def ranked_retrieval(query, index, docs, top_n=10):
    """Rank documents against *query* with tf-idf weighted term matching.

    The query is normalised with ``preprocess_text``. For every distinct
    query term found in the index, each listed document receives
    ``(tf_doc * idf) * (tf_query * idf)`` where ``idf = log(N / df)``,
    ``N = len(docs)`` and ``df`` is the number of distinct, resolvable
    documents in the term's posting list. Scores are not length-normalised.

    Args:
        query: Raw query text.
        index: Mapping of term -> list of 1-based document ids.
        docs: Sequence of ``(title, processed_title)`` pairs; document id
            ``i`` refers to ``docs[i - 1]``.
        top_n: Maximum number of results to return.

    Returns:
        Up to *top_n* ``(doc_id, score)`` pairs sorted by descending score.
        Empty when no query term occurs in the index.
    """
    query_terms = preprocess_text(query).split()
    total_docs = len(docs)  # Total number of documents

    # Term frequency in the query. Iterating over this dict later visits each
    # distinct term once, so a repeated query word is weighted by its count
    # only instead of also being scored once per occurrence.
    tf_query = {}
    for term in query_terms:
        tf_query[term] = tf_query.get(term, 0) + 1

    relevance_scores = defaultdict(float)
    for term, query_tf in tf_query.items():
        if term not in index:
            continue

        # Keep only ids that resolve to a document, once each and in order:
        # ids beyond len(docs) would raise IndexError, ids below 1 would
        # silently wrap round to the wrong document.
        postings = [
            doc_id for doc_id in dict.fromkeys(index[term])
            if 1 <= doc_id <= total_docs
        ]
        if not postings:
            continue  # also avoids dividing by a zero document frequency

        idf = math.log(total_docs / len(postings))
        query_weight = query_tf * idf
        for doc_id in postings:
            tf_doc = docs[doc_id - 1][1].split().count(term)  # Term frequency in the document
            relevance_scores[doc_id] += (tf_doc * idf) * query_weight

    # Sort the documents based on relevance scores in descending order
    ranked_docs = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)

    # Get the top N relevant documents
    return ranked_docs[:top_n]

# Load the indexed data (inverted index) from the CSV file
indexed_csv_file = "manual_inverted_index.csv"
index = load_indexed_data(indexed_csv_file)

# Load preprocessed data from CSV file. The file written by the
# pre-processing step has seven columns, so pick the two needed here by
# header name; fall back to the first two columns for a two-column file.
preprocessed_csv_file = "publication_data_preprocessed.csv"
with open(preprocessed_csv_file, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    header = next(reader)  # Header row, used to locate the columns
    title_col = header.index("Title") if "Title" in header else 0
    processed_col = header.index("Processed Title") if "Processed Title" in header else 1
    # Blank rows are skipped; docs[i - 1] must correspond to document id i
    docs = [(row[title_col], row[processed_col]) for row in reader if row]

# Interactive prompt: keep answering queries until the user types 'exit'
while True:
    user_query = input("Enter your query (or 'exit' to quit): ")
    if user_query.lower() == 'exit':
        break

    # Rank the publications and keep the five best matches
    top_publications = ranked_retrieval(user_query, index, docs, top_n=5)

    if top_publications:
        print("Top 5 Relevant Publications:")
        for doc_id, relevance_score in top_publications:
            # Document ids are 1-based positions in docs
            title = docs[doc_id - 1][0]
            print(f"Publication Title: {title}")
            print(f"Relevance Score: {relevance_score}")
            print("--------------------------------------------")
        continue

    print("No matching publications found.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter your query (or 'exit' to quit): nigeian
No matching publications found.
Enter your query (or 'exit' to quit): sensitivity
No matching publications found.
Enter your query (or 'exit' to quit): becoming nigerian
Top 5 Relevant Publications:
Publication Title: Becoming Nigerian
Relevance Score: 25.665085570920002
--------------------------------------------
Publication Title: What does it Mean to be Educated in Nigerian Student Experience?
Relevance Score: 10.361161575920939
--------------------------------------------
Enter your query (or 'exit' to quit): mechanism
Top 5 Relevant Publications:
Publication Title: 'Nordic added value’: a floating signifier and a mechanism for Nordic higher education regionalism
Relevance Score: 15.303923994999064
--------------------------------------------
Enter your query (or 'exit' to quit): cultural 
Top 5 Relevant Publications:
Publication Title: Interactive Media for Cultural Heritage book: "From Headsets to Mindsets: A Taxonomy for Human-centr

In [None]:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from collections import defaultdict
import math

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
# WordNetLemmatizer (used by preprocess_text below) needs the WordNet corpus;
# without it this cell fails when run in a fresh environment.
nltk.download('wordnet')

# Function to apply pre-processing tasks to text
def preprocess_text(text):
    """Normalise a query string and return it as space-joined terms.

    Steps: lowercase, split into ``\\w+`` runs with ``RegexpTokenizer`` (which
    drops punctuation), remove English stop words, then lemmatize each
    remaining token with ``WordNetLemmatizer``.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())

    english_stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in tokens if token not in english_stop_words]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in content_tokens)

# Function to load the indexed data from CSV file
def load_indexed_data(csv_file):
    """Read the inverted index back from *csv_file*.

    Each data row holds a term and its posting list, written either as
    ``[1, 18]`` or as ``1,18``; surrounding brackets are stripped and empty
    pieces ignored.

    Returns:
        A ``defaultdict(list)`` mapping each term to a list of int ids.
    """
    index = defaultdict(list)
    with open(csv_file, mode='r', encoding='utf-8') as file:
        rows = csv.reader(file)
        next(rows)  # discard the header row
        for term, raw_ids in rows:
            pieces = raw_ids.strip('[]').split(',')
            index[term] = [int(piece) for piece in pieces if piece.strip()]
    return index

# Function to perform ranked retrieval using vector space model
def ranked_retrieval(query, index, docs, top_n=10):
    """Rank documents against *query* with tf-idf weighted term matching.

    The query is normalised with ``preprocess_text``. For every distinct
    query term found in the index, each listed document receives
    ``(tf_doc * idf) * (tf_query * idf)`` where ``idf = log(N / df)``,
    ``N = len(docs)`` and ``df`` is the number of distinct, resolvable
    documents in the term's posting list. Scores are not length-normalised.

    Args:
        query: Raw query text.
        index: Mapping of term -> list of 1-based document ids.
        docs: Sequence of ``(title, processed_title)`` pairs; document id
            ``i`` refers to ``docs[i - 1]``.
        top_n: Maximum number of results to return.

    Returns:
        Up to *top_n* ``(doc_id, score)`` pairs sorted by descending score.
        Empty when no query term occurs in the index.
    """
    query_terms = preprocess_text(query).split()
    total_docs = len(docs)  # Total number of documents

    # Term frequency in the query. Iterating over this dict later visits each
    # distinct term once, so a repeated query word is weighted by its count
    # only instead of also being scored once per occurrence.
    tf_query = {}
    for term in query_terms:
        tf_query[term] = tf_query.get(term, 0) + 1

    relevance_scores = defaultdict(float)
    for term, query_tf in tf_query.items():
        if term not in index:
            continue

        # Keep only ids that resolve to a document, once each and in order:
        # ids beyond len(docs) would raise IndexError, ids below 1 would
        # silently wrap round to the wrong document.
        postings = [
            doc_id for doc_id in dict.fromkeys(index[term])
            if 1 <= doc_id <= total_docs
        ]
        if not postings:
            continue  # also avoids dividing by a zero document frequency

        idf = math.log(total_docs / len(postings))
        query_weight = query_tf * idf
        for doc_id in postings:
            tf_doc = docs[doc_id - 1][1].split().count(term)  # Term frequency in the document
            relevance_scores[doc_id] += (tf_doc * idf) * query_weight

    # Sort the documents based on relevance scores in descending order
    ranked_docs = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)

    # Get the top N relevant documents
    return ranked_docs[:top_n]

# Load the indexed data (inverted index) from the CSV file
indexed_csv_file = "manual_inverted_index.csv"
index = load_indexed_data(indexed_csv_file)

# Load preprocessed data from CSV file. The file written by the
# pre-processing step has seven columns, so pick the two needed here by
# header name; fall back to the first two columns for a two-column file.
preprocessed_csv_file = "publication_data_preprocessed.csv"
with open(preprocessed_csv_file, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    header = next(reader)  # Header row, used to locate the columns
    title_col = header.index("Title") if "Title" in header else 0
    processed_col = header.index("Processed Title") if "Processed Title" in header else 1
    # Blank rows are skipped; docs[i - 1] must correspond to document id i
    docs = [(row[title_col], row[processed_col]) for row in reader if row]

# Interactive prompt: keep answering queries until the user types 'exit'
while True:
    user_query = input("Enter your query (or 'exit' to quit): ")
    if user_query.lower() == 'exit':
        break

    # Rank the publications and keep the five best matches
    top_publications = ranked_retrieval(user_query, index, docs, top_n=5)

    if top_publications:
        print("Top 5 Relevant Publications:")
        for doc_id, relevance_score in top_publications:
            # Document ids are 1-based positions in docs
            title = docs[doc_id - 1][0]
            print(f"Publication Title: {title}")
            print(f"Relevance Score: {relevance_score}")
            print("--------------------------------------------")
        continue

    print("No matching publications found.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter your query (or 'exit' to quit): zoltán,
Top 5 Relevant Publications:
Publication Title: Obituary for Zoltán Dörnyei (1960–2022): a bibliometric mapping of his publications
Relevance Score: 15.303923994999064
--------------------------------------------
Enter your query (or 'exit' to quit): Zoltan,
No matching publications found.
Enter your query (or 'exit' to quit): mapping,
Top 5 Relevant Publications:
Publication Title: Obituary for Zoltán Dörnyei (1960–2022): a bibliometric mapping of his publications
Relevance Score: 10.361161575920939
--------------------------------------------
Publication Title: A bibliometric mapping of shadow education research: achievements, limitations, and the future
Relevance Score: 10.361161575920939
--------------------------------------------
Enter your query (or 'exit' to quit): mapping:
Top 5 Relevant Publications:
Publication Title: Obituary for Zoltán Dörnyei (1960–2022): a bibliometric mapping of his publications
Relevance Score: 10.361161575