# Webcrawling a Gaming Forum

In this assignment, the goal is to crawl a specific forum, extracting and organizing its posts into a CSV file. The following key tasks are to be accomplished:

1. Crawling Across Pages:
    - We Implement a web crawler to traverse through multiple pages of the forum.
    - We extract all posts, including replies, ensuring comprehensive coverage of the forum content.

2. Language Identification:
    - We identify the language of each post and include this information in a designated column.

3. Metainformation Inclusion:
    - We gather and include essential metainformation such as the author and date of each post.

4. Thread Reconstruction:
    - In order to facilitate the recreation of threads from the CSV data, we add a column providing reply-to information, specifically the parent ID for each post. This facilitates the recreation of threads from the CSV data.

5. CSV Generation:
    - Finally, we build a CSV file that organizes the extracted forum data according to the specified format and include code for Thread Reconstruction.

In [1]:
from urllib.request import Request, urlopen
import bs4


def fetch_soup(url: str) -> bs4.BeautifulSoup:
    """
    Fetch and parse the HTML content of a web page using BeautifulSoup.

    :param url: The URL of the web page to be fetched and parsed.
    :type url: str
    :return: A BeautifulSoup object representing the parsed HTML content.
    :rtype: bs4.BeautifulSoup
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html_page = urlopen(req).read()
    return bs4.BeautifulSoup(html_page, 'html.parser')

In [2]:
head_url = "https://forum.warmane.com"
warmane_url = "https://forum.warmane.com/forumdisplay.php?f=20"

In [3]:
import re


def extract_int(string: str) -> int|None:
    """
    Get the first integer from a string.
    :param string: the string
    :return: the integer if found, else None
    """
    number = re.findall("\d+", string)[0]
    if number is None:
        return None
    else:
        return int(number)

In [4]:
import datetime


class Post:
    """
    Represents a forum post, including contents, meta-infromation and relation to other posts in the thread.
    """
    
    def __init__(self, post_id: int, thread_id: int, author: str, contents: str, 
                 date: datetime.date, reply_to: int=None):
        """
        :param post_id: (int) The unique identifier for the post.
        :param thread_id : (int) The identifier of the thread to which the post belongs.
        :param author : (str) The author of the post.
        :param contents : (str) The content of the post.
        :param date : (date) The date and time when the post was created.
        :param reply_to : (int, optional) The identifier of the post being replied to, 
        None if first post in the thread.
        """
        self.id = post_id
        self.thread_id = thread_id
        self.author = author
        self.contents = contents
        self.date = date
        self.reply_to = reply_to
        
    def __eq__(self, other):
        return self.id == other.id
    
    def __hash__(self):
        return self.id
            
    def __str__(self) -> str:
        return f"Post {self.id} in thread {self.thread_id} by {self.author}, contents: '{self.contents[:50]}'"
    

class Thread:
    """
    Represents a forum thread. Contains meta-information as well as the thread's posts.
    """
    
    def __init__(self, thread_id, title, author):
        """
        Create a new Thread container.
        :param thread_id: The unique identifier for the thread.
        :param title: The title of the thread.
        :param author: The author of the thread.
        """
        self.id = thread_id
        self.title = title
        self.author = author
        self.posts = []
        
    def add_post(self, post: Post) -> None:
        """
        Add a post to the Thread.
        :param post: the post to be added
        """
        self.posts.append(post)
        
    def __str__(self) -> str:
        return f"Thread {self.id} by {self.author}, titled: '{self.title}'"

In [5]:
import dateparser


def parse_warmane_thread(head_url: str, thread: bs4.Tag) -> Thread:
    """
    Parse the HTML content of a Warmane forum thread and return a Thread object.

    :param head_url: The base URL of the forum.
    :type head_url: str
    :param thread: The BeautifulSoup Tag representing the thread to be parsed.
    :type thread: bs4.Tag
    :return: A Thread object representing the parsed thread.
    :rtype: Thread
    """
    title_tag = thread.find("a", {"class": "title"})
    title = title_tag.get_text().strip()
    thread_id = extract_int(title_tag.get("id"))
    author = thread.find("dl", {"class": "threadauthor td"}).get_text().strip()
    
    thread = Thread(thread_id, title, author)
    
    posts = parse_warmane_posts(head_url, thread_id)
    for post in posts:
        thread.add_post(post)
    #print(f' Thread {thread.id} Added {len(thread.posts)} posts, {len(set(thread.posts))} unique')

    return thread


def parse_warmane_posts(head_url: str, thread_id: int) -> list[Post]:
    """
    Parse the HTML content of Warmane forum posts within a thread and return a list of Post objects.

    :param head_url: The base URL of the forum.
    :type head_url: str 
    :param thread_id: The identifier of the thread.
    :type thread_id: int
    :return: A list of Post objects representing the parsed posts.
    :rtype: list[Post]
    """
    posts = []
    
    url = get_thread_url(head_url, thread_id)
    post_soup = fetch_soup(url)
    
    if post_soup is None:
        print("Error fetching " + url)

    post_tags = post_soup.find_all("ol", {"id": "posts"})[0].find_all("li", recursive=False)

    last_post_id = None
    
    #print(f"Found {len(post_tags)} post tags")
    for post_tag in post_tags:
        try:
            post_id = extract_int(post_tag.get("id"))
            post_author = post_tag.find(class_="userinfo").find("strong").get_text().strip()
            content = post_tag.find(class_="content").find("blockquote").get_text().strip()

            date_text = post_tag.find(class_="date").find("a").get_text().strip()
            date     = dateparser.parse(date_text)

            post = Post(post_id, thread_id, post_author, content, date, last_post_id)
            last_post_id = post_id
            posts.append(post)
        except Exception as e:
            print("ERROR: Failed to get information on post ", url)
    
    return posts
    

def get_thread_url(head_url: str, thread_id: str) -> str:
    """
    Generate the URL for a specific Warmane forum thread.

    :param head_url: The base URL of the forum.
    :type head_url: str
    :param thread_id: The identifier of the thread.
    :type thread_id: str
    :return: The URL of the specified thread.
    :rtype: str
    """
    return f"{head_url}/showthread.php?t={thread_id}"

In [6]:
from tqdm import tqdm


threads = []

for page in range(1, 9):
    url = warmane_url + f"&page={page}"
    soup = fetch_soup(url)
    
    print(f"Processing page {page} of 8...")
    thread_tags = soup.find_all("li", {"class": "threadbit"})
    for thread_tag in tqdm(thread_tags):
        thread = parse_warmane_thread(head_url, thread_tag)
        threads.append(thread)

Processing page 1 of 8...


100%|##################################################################################| 20/20 [00:04<00:00,  4.55it/s]


Processing page 2 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.31it/s]


Processing page 3 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.31it/s]


Processing page 4 of 8...


100%|##################################################################################| 20/20 [00:04<00:00,  4.97it/s]


Processing page 5 of 8...


100%|##################################################################################| 20/20 [00:04<00:00,  4.86it/s]


Processing page 6 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.21it/s]


Processing page 7 of 8...


 55%|#############################################1                                    | 11/20 [00:02<00:02,  4.48it/s]

ERROR: Failed to get information on post  https://forum.warmane.com/showthread.php?t=272585


100%|##################################################################################| 20/20 [00:04<00:00,  4.85it/s]


Processing page 8 of 8...


 50%|##########################################                                          | 4/8 [00:00<00:00,  5.90it/s]

ERROR: Failed to get information on post  https://forum.warmane.com/showthread.php?t=278731


100%|####################################################################################| 8/8 [00:01<00:00,  5.37it/s]


In [7]:
import itertools

# flatten nested lists 
posts = set(itertools.chain.from_iterable([thread.posts for thread in threads]))
len(posts)

413

In [8]:
import pandas as pd

df = pd.DataFrame.from_records([post.__dict__ for post in posts], index="id")
df

Unnamed: 0_level_0,thread_id,author,contents,date,reply_to
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2926596,384475,Ripsin,"Kalhspera paides,\n\r\nEimai arketo kairo ston...",2018-05-22,
2473988,300013,v4gflo,geia sas.psaxnw ellhniko guild ston Deathwing ...,2015-06-17,
2420747,290921,AlexPan,"Καλησπέρα παιδιά, το πρόβλημα είναι το εξής. \...",2015-03-24,
2981903,399822,xAchillesGate4x,Καλησπέρα παίδες. Ψάχνω Ελληνικό active raidin...,2019-03-03,
2879517,371804,Csdas,Opoios gnwrizei kati as mou kanei /w Dremoria ...,2017-11-29,
...,...,...,...,...,...
2877428,353812,Shiverbro,kalos private aksizei na ksekiniseis paidia?,2017-11-21,2875915.0
3069941,423611,crystallenia898,Ε μεις αυξανόμαστε και θα αρχίσουμε να στήνουμ...,2020-07-26,3068345.0
2801654,350071,Draculation,Bump! ICC25 6/12,2017-05-07,2795443.0
2873339,370241,Ripsin,Kalhspera tha ithela na rwthsw an kapoios gnwr...,2017-11-07,


In [9]:
df.reply_to = df.reply_to.fillna(-1).astype(int)

In [10]:
df.date

id
2926596   2018-05-22
2473988   2015-06-17
2420747   2015-03-24
2981903   2019-03-03
2879517   2017-11-29
             ...    
2877428   2017-11-21
3069941   2020-07-26
2801654   2017-05-07
2873339   2017-11-07
2410495   2015-03-13
Name: date, Length: 413, dtype: datetime64[ns]

In [11]:
import langdetect

# Code adapted from John Pavlopoulos
# docstring from ChatGPT
def detect_language(contents: str) -> str | None:
    """
    Detect the language of the given text content, with a focus on identifying Greeklish.

    :param contents: The text content to analyze for language detection.
    :type contents: str
    :return: A two-letter language code indicating the detected language (e.g., "el" for Greek, "en" for English),
    or None if detection fails.
    :rtype: str or None

    This function attempts to detect if the text contains Greeklish (a mix of Greek and Latin characters). 
    If Greeklish is detected, the function returns "el" (indicating Greek).
    Otherwise, it uses the langdetect library to determine the language. 
    If langdetect encounters an exception during detection, the function returns None.

    Example:

    language = detect_language("Hello, πώς είσαι;")
    print(language)
    # Output: "el" (indicating Greek)
    """
    # Creating a regular expression pattern to match Latin characters (A-Z, a-z)
    greeklish_pattern = re.compile(r'[A-Za-z]+', re.UNICODE)
    # Searching for the pattern in the text
    match = greeklish_pattern.search(contents)
    # If a match is found, return True (indicating Greeklish is present)
    if match is not None:
        lang = "el"
    # else detect using library
    else:
        try:
            lang = langdetect.detect(contents)
        except langdetect.LangDetectException:
            lang = None
            
    return lang

df["language"] = df.contents.apply(detect_language)
df

Unnamed: 0_level_0,thread_id,author,contents,date,reply_to,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2926596,384475,Ripsin,"Kalhspera paides,\n\r\nEimai arketo kairo ston...",2018-05-22,-1,el
2473988,300013,v4gflo,geia sas.psaxnw ellhniko guild ston Deathwing ...,2015-06-17,-1,el
2420747,290921,AlexPan,"Καλησπέρα παιδιά, το πρόβλημα είναι το εξής. \...",2015-03-24,-1,el
2981903,399822,xAchillesGate4x,Καλησπέρα παίδες. Ψάχνω Ελληνικό active raidin...,2019-03-03,-1,el
2879517,371804,Csdas,Opoios gnwrizei kati as mou kanei /w Dremoria ...,2017-11-29,-1,el
...,...,...,...,...,...,...
2877428,353812,Shiverbro,kalos private aksizei na ksekiniseis paidia?,2017-11-21,2875915,el
3069941,423611,crystallenia898,Ε μεις αυξανόμαστε και θα αρχίσουμε να στήνουμ...,2020-07-26,3068345,el
2801654,350071,Draculation,Bump! ICC25 6/12,2017-05-07,2795443,el
2873339,370241,Ripsin,Kalhspera tha ithela na rwthsw an kapoios gnwr...,2017-11-07,-1,el


In [12]:
df[df.language != "el"]

Unnamed: 0_level_0,thread_id,author,contents,date,reply_to,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3082464,427259,malakas17,,2020-10-20,3081822,
3113236,427259,malakas17,,2021-05-12,3113009,
3099161,431660,malakas17,,2021-02-10,3096432,
3113819,427259,malakas17,,2021-05-16,3113236,
3099593,427259,boonick,,2021-02-14,3093400,
3081820,427259,malakas17,,2020-10-16,3080427,
3081822,427259,malakas17,,2020-10-16,3081820,


In [13]:
df.to_csv("warmane_greek.csv")

In [14]:
def reconstruct_thread(post_df: pd.DataFrame, thread_id: int) -> pd.DataFrame:
    """
    Reconstruct a thread from a DataFrame containing post data.

    :param post_df: The DataFrame containing post data.
    :type post_df: pd.DataFrame
    :param thread_id: The identifier of the thread to be reconstructed.
    :type thread_id: int
    :return: A DataFrame containing posts belonging to the specified thread, sorted by post ID.
    :rtype: pd.DataFrame
    """
    thread_posts = post_df[post_df.thread_id == thread_id]
    return thread_posts.reset_index().sort_values("id")

post_df = pd.read_csv("warmane_greek.csv")
reconstruct_thread(pd.read_csv("warmane_greek.csv"), 447248)

Unnamed: 0,index,id,thread_id,author,contents,date,reply_to,language
3,395,3151797,447248,pipernan,LORDAERON x1 / Horde\n\r\nΓεια χαρα!\n\r\nΨαχν...,2022-05-19 00:00:00.000000,-1,el
0,155,3199709,447248,LockandRoll18,geia sou eimai ston lordaeron. character: Pat...,2023-10-14 11:31:43.774107,3151797,el
1,176,3199795,447248,JMarou,Geia sas alania ego onyxia einai akribos oti e...,2023-10-14 11:31:43.776106,3199709,el
2,222,3200067,447248,LockandRoll18,Originally Posted by JMarou\n\nGeia sas alania...,2023-10-21 11:31:43.778107,3199795,el
