## Regex classifier

In [1]:
from sklearn.base import BaseEstimator
import numpy as np
import re


#idea credit https://www.linkedin.com/pulse/regex-one-pattern-rule-them-all-find-bring-darkness-bind-carranza/
class RegexClassifier(BaseEstimator):
    """
    Language classifier based on script-detecting regular expressions.

    A text is labelled by the first pattern in ``language_regex_dict`` that
    matches anywhere in it. Dictionary order therefore acts as priority:
    Greek script ('el') is checked before Latin letters ('en').

    Note that 'en' only means "contains ASCII letters"; Greek written with
    Latin characters is also reported as 'en' by this pattern.
    """

    # Patterns are compiled once, at class-definition time, so that repeated
    # predict() calls do not pay the compilation cost again.
    language_regex_dict = {
        'el': re.compile(r'([\u0370-\u03FF]+)\s?'),  # Greek (Greek and Coptic block)
        'en': re.compile(r'([a-zA-Z]+)\s?'),  # English (ASCII letters)
    }

    def __init__(self, include_greeklish=True):
        """
        Initialize a new :class:`RegexClassifier` instance.

        :param include_greeklish: Optional. If True, texts that contain both
            Greek and Latin characters are labelled "greeklish".
        """
        self.include_greeklish = include_greeklish

    def predict(self, x):
        """
        Predict a language label for every text in ``x``.

        :param x: An iterable of strings.

        :return: An array with one label per text: "greeklish" (only when
            ``include_greeklish`` is True and both scripts are present), one of
            the keys of ``language_regex_dict``, or None when no pattern matches.
        :rtype: numpy.ndarray
        """
        patterns = RegexClassifier.language_regex_dict
        preds = []

        for text in x:
            pred = None

            if self.include_greeklish and \
                    patterns["el"].search(text) and \
                    patterns["en"].search(text):
                pred = "greeklish"
            else:
                # Only fall back to the single-script patterns when the text was
                # not already labelled "greeklish"; otherwise that label would be
                # overwritten by the first matching pattern.
                for lang_code, regex_pattern in patterns.items():
                    if regex_pattern.search(text):
                        pred = lang_code
                        break

            preds.append(pred)
        return np.array(preds)

In [2]:
# Example usage: one text in neither script, one in Latin script, one in Greek script
user_input_text = [
    "你好，これはサンプルです。안녕하세요",
    "this is an english sentence",
    "δεν αντεχω αλλο",
]
demo_classifier = RegexClassifier()
detected_language = demo_classifier.predict(user_input_text)
print(f"Detected Language Code: {detected_language}")

Detected Language Code: [None 'en' 'el']


## Defining the Gold Standard


https://aclanthology.org/L06-1229/
https://aclanthology.org/L04-1369/
https://huggingface.co/datasets/papluca/language-identification

In [3]:
import pandas as pd


def dataset_to_pd(dataset_dict: dict) -> pd.DataFrame:
    """
    Stack every split of a dataset dictionary into a single DataFrame.

    :param dataset_dict: Mapping from split name to a dataset that supports
        ``len()`` and can be passed to ``pd.DataFrame``.
    :type dataset_dict: dict

    :return: All rows of all splits, in dictionary order, with a fresh integer
        index and a leading "set" column holding the name of the split each row
        came from.
    :rtype: pd.DataFrame
    """
    total_rows = sum(len(split) for split in dataset_dict.values())
    split_names = np.empty(shape=(total_rows,), dtype=object)
    frames = []

    # Fill the split-name column block by block, in the same order in which
    # the frames are concatenated below.
    offset = 0
    for split_name, split in dataset_dict.items():
        frames.append(pd.DataFrame(split))
        end = offset + len(split)
        split_names[offset:end] = split_name
        offset = end

    full_df = pd.concat(frames, ignore_index=True)
    full_df["set"] = split_names
    # Move the "set" column to the front.
    set_column = full_df.pop("set")
    full_df.insert(0, "set", set_column)

    return full_df

In [4]:
from datasets import load_dataset


# Load every split of the language-identification dataset from the Hugging Face hub
dataset_name = "papluca/language-identification"
dataset_dict = load_dataset(dataset_name)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [5]:
# Merge all splits into one frame; the "set" column records each row's split
gold1_df = dataset_to_pd(dataset_dict=dataset_dict)
gold1_df

Unnamed: 0,set,labels,text
0,train,pt,"os chefes de defesa da estónia, letónia, lituâ..."
1,train,bg,размерът на хоризонталната мрежа може да бъде ...
2,train,zh,很好，以前从不去评价，不知道浪费了多少积分，现在知道积分可以换钱，就要好好评价了，后来我就把...
3,train,th,สำหรับ ของเก่า ที่ จริงจัง ลอง honeychurch ...
4,train,ru,Он увеличил давление .
...,...,...,...
89995,test,zh,史料很充分，对岸的很多观点与大陆迥异啊。
89996,test,tr,"Örneğin, teşhis Yunanca bir kelimeden alındı (..."
89997,test,vi,Nếu lite/light chỉ đơn giản là mô tả một đặc t...
89998,test,bg,"Например, една щатска столица, която посетихме..."


In [6]:
# Keep only the Greek and English rows, and only the label/text columns
is_el_or_en = gold1_df["labels"].isin(["el", "en"])
gold1_df = gold1_df.loc[is_el_or_en, ["labels", "text"]]
gold1_df

Unnamed: 0,labels,text
18,el,"Παρά τον εαυτό μου , η γνώμη μου για τη σύνεση..."
39,en,Didnt really seem to work much.
40,el,Ακόμα και το να ξέρεις ότι ήταν ένα άγαλμα που...
49,en,Highly recommend for those who don't like bein...
75,el,Έχω κάνει τέσσερις ή πέντε .
...,...,...
89961,en,"It's super cute, really soft. Print is fine bu..."
89965,en,"One of them worked, the other one didn't. Ther..."
89978,en,I only received one out of the three strikers :(
89982,el,Οι οργανισμοί πρέπει να είναι σε θέση να μετρή...


In [7]:
# Base URL of the forum; passed to parse_warmane_thread alongside each thread tag.
head_url = "https://forum.warmane.com"
# Listing page of the sub-forum to crawl; "&page=N" is appended to it per page.
# NOTE(review): f=20 is presumably the Greek-speaking sub-forum - confirm.
warmane_url = "https://forum.warmane.com/forumdisplay.php?f=20"

In [8]:
from src.util import fetch_soup
from tasks.warmane import parse_warmane_thread
from tqdm import tqdm


# Collect every thread from the first 8 listing pages of the sub-forum
threads = []

for page in range(1, 9):
    soup = fetch_soup(f"{warmane_url}&page={page}")

    print(f"Processing page {page} of 8...")
    # Each thread on a listing page is an <li class="threadbit"> element
    thread_tags = soup.find_all("li", {"class": "threadbit"})
    threads.extend(
        parse_warmane_thread(head_url, thread_tag)
        for thread_tag in tqdm(thread_tags)
    )

Processing page 1 of 8...


100%|##################################################################################| 20/20 [00:04<00:00,  4.39it/s]


Processing page 2 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.65it/s]


Processing page 3 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.39it/s]


Processing page 4 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.32it/s]


Processing page 5 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.16it/s]


Processing page 6 of 8...


100%|##################################################################################| 20/20 [00:03<00:00,  5.65it/s]


Processing page 7 of 8...


 55%|#############################################1                                    | 11/20 [00:02<00:01,  4.54it/s]

ERROR: Failed to get information on post  https://forum.warmane.com/showthread.php?t=272585


100%|##################################################################################| 20/20 [00:04<00:00,  4.88it/s]


Processing page 8 of 8...


 50%|##########################################                                          | 4/8 [00:00<00:00,  5.91it/s]

ERROR: Failed to get information on post  https://forum.warmane.com/showthread.php?t=278731


100%|####################################################################################| 8/8 [00:01<00:00,  5.72it/s]


In [9]:
import itertools

# Flatten the per-thread post lists into one set of unique posts
posts = {post for thread in threads for post in thread.posts}
len(posts)

414

In [10]:
import pandas as pd

# One row per post, indexed by post id
post_records = [vars(post) for post in posts]
warmane_df = pd.DataFrame.from_records(post_records, index="id")
# Posts that are not replies have no parent id; mark them with -1 so the column can be int
warmane_df["reply_to"] = warmane_df["reply_to"].fillna(-1).astype(int)
warmane_df

Unnamed: 0_level_0,thread_id,author,contents,date,reply_to
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2926596,384475,Ripsin,"Kalhspera paides,\n\r\nEimai arketo kairo ston...",2018-05-22,-1
2473988,300013,v4gflo,geia sas.psaxnw ellhniko guild ston Deathwing ...,2015-06-17,-1
2420747,290921,AlexPan,"Καλησπέρα παιδιά, το πρόβλημα είναι το εξής. \...",2015-03-24,-1
2981903,399822,xAchillesGate4x,Καλησπέρα παίδες. Ψάχνω Ελληνικό active raidin...,2019-03-03,-1
2879517,371804,Csdas,Opoios gnwrizei kati as mou kanei /w Dremoria ...,2017-11-29,-1
...,...,...,...,...,...
2877428,353812,Shiverbro,kalos private aksizei na ksekiniseis paidia?,2017-11-21,2875915
3069941,423611,crystallenia898,Ε μεις αυξανόμαστε και θα αρχίσουμε να στήνουμ...,2020-07-26,3068345
2801654,350071,Draculation,Bump! ICC25 6/12,2017-05-07,2795443
2873339,370241,Ripsin,Kalhspera tha ithela na rwthsw an kapoios gnwr...,2017-11-07,-1


In [11]:
# Flag posts whose text is empty or whitespace-only.
# `or` is required here: `x.isspace() | len(x)==0` is parsed as
# `(x.isspace() | len(x)) == 0`, which is only True for the empty string and
# therefore lets whitespace-only posts through.
empty_contents = warmane_df.contents.apply(lambda x: len(x) == 0 or x.isspace())
warmane_df[empty_contents]

Unnamed: 0_level_0,thread_id,author,contents,date,reply_to
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3082464,427259,malakas17,,2020-10-20,3081822
3113236,427259,malakas17,,2021-05-12,3113009
3099161,431660,malakas17,,2021-02-10,3096432
3113819,427259,malakas17,,2021-05-16,3113236
3099593,427259,boonick,,2021-02-14,3093400
3081820,427259,malakas17,,2020-10-16,3080427
3081822,427259,malakas17,,2020-10-16,3081820


In [12]:
# Drop the posts flagged as empty above
warmane_df = warmane_df.loc[~empty_contents]

In [13]:
# Classify each post by script only (no mixed-script "greeklish" label)
regex_model = RegexClassifier(include_greeklish=False)
preds = regex_model.predict(warmane_df["contents"])

In [14]:
# Posts predicted as Latin script
warmane_df.loc[preds == "en", "contents"]

id
2926596    Kalhspera paides,\n\r\nEimai arketo kairo ston...
2473988    geia sas.psaxnw ellhniko guild ston Deathwing ...
2879517    Opoios gnwrizei kati as mou kanei /w Dremoria ...
2959390               Bubblethesap Icecrown wotlk horde belf
2947119    den se vrisko kane add evvi  .\nmou leei den u...
                                 ...                        
2557881                          Paokara einai padou file :P
2602989    Me to plevro ton Allience ayti ti fora Oi <<An...
2877428         kalos private aksizei na ksekiniseis paidia?
2801654                                     Bump! ICC25 6/12
2873339    Kalhspera tha ithela na rwthsw an kapoios gnwr...
Name: contents, Length: 227, dtype: object

In [15]:
# Posts predicted as "greeklish" (none expected: the model was built with include_greeklish=False)
warmane_df.loc[preds == "greeklish", "contents"]

Series([], Name: contents, dtype: object)

In [16]:
# Posts predicted as Greek script
warmane_df.loc[preds == "el", "contents"]

id
2420747    Καλησπέρα παιδιά, το πρόβλημα είναι το εξής. \...
2981903    Καλησπέρα παίδες. Ψάχνω Ελληνικό active raidin...
2959391    Καλησπερα θα πας στις ιδιότητες (properties) σ...
2719776    Originally Posted by celphecil\n\nΚαλησπέρα Σω...
2971700    Ελληνικο guild , ψαχνουμε ατομα για runs ICC10...
                                 ...                        
2609122    πώρωση με τα μισά Dung, Quests και Raids bugge...
2707437    Λοιπόν μαγες,ξέρει κάποιος γιατί δεν γίνεται ν...
2777071    INACTIVE\n\n\n <Northern Defiance>  Καινούργια...
3069941    Ε μεις αυξανόμαστε και θα αρχίσουμε να στήνουμ...
2410495    Εμπειρια εχω, αλλα δεν εχω χρονο...αυτο ειναι ...
Name: contents, Length: 180, dtype: object

In [17]:
# Latin-script posts on this forum are treated as Greeklish; everything else as Greek.
# NOTE(review): posts whose prediction is None (no Greek or Latin letters) also
# end up labelled "el" here - confirm that this is intended.
is_latin_script = preds == "en"
labels = np.where(is_latin_script, "greeklish", "el")
gold2_df = pd.DataFrame({"labels": labels, "text": warmane_df["contents"]})
gold2_df

Unnamed: 0_level_0,labels,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2926596,greeklish,"Kalhspera paides,\n\r\nEimai arketo kairo ston..."
2473988,greeklish,geia sas.psaxnw ellhniko guild ston Deathwing ...
2420747,el,"Καλησπέρα παιδιά, το πρόβλημα είναι το εξής. \..."
2981903,el,Καλησπέρα παίδες. Ψάχνω Ελληνικό active raidin...
2879517,greeklish,Opoios gnwrizei kati as mou kanei /w Dremoria ...
...,...,...
2877428,greeklish,kalos private aksizei na ksekiniseis paidia?
3069941,el,Ε μεις αυξανόμαστε και θα αρχίσουμε να στήνουμ...
2801654,greeklish,Bump! ICC25 6/12
2873339,greeklish,Kalhspera tha ithela na rwthsw an kapoios gnwr...


In [18]:
# Stack both gold sources; the original row labels are kept in an "index" column
stacked_gold = pd.concat((gold1_df, gold2_df), axis=0)
gold_df = stacked_gold.reset_index(drop=False)
gold_df

Unnamed: 0,index,labels,text
0,18,el,"Παρά τον εαυτό μου , η γνώμη μου για τη σύνεση..."
1,39,en,Didnt really seem to work much.
2,40,el,Ακόμα και το να ξέρεις ότι ήταν ένα άγαλμα που...
3,49,en,Highly recommend for those who don't like bein...
4,75,el,Έχω κάνει τέσσερις ή πέντε .
...,...,...,...
9402,2877428,greeklish,kalos private aksizei na ksekiniseis paidia?
9403,3069941,el,Ε μεις αυξανόμαστε και θα αρχίσουμε να στήνουμ...
9404,2801654,greeklish,Bump! ICC25 6/12
9405,2873339,greeklish,Kalhspera tha ithela na rwthsw an kapoios gnwr...


In [19]:
import os


gold_file = os.path.join("data", "gold.csv")
# Make sure the output directory exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(gold_file), exist_ok=True)
# to_csv opens and closes the file itself when given a path; wrapping it in
# `with open(...)` only created a second, unused handle on the same file.
# UTF-8 is stated explicitly because the texts are Greek.
gold_df.to_csv(gold_file, encoding="utf-8")
print(f"Gold saved successfully as {gold_file}")

Gold saved successfully as data\gold.csv


## Youtube Crawling

In [20]:
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver


# Use the default chromedriver service if it can be created; otherwise let
# webdriver_manager install a driver and point the service at it.
try:
    service = Service()
except Exception:
    service = Service(ChromeDriverManager().install())

# Headless Chrome with an English UI and a desktop user agent
options = webdriver.ChromeOptions()
for chrome_argument in (
    "--headless",
    '--lang=en',
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_argument)

driver = webdriver.Chrome(service=service, options=options)

In [21]:
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import bs4
import json, time


#code credit https://serpapi.com/blog/scrape-youtube-video-page-with-python/
def scrape_youtube(driver, search_url: str, max_scrolls:int=10, 
                   scroll_wait_secs: float=1, verbose: bool=False) -> bs4.BeautifulSoup:
    """
    Scrape a YouTube page using a Selenium WebDriver.

    :param driver: Selenium WebDriver instance.
    :type driver: selenium.webdriver.remote.webdriver.WebDriver

    :param search_url: The URL of the YouTube page to open.
    :type search_url: str

    :param max_scrolls: Optional. The maximum number of times to scroll down the page.
    :type max_scrolls: int, default: 10

    :param scroll_wait_secs: Optional. The number of seconds to wait between each scroll.
    :type scroll_wait_secs: float, default: 1

    :param verbose: Optional. If True, print progress.
    :type verbose: bool, default: False

    :return: A BeautifulSoup object representing the scraped page.
    :rtype: bs4.BeautifulSoup

    This function uses a Selenium WebDriver to open the specified URL, scrolls down
    the page at most ``max_scrolls`` times (stopping early once the page height no
    longer grows), and returns the BeautifulSoup object representing the page source.
    """
    # Single definition of the height probe, used before and after every scroll.
    height_script = """
        function getHeight() {
            return document.querySelector('ytd-app').scrollHeight;
        }
        return getHeight();
    """

    driver.get(search_url)
    old_height = driver.execute_script(height_script)

    # range(max_scrolls) scrolls at most max_scrolls times; the previous
    # `scrolled_times <= max_scrolls` condition allowed one extra scroll.
    for scrolled_times in range(max_scrolls):
        if verbose:
            print(f"Scrolling ({scrolled_times} out of max {max_scrolls})...")

        driver.execute_script("window.scrollTo(0, document.querySelector('ytd-app').scrollHeight)")

        # Give the page time to load the next batch of content.
        time.sleep(scroll_wait_secs)

        new_height = driver.execute_script(height_script)

        # No growth means no new content was loaded: stop scrolling.
        if new_height == old_height:
            break

        old_height = new_height

    # Name the parser explicitly so that the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")

    return soup

In [22]:
# YouTube search for "greek songs", with the query written in Greek (URL-encoded)
search_url = "https://www.youtube.com/results?search_query=%CE%B5%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA%CE%B1+%CF%84%CF%81%CE%B1%CE%B3%CE%BF%CF%85%CE%B4%CE%B9%CE%B1"
search_soup = scrape_youtube(
    driver=driver,
    search_url=search_url,
    max_scrolls=15,
    verbose=True,
)

Scrolling (0 out of max 15)...
Scrolling (1 out of max 15)...
Scrolling (2 out of max 15)...
Scrolling (3 out of max 15)...
Scrolling (4 out of max 15)...
Scrolling (5 out of max 15)...
Scrolling (6 out of max 15)...
Scrolling (7 out of max 15)...
Scrolling (8 out of max 15)...
Scrolling (9 out of max 15)...
Scrolling (10 out of max 15)...
Scrolling (11 out of max 15)...
Scrolling (12 out of max 15)...
Scrolling (13 out of max 15)...
Scrolling (14 out of max 15)...
Scrolling (15 out of max 15)...


In [23]:
def extract_search_results(search_soup: bs4.BeautifulSoup) -> tuple[list[str], list[str]]:
    """
    Collect the video titles and links found on a YouTube search results page.

    :param search_soup: A BeautifulSoup object representing the YouTube search results page.
    :type search_soup: bs4.BeautifulSoup

    :return: Two parallel lists: the text of every ``<a id="video-title">`` element
        inside the ``ytd-app`` element, and the ``href`` attribute of the same elements.
    :rtype: tuple(list[str], list[str])
    """
    app_root = search_soup.find("ytd-app")
    title_anchors = app_root.find_all("a", {"id": "video-title"})

    titles = [anchor.get_text() for anchor in title_anchors]
    links = [anchor.get("href") for anchor in title_anchors]

    return titles, links


# Tabulate the search results: one row per video
results = extract_search_results(search_soup)
video_titles, video_links = results
results_df = pd.DataFrame({"title": video_titles, "link": video_links})
results_df

Unnamed: 0,title,link
0,"\n\nΠου 'ναι τα χρόνια ! - 40 αθάνατα, αγαπημέ...",/watch?v=OdX0y96UPEA&pp=ygUjzrXOu867zrfOvc65zr...
1,\n\nΤΑ ΛΑΪΚΑ ΤΗΣ ΤΑΒΕΡΝΑΣ | NON STOP MIX - Που...,/watch?v=C4f3xcZzr3s&pp=ygUjzrXOu867zrfOvc65zr...
2,\n\nGreek Hits 2023 | Non-Stop Mix by Elegant ...,/watch?v=RcSAggke-_U&pp=ygUjzrXOu867zrfOvc65zr...
3,\n\nΜουσική ιστορία Νο.1 (μέρος πρώτο) - 100 χ...,/watch?v=p5g82ta4sTk&pp=ygUjzrXOu867zrfOvc65zr...
4,"\n\nΠουλόπουλος, Μπάσης, Γλυκερία, Ανδρεάτος, ...",/watch?v=cOGip_clrKY&pp=ygUjzrXOu867zrfOvc65zr...
...,...,...
286,\n\nΜιχάλης Χατζηγιάννης - Χορεύω | Official M...,/watch?v=1Y3h-3ka4bo&pp=ygUjzrXOu867zrfOvc65zr...
287,\n\nGreek & Int. Music Mix 2022 - Ελληνικα & Ξ...,/watch?v=rKqnlgp_mwA&pp=ygUjzrXOu867zrfOvc65zr...
288,\n\nDisney αλά ΕΛΛΗΝΙΚΑ | NeverLander\n,/watch?v=qmtC8SflKT0&pp=ygUjzrXOu867zrfOvc65zr...
289,\n\nΜανώλης Αγγελόπουλος - Νυχτερίδες κι αράχν...,/watch?v=YzJNzqxDUqQ&pp=ygUjzrXOu867zrfOvc65zr...


In [24]:
# Strip the surrounding whitespace from titles and turn the relative links into absolute URLs
results_df["title"] = results_df["title"].map(str.strip)
results_df["link"] = results_df["link"].map(lambda href: "https://www.youtube.com" + href)
results_df

Unnamed: 0,title,link
0,"Που 'ναι τα χρόνια ! - 40 αθάνατα, αγαπημένα, ...",https://www.youtube.com/watch?v=OdX0y96UPEA&pp...
1,ΤΑ ΛΑΪΚΑ ΤΗΣ ΤΑΒΕΡΝΑΣ | NON STOP MIX - Πουλόπο...,https://www.youtube.com/watch?v=C4f3xcZzr3s&pp...
2,Greek Hits 2023 | Non-Stop Mix by Elegant Gree...,https://www.youtube.com/watch?v=RcSAggke-_U&pp...
3,Μουσική ιστορία Νο.1 (μέρος πρώτο) - 100 χρυσά...,https://www.youtube.com/watch?v=p5g82ta4sTk&pp...
4,"Πουλόπουλος, Μπάσης, Γλυκερία, Ανδρεάτος, Μπέλ...",https://www.youtube.com/watch?v=cOGip_clrKY&pp...
...,...,...
286,Μιχάλης Χατζηγιάννης - Χορεύω | Official Music...,https://www.youtube.com/watch?v=1Y3h-3ka4bo&pp...
287,Greek & Int. Music Mix 2022 - Ελληνικα & Ξενα ...,https://www.youtube.com/watch?v=rKqnlgp_mwA&pp...
288,Disney αλά ΕΛΛΗΝΙΚΑ | NeverLander,https://www.youtube.com/watch?v=qmtC8SflKT0&pp...
289,Μανώλης Αγγελόπουλος - Νυχτερίδες κι αράχνες (...,https://www.youtube.com/watch?v=YzJNzqxDUqQ&pp...


In [25]:
# Keep the videos whose title got a label other than "en" (and was not left unlabelled)
greeklish_model = RegexClassifier(include_greeklish=True)
preds = greeklish_model.predict(results_df["title"])
has_non_en_label = (preds != "en") & (preds != None)
non_en_res_df = results_df[has_non_en_label]
non_en_res_df

Unnamed: 0,title,link
0,"Που 'ναι τα χρόνια ! - 40 αθάνατα, αγαπημένα, ...",https://www.youtube.com/watch?v=OdX0y96UPEA&pp...
1,ΤΑ ΛΑΪΚΑ ΤΗΣ ΤΑΒΕΡΝΑΣ | NON STOP MIX - Πουλόπο...,https://www.youtube.com/watch?v=C4f3xcZzr3s&pp...
3,Μουσική ιστορία Νο.1 (μέρος πρώτο) - 100 χρυσά...,https://www.youtube.com/watch?v=p5g82ta4sTk&pp...
4,"Πουλόπουλος, Μπάσης, Γλυκερία, Ανδρεάτος, Μπέλ...",https://www.youtube.com/watch?v=cOGip_clrKY&pp...
5,Τα λαϊκά που αγαπήσαμε - 100 ιστορικά τραγούδι...,https://www.youtube.com/watch?v=OuY-cviIIqM&pp...
...,...,...
286,Μιχάλης Χατζηγιάννης - Χορεύω | Official Music...,https://www.youtube.com/watch?v=1Y3h-3ka4bo&pp...
287,Greek & Int. Music Mix 2022 - Ελληνικα & Ξενα ...,https://www.youtube.com/watch?v=rKqnlgp_mwA&pp...
288,Disney αλά ΕΛΛΗΝΙΚΑ | NeverLander,https://www.youtube.com/watch?v=qmtC8SflKT0&pp...
289,Μανώλης Αγγελόπουλος - Νυχτερίδες κι αράχνες (...,https://www.youtube.com/watch?v=YzJNzqxDUqQ&pp...


In [26]:
def extract_comments(video_soup: bs4.BeautifulSoup) -> list[str]:
    """
    Collect the comment texts found on a YouTube video page.

    :param video_soup: A BeautifulSoup object representing the YouTube video page.
    :type video_soup: bs4.BeautifulSoup

    :return: The text of the first ``<span>`` inside every
        ``<yt-formatted-string id="content-text">`` element; elements without a
        ``<span>`` are skipped.
    :rtype: list[str]
    """
    comments = []
    for comment_tag in video_soup.find_all("yt-formatted-string", {"id": "content-text"}):
        text_span = comment_tag.find("span")
        if text_span is None:
            continue
        comments.append(text_span.get_text())
    return comments

In [34]:
from tqdm import tqdm


# Gather the comments of the first `scrap_limit` non-English videos
comments = []
scrap_limit = 20

links_to_scrape = non_en_res_df.link[:scrap_limit]
for link in tqdm(links_to_scrape):
    # Best effort: a video that cannot be scraped is reported and skipped
    try:
        video_soup = scrape_youtube(
            driver,
            link,
            max_scrolls=15,
            scroll_wait_secs=1.5,
            verbose=False,
        )
        comments.extend(extract_comments(video_soup))
    except Exception:
        print(f"ERROR: Cannot retrive comments from {link}, skipping...")

crawl_df = pd.DataFrame({"text": comments})
crawl_df

 90%|#########################################################################8        | 18/20 [04:47<00:37, 18.73s/it]

ERROR: Cannot retrive comments from https://www.youtube.com/watch?v=GfQAaMyUDuU&pp=ygUjzrXOu867zrfOvc65zrrOsSDPhM-BzrHOs86_z4XOtM65zrE%3D, skipping...


 95%|#############################################################################8    | 19/20 [04:52<00:14, 14.60s/it]

ERROR: Cannot retrive comments from https://www.youtube.com/watch?v=v0oHBqMnWAM&pp=ygUjzrXOu867zrfOvc65zrrOsSDPhM-BzrHOs86_z4XOtM65zrE%3D, skipping...


100%|##################################################################################| 20/20 [04:57<00:00, 14.89s/it]


Unnamed: 0,text
0,"Γιατί όταν το ακούω,δακρύζω."
1,Αχ που ναι τα χρόνια ωραία χρόνια .....κι ύστε...
2,Δροσιά μέσα στον καύσωνα που μας ταλαιπωρεί.
3,Μια δροσερη καλημερα! Ευχαριστουμε για την υπε...
4,Καλό μεσημέρι στη παρέα. Μια άλλη υπέροχη συλλ...
...,...
726,Linda ne qofse mundesh nuk e gjej
727,Khoranya pola pola
728,Τρηπλετα.γεια.
729,🇹🇷


In [35]:
# Show the comments that consist only of whitespace
crawl_df = pd.DataFrame({"text": comments})
is_blank_comment = crawl_df["text"].map(str.isspace)
crawl_df[is_blank_comment]

Unnamed: 0,text
59,\n
206,
334,
353,


In [36]:
# Show the comments that will be discarded: labelled "en" or not labelled at all
preds = greeklish_model.predict(crawl_df["text"])
is_rejected = (preds == "en") | (preds == None)
crawl_df[is_rejected]

Unnamed: 0,text
9,Hi Linda you're an awesome
18,️
47,Felicitări
48,Beautiful thank you
50,Thank you for this beautiful selection excelle...
...,...
725,Ta tres grande
726,Linda ne qofse mundesh nuk e gjej
727,Khoranya pola pola
729,🇹🇷


In [37]:
# Keep the comments with a non-English label that are not whitespace-only
has_non_en_label = (preds != "en") & (preds != None)
is_blank_comment = crawl_df.text.apply(lambda x: x.isspace())
gr_crawl_df = crawl_df[has_non_en_label & ~is_blank_comment]
gr_crawl_df

Unnamed: 0,text
0,"Γιατί όταν το ακούω,δακρύζω."
1,Αχ που ναι τα χρόνια ωραία χρόνια .....κι ύστε...
2,Δροσιά μέσα στον καύσωνα που μας ταλαιπωρεί.
3,Μια δροσερη καλημερα! Ευχαριστουμε για την υπε...
4,Καλό μεσημέρι στη παρέα. Μια άλλη υπέροχη συλλ...
...,...
714,4 τέλειο κοματι μη κόλασ σε ένα φιλί σε μια αγ...
715,Ολα ένα κι ένα. Τρία μεγαθήρια μαζί.
716,Τελεια
722,Ένα μιξ με Τερζή και Ρέμο πλζζζ


In [38]:
crawl_file = os.path.join("data", "crawl.csv")
# Make sure the output directory exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(crawl_file), exist_ok=True)
# to_csv opens and closes the file itself when given a path; wrapping it in
# `with open(...)` only created a second, unused handle on the same file.
# UTF-8 is stated explicitly because the comments are Greek.
gr_crawl_df.to_csv(crawl_file, encoding="utf-8")
print(f"Crawled data saved successfully as {crawl_file}")

Crawled data saved successfully as data\crawl.csv


In [39]:
# Shut down the Selenium driver created earlier in the notebook now that crawling is done.
driver.quit()