## 1. Install dependencies

In [None]:
%pip install tqdm

## 2. Data extraction (step 1)

In [None]:
# coding: utf-8

"""
Ce programme n'utilise pas de parallélisation donc il risque de prendre un peu plus de temps
"""

import os
import re
import gzip
import sys
from tqdm import tqdm 
from collections import namedtuple

# Parsing configuration for one dump table:
#   parser         -- compiled regex matching ONE value tuple of an INSERT
#                     statement (without the surrounding parentheses)
#   num_fields     -- number of columns in each value tuple
#   column_indexes -- default 0-based columns to export
FILEPROPS = namedtuple("Fileprops", "parser num_fields column_indexes")

# Every column is captured in a named group "row<N>" (N = 0-based column
# position) because parse_match() reads the fields through groupdict().
CATEGORYLINKS_PARSER = re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>\'.*?\'?),(?P<row2>\'.*?\'?),(?P<row3>\'[0-9\ \-:]+\'?),(?P<row4>\'.*?\'?),(?P<row5>\'[a-z\-]*?\'?),(?P<row6>\'[a-z]+\'?)$')
PAGELINKS_PARSER = re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>[0-9]+?),(?P<row2>\'.*?\'?),(?P<row3>[0-9]+?)$')
LANGLINKS_PARSER = re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>\'.*?\'?),(?P<row2>\'.*?\'?)$')
REDIRECT_PARSER = re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>-?[0-9]+?),(?P<row2>\'.*?\'?),(?P<row3>\'.*?\'?),(?P<row4>\'.*?\'?)$')
CATEGORY_PARSER = re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>\'.*?\'?),(?P<row2>[0-9]+?),(?P<row3>[0-9]+?),(?P<row4>[0-9]+?)$')
# The groups here must be named like the others: with anonymous groups
# groupdict() is empty and every "row<N>" lookup fails with KeyError.
PAGE_PROPS_PARSER = re.compile(r'^(?P<row0>[0-9]+),(?P<row1>\'.*?\'),(?P<row2>\'.*?\'),(?P<row3>\'[0-9\ \-:]+\'),(?P<row4>\'\'),(?P<row5>\'.*?\'),(?P<row6>\'.*?\')$')
# Columns 7, 10 and 11 accept either a quoted value or the literal NULL; the
# outer group "row<N>" captures whichever alternative matched.
PAGE_PARSER = re.compile((r'^(?P<row0>[0-9]+?),(?P<row1>[0-9]+?),(?P<row2>\'.*?\'?),(?P<row3>[0-9]+?),(?P<row4>[0-9]?),'
    r'(?P<row5>[0-9\.]+?),(?P<row6>\'.*?\'?),(?P<row7>(?P<row7val>\'.*?\'?)|(?P<row7null>NULL)),(?P<row8>[0-9]+?),(?P<row9>[0-9]+?),'
    r'(?P<row10>(?P<row10val>\'.*?\'?)|(?P<row10null>NULL)),(?P<row11>(?P<row11val>\'.*?\'?)|(?P<row11null>NULL))$'))


"""
# page
`page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
`page_namespace` int(11) NOT NULL DEFAULT 0,
`page_title` varbinary(255) NOT NULL DEFAULT '',
`page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
`page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
`page_random` double unsigned NOT NULL DEFAULT 0,
`page_touched` varbinary(14) NOT NULL,
`page_links_updated` varbinary(14) DEFAULT NULL,
`page_latest` int(8) unsigned NOT NULL DEFAULT 0,
`page_len` int(8) unsigned NOT NULL DEFAULT 0,
`page_content_model` varbinary(32) DEFAULT NULL,
`page_lang` varbinary(35) DEFAULT NULL,

#langlinks
`ll_from` int(8) unsigned NOT NULL DEFAULT 0,
`ll_lang` varbinary(35) NOT NULL DEFAULT '',
`ll_title` varbinary(255) NOT NULL DEFAULT '',

# pagelinks
`pl_from` int(8) unsigned NOT NULL DEFAULT '0',
`pl_namespace` int(11) NOT NULL DEFAULT '0',
`pl_title` varbinary(255) NOT NULL DEFAULT '',
`pl_from_namespace` int(11) NOT NULL DEFAULT '0',


"""


# Registry of the supported dump tables, keyed by table name. Each entry
# bundles the tuple parser, the number of columns in the table and the
# columns exported by default.
FILETYPE_PROPS = {
    "categorylinks": FILEPROPS(parser=CATEGORYLINKS_PARSER, num_fields=7, column_indexes=(0, 1, 6)),
    "pagelinks": FILEPROPS(parser=PAGELINKS_PARSER, num_fields=4, column_indexes=(0, 1, 2, 3)),
    "langlinks": FILEPROPS(parser=LANGLINKS_PARSER, num_fields=3, column_indexes=(0, 1, 2)),
    "redirect": FILEPROPS(parser=REDIRECT_PARSER, num_fields=5, column_indexes=(0, 1, 2)),
    "category": FILEPROPS(parser=CATEGORY_PARSER, num_fields=5, column_indexes=(0, 1, 2, 3, 4)),
    "page_props": FILEPROPS(parser=PAGE_PROPS_PARSER, num_fields=7, column_indexes=(0, 1)),
    "page": FILEPROPS(parser=PAGE_PARSER, num_fields=12, column_indexes=(0, 1, 2, 3, 9, 10, 11)),
}

#VALUE_PARSER=re.compile(r'\(([0-9]+),(\'.*?\'),(\'.*?\'),(\'[0-9\ \-:]+\'),(\'\'),(\'.*?\'),(\'.*?\')\)')

In [None]:

def parse_match(match, column_indexes):
    """Pick the requested columns out of a regex match.

    Args:
        match: A match object whose pattern names its groups ``row0``,
            ``row1``, ... (one per column).
        column_indexes: Iterable of column numbers to extract, in output order.

    Returns:
        A tuple with the captured text of each requested column.

    Raises:
        KeyError: If a requested ``row<N>`` group does not exist in the match.
    """
    named_groups = match.groupdict()
    wanted_keys = [f"row{index}" for index in column_indexes]
    return tuple(named_groups[group_name] for group_name in wanted_keys)


def parse_value(value, parser, column_indexes, value_idx=0, pbar=None):
    """Yield the selected columns of every match of ``parser`` in one value tuple.

    Args:
        value: Text of a single value tuple, without its outer parentheses.
        parser: Compiled regex describing the tuple layout.
        column_indexes: Columns to extract from each match.
        value_idx: Position of this tuple inside its INSERT statement; only
            used in the "Unable to parse." diagnostic.
        pbar: Accepted for call compatibility; not used here.

    Yields:
        One tuple of column strings per match.

    Problems are reported on stderr rather than raised: a failure while
    extracting a match is printed and skipped, and a value that produces no
    match at all is reported as "Unable to parse.".
    """
    # The escaped UTF-8 en dash (\xe2\x80\x93) is normalised to an ASCII hyphen.
    normalized = value.replace("\\xe2\\x80\\x93", "-")
    match_count = 0
    for found in parser.finditer(normalized):
        match_count += 1
        try:
            yield parse_match(found, column_indexes)
        except Exception as error:
            print("Line: {!r}, Exception: {}".format(normalized, error), file=sys.stderr)
    if match_count == 0:
        print("Line: {!r}, IDX: {}, Exception: {}".format(normalized, value_idx, "Unable to parse."), file=sys.stderr)


def process_insert_values_line(line, parser, column_indexes, count_inserts=0, pbar=None):
    """Yield the parsed rows of one ``INSERT INTO ... VALUES ...;`` line.

    Args:
        line: A full INSERT statement of the form
            ``INSERT INTO `table` VALUES (v1,v2),(v1,v2),...;``.
        parser: Compiled regex for a single value tuple.
        column_indexes: Columns to extract from each tuple.
        count_inserts: Running number of the INSERT statement, shown in the
            progress bar.
        pbar: Optional progress bar exposing ``set_postfix``; when ``None``
            no progress information is reported.

    Yields:
        One tuple of column strings per successfully parsed value tuple.
    """
    _, _, values = line.partition(' VALUES ')
    # Drop the leading "(" and the trailing ");" before splitting on the tuple
    # separator. The split is purely textual, so a tuple whose own text
    # contains "),(" is cut into fragments that are passed on as-is.
    tuples = values.strip()[1:-2].split("),(")
    if pbar is not None:
        pbar.set_postfix(found_values=len(tuples), insert_num=count_inserts)
    for value_idx, value in enumerate(tuples):
        yield from parse_value(value, parser, column_indexes, value_idx, pbar)


def process_file(fp, fp_out, filetype, column_indexes=None, silent=False):
    """Convert the INSERT statements of a SQL dump into tab-separated rows.

    Args:
        fp: Iterable of text lines from the SQL dump.
        fp_out: Writable text file receiving one tab-separated row per line.
        filetype: Table name; must be a key of ``FILETYPE_PROPS``.
        column_indexes: Columns to export; defaults to the table's entry in
            ``FILETYPE_PROPS`` when ``None``.
        silent: Forwarded to ``tqdm`` as its ``disable`` argument.

    Raises:
        ValueError: If ``filetype`` is not a supported table.
    """
    if filetype not in FILETYPE_PROPS:
        raise ValueError("Invalid filetype: {}".format(filetype))
    parser, num_fields, ci = FILETYPE_PROPS[filetype]
    print("Parser: {}\nnum_fields: {}\nci: {}".format(parser, num_fields, ci))
    if column_indexes is None:
        column_indexes = ci
    # Only lines starting with this prefix carry data; built once, outside the loop.
    insert_prefix = 'INSERT INTO `{}` VALUES '.format(filetype)
    with tqdm(disable=silent) as pbar:
        count_inserts = 0
        for line in fp:
            if not line.startswith(insert_prefix):
                continue
            count_inserts += 1
            for row in process_insert_values_line(
                    line, parser, column_indexes, count_inserts, pbar):
                pbar.update(1)
                print("\t".join(row), file=fp_out)


In [None]:

def main():
    """Extract the ``page`` and ``langlinks`` dumps into TSV files under ``data/``.

    Each gzip-compressed SQL dump is read from the current working directory
    and written to ``data/<output name>`` through ``process_file``.
    """
    files_to_open = {
        "page": {
            "input": "ruwiki-latest-page.sql.gz",
            "output": "pages_encoded_version.csv",
        },
        "langlinks": {
            "input": "ruwiki-latest-langlinks.sql.gz",
            "output": "langlinks_encoded_version.csv",
        },
    }

    data_folder = 'data'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_folder, exist_ok=True)

    for database_name, paths in files_to_open.items():
        output_path = os.path.join(data_folder, paths["output"])
        # Reading as ASCII with "backslashreplace" turns every non-ASCII byte
        # into a literal \xNN escape, so the regex parsers only see ASCII text.
        # NOTE(review): silent=None is forwarded to tqdm's `disable`; confirm
        # that None (rather than False) is the intended value.
        with gzip.open(paths["input"], 'rt', encoding='ascii', errors='backslashreplace') as fp, \
                open(output_path, 'wt', encoding='utf-8') as fp_out:
            process_file(fp, fp_out, database_name, column_indexes=None, silent=None)
    print("End of Processing")
        

In [None]:
# Run step 1: convert the compressed SQL dumps into tab-separated files under data/.
main()

## 3. Compute and save decoded data

In [None]:
# coding: utf-8

"""
Note: multithreading is used to split the work into batches and spread it across the available processors.
This helps speed up the data processing.
It also keeps the computer from freezing: instead of one big task, it performs several small ones.
"""

#%%
import os
import threading
from collections import defaultdict, namedtuple
from typing import DefaultDict, List


In [None]:

# Threshold on the number of live threads, used by the batch-processing cells
# to throttle how many worker threads run at once.
MAX_THREAD_START_NUMBER = 11
# Number of input lines grouped into one batch (one worker thread per batch).
MAX_BATCH = 1000
INPUT_DATA_FOLDER = "data"
OUTPUT_DATA_FOLDER = "decoded"
# Re-entrant lock guarding the shared dictionaries and the output files.
LOCK = threading.RLock()


def build_url(url: str) -> str:
    """Return the ru.wikipedia.org article URL for the given page title."""
    return f"https://ru.wikipedia.org/wiki/{url}"

In [None]:
# Create the output directory for the decoded data; exist_ok makes this a
# no-op when the directory is already there and avoids a check-then-create race.
os.makedirs(OUTPUT_DATA_FOLDER, exist_ok=True)

In [None]:
# Batches of raw langlinks lines keyed by batch number; each batch is consumed
# (and then removed) by make_lang_mapper.
list_strings:DefaultDict[int, List[str]] = defaultdict(list)
# page id -> {language code: translated title}, filled by make_lang_mapper.
pages_langs = defaultdict(dict)
# Batches of raw page lines keyed by batch number; each batch is consumed
# (and then removed) by find_right_fit_and_export.
pages_descriptions:DefaultDict[int, List[str]] = defaultdict(list)


In [None]:

def make_lang_mapper(key: int) -> None:
    """Record the translations listed in one batch of langlinks lines.

    Each line of ``list_strings[key]`` holds three tab-separated fields
    (source page id, language code, translated title). Every non-empty title
    is stored as ``pages_langs[page_id][language_code] = title``, so checking
    whether a page has a translation in a given language is a single
    dictionary lookup instead of a scan over all of its languages.

    Args:
        key: Batch number whose lines (at most MAX_BATCH of them) are processed.

    Returns:
        None. The batch is removed from ``list_strings`` once processed.

    Raises:
        ValueError: If a line does not contain exactly three tab-separated
            fields or its page id is not an integer.
    """
    for line in list_strings[key]:
        ll_from, ll_lang, ll_title = line.strip().split("\t")
        ll_title = ll_title.strip().strip("'").strip()
        # An empty title means the translation does not exist in that language.
        if not ll_title:
            continue
        page_id = int(ll_from)
        lang_code = ll_lang.strip("'")
        # Only the write to the shared dictionary needs the lock.
        with LOCK:
            pages_langs[page_id][lang_code] = ll_title

    with LOCK:
        # Release the memory held by the processed lines.
        list_strings.pop(key, None)


def find_right_fit_and_export(key:int, output_without_fr_en_file, output_without_fr_file):
    """Export the articles of one batch that lack a French translation.

    Only main-namespace pages that are not redirects are considered. A page
    with neither a French nor an English translation is written to
    ``output_without_fr_en_file``; a page with an English but no French
    translation is written to ``output_without_fr_file``; pages that have a
    French translation are skipped.

    Args:
        key: Batch number in ``pages_descriptions`` (at most MAX_BATCH lines).
        output_without_fr_en_file: Writable text file for pages lacking both
            French and English translations.
        output_without_fr_file: Writable text file for pages lacking only the
            French translation.

    Returns:
        None. The batch is removed from ``pages_descriptions`` once processed.

    Raises:
        ValueError: If a line has fewer than four tab-separated fields or a
            numeric field is not an integer.
    """
    for line in pages_descriptions[key]:
        # Only the first four columns are needed; the page export may carry
        # more (step 1 writes seven columns for the page table by default).
        page_id, namespace, title, is_redirect = line.strip().split("\t")[:4]
        if int(namespace) != 0 or int(is_redirect) != 0:
            continue

        # Titles carry their non-ASCII bytes as literal \xNN escapes and are
        # wrapped in single quotes; decode_text expects the bare str.
        decoded_title = decode_text(title.strip().strip("'"))
        url = build_url(decoded_title)

        with LOCK:
            # A missing entry means the page has no recorded translation at all.
            langs = pages_langs.get(int(page_id)) or {}
            if langs.get("fr"):
                continue
            if langs.get("en"):
                target_file = output_without_fr_file
            else:
                target_file = output_without_fr_en_file
            print(f"{page_id}\t\t{decoded_title}\t\t{url}", file=target_file)

    with LOCK:
        # Release the memory held by the processed lines.
        pages_descriptions.pop(key, None)


def decode_text(encoded_text: str) -> str:
    r"""Decode the ``\xNN`` byte escapes embedded in a text into characters.

    The input mixes plain ASCII text with literal ``\xNN`` escapes, each
    standing for one byte. Every maximal run of consecutive escapes is
    decoded as UTF-8 and substituted in place, so characters of any encoded
    length (2, 3 or 4 bytes) keep their position relative to the surrounding
    ASCII text. Everything else, including other backslash sequences such as
    ``\'``, is returned unchanged.

    Args:
        encoded_text: The text containing ``\xNN`` escapes.

    Returns:
        The text with every escape run replaced by the characters it encodes.

    Raises:
        UnicodeDecodeError: If a run of escaped bytes is not valid UTF-8.
    """
    # Function-scope import keeps this cell usable on its own.
    import re

    def _decode_run(match):
        # "\xd0\x9f" -> "d09f" -> b"\xd0\x9f" -> decoded character(s)
        hex_digits = match.group(0).replace("\\x", "")
        return bytes.fromhex(hex_digits).decode("utf-8")

    return re.sub(r"(?:\\x[0-9a-fA-F]{2})+", _decode_run, encoded_text)

In [None]:
# Open the language-links file and build the table mapping each article to
# the languages it has been translated into.
lang_file_name = "langlinks_encoded_version.csv"

# Threads already alive before any worker is started (the main thread plus
# whatever the host environment runs).
threads_alive = len(threading.enumerate())
# Keep the total number of live threads below MAX_THREAD_START_NUMBER - 1,
# but always allow at least one worker so the loop cannot stall when the
# environment alone already reaches that threshold.
max_lang_workers = max(1, MAX_THREAD_START_NUMBER - 1 - threads_alive)
lang_workers = []


def _start_lang_worker(batch_key):
    """Start make_lang_mapper on one batch, waiting first if all worker slots are busy."""
    while len(lang_workers) >= max_lang_workers:
        # Block on the oldest worker instead of spinning on the thread count.
        lang_workers.pop(0).join()
    worker = threading.Thread(target=make_lang_mapper, args=(batch_key,), daemon=True)
    worker.start()
    lang_workers.append(worker)


counter = 1
key = 0

print("openning lang_file")
with open(os.path.join(INPUT_DATA_FOLDER, lang_file_name), "rt", encoding="utf-8") as lang_file:
    print("Successfully openend")
    print("Looping on entries")
    for line in lang_file:
        list_strings[key].append(line)
        if counter % MAX_BATCH == 0:
            _start_lang_worker(key)
            key += 1
        counter += 1

# The last batch is usually incomplete and has not been dispatched yet.
_start_lang_worker(key)

print('wait ends of thread')
# Joining the workers themselves terminates as soon as they are done,
# independently of how many unrelated threads the environment keeps alive.
for worker in lang_workers:
    worker.join()
print("Pages langs dictionary is successfull builded")


In [None]:
# Open the page export and write out the articles that have no French
# translation, and those that have neither a French nor an English one.
print("openning page file")

page_file_name = "pages_encoded_version.csv"
output_without_fr_name = "database_without_articles_translate_in_fr.csv"
output_without_fr_en_name = "database_without_articles_translate_in_fr_and_en.csv"

# Threads already alive before any worker is started (the main thread plus
# whatever the host environment runs).
threads_alive = len(threading.enumerate())
# Keep the total number of live threads below MAX_THREAD_START_NUMBER - 1,
# but always allow at least one worker so the loop cannot stall when the
# environment alone already reaches that threshold.
max_page_workers = max(1, MAX_THREAD_START_NUMBER - 1 - threads_alive)
page_workers = []


def _start_page_worker(batch_key, without_fr_en_file, without_fr_file):
    """Start find_right_fit_and_export on one batch, waiting first if all worker slots are busy."""
    while len(page_workers) >= max_page_workers:
        # Block on the oldest worker instead of spinning on the thread count.
        page_workers.pop(0).join()
    worker = threading.Thread(
        target=find_right_fit_and_export,
        args=(batch_key, without_fr_en_file, without_fr_file),
        daemon=True,
    )
    worker.start()
    page_workers.append(worker)


key = 0
counter = 1

with open(os.path.join(INPUT_DATA_FOLDER, page_file_name), "rt", encoding="utf-8") as page_file, \
        open(os.path.join(OUTPUT_DATA_FOLDER, output_without_fr_name), "wt", encoding="utf-8") as output_without_fr_file, \
        open(os.path.join(OUTPUT_DATA_FOLDER, output_without_fr_en_name), "wt", encoding="utf-8") as output_without_fr_en_file:

    print("Successfully openned")

    print("page_ids\t\tpage_titles\t\turl", file=output_without_fr_file)
    print("page_ids\t\tpage_titles\t\turl", file=output_without_fr_en_file)

    print("Looping on entries and insert data in ouput file")
    for line in page_file:
        pages_descriptions[key].append(line)
        if counter % MAX_BATCH == 0:
            _start_page_worker(key, output_without_fr_en_file, output_without_fr_file)
            key += 1
        counter += 1

    # The last batch is usually incomplete and has not been dispatched yet.
    _start_page_worker(key, output_without_fr_en_file, output_without_fr_file)

    print('wait ends of thread')
    # The workers write to the output files, so they must all finish before
    # the with-block closes those files.
    for worker in page_workers:
        worker.join()

print("Merging is end")
print("End of Processing")