## Szó keresgélő alkalmazás Mamának

A program letölti a szavakat a Wikiszótárról (https://hu.wiktionary.org/wiki/Index:Magyar), és hosszúság, valamint kezdőbetű szerint CSV-fájlokba rendezi őket.

In [33]:
# loading the modules:
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import os.path
from os import remove
import csv

In [2]:
# Download the HTML of the Hungarian word index page for the letter 'a'.
url = 'https://hu.wiktionary.org/wiki/Index:Magyar/a'
html_file = "a_html.txt"
data_dir = "words/html"

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(data_dir, exist_ok=True)

# Store the page in the target file. As the last expression of the cell, the
# (filename, headers) tuple returned by urlretrieve is displayed.
urllib.request.urlretrieve(url, os.path.join(data_dir, html_file))

('words/html/a_html.txt', <http.client.HTTPMessage at 0x7fc0148ebd60>)

In [5]:
# Show the downloaded page. The file is decoded as UTF-8 (the charset the page
# declares) instead of the platform default, and the context manager closes
# the handle as soon as the content has been read.
with open(os.path.join(data_dir, html_file), encoding='utf-8') as page:
    print(page.read())


<!DOCTYPE html>
<html class="client-nojs" lang="hu" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Index:Magyar/a – Wikiszótár</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":[",\t."," \t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"ymd","wgMonthNames":["","január","február","március","április","május","június","július","augusztus","szeptember","október","november","december"],"wgRequestId":"XpaZBgpAMM4AAKiwbPYAAAAW","wgCSPNonce":!1,"wgCanonicalNamespace":"Index","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":102,"wgPageName":"Index:Magyar/a","wgTitle":"Magyar/a","wgCurRevisionId":2350014,"wgRevisionId":2350014,"wgArticleId":114871,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Magyar szavak listája"],"wgPageContentLanguage":"hu","wgPageContentModel":"wikitext","wgRelevantPageName":"Index:Magyar/a","wgRelevantArticleId":114871,"wgIsPro

In [34]:
def get_words_from_link(url, html_dir='words/html', html_file='html.txt'):
    """Download the html of a wiktionary page and extract the words from it.

    The page is stored temporarily as html_dir/html_file, parsed, and the
    temporary file is removed again before the function returns.

    Inputs:
        url: address of the page to download.
        html_dir: directory of the temporary html file (created if missing).
        html_file: name of the temporary html file.
    Returns:
        list of strings: the text of every <a> tag located inside a <p> tag,
        in the order they are found.
    """
    html_path = os.path.join(html_dir, html_file)

    # creating the directory if it does not exist:
    os.makedirs(html_dir, exist_ok=True)

    # reading the html of the site and storing it in the target file:
    urllib.request.urlretrieve(url, html_path)

    try:
        # The file is read as bytes so that BeautifulSoup detects the page
        # encoding itself instead of relying on the platform default.
        # Naming the parser keeps the result independent of which optional
        # parsers happen to be installed and avoids the "no parser was
        # explicitly specified" warning.
        with open(html_path, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
    finally:
        # deleting the temporary file, even when parsing failed:
        os.remove(html_path)

    # collecting the text of the links inside the paragraphs (the words):
    word_list = [
        a_tag.text
        for p_tag in soup.find_all('p')
        for a_tag in p_tag.find_all('a')
    ]

    return word_list

In [36]:
# Where the temporary html copy goes, and which index page to fetch.
html_dir = "words/html"
html_file = "a_html.txt"
url = 'https://hu.wiktionary.org/wiki/Index:Magyar/a'

# Download the page and keep the extracted words for the following cells.
word_list = get_words_from_link(url, html_dir, html_file)

In [37]:
# Spot check: display the last ten extracted words.
word_list[-10:]

['azt',
 'aztán',
 'áztat',
 'áztató',
 'azték',
 'azúr',
 'azurit',
 'azúrkék',
 'azután',
 'azzal']

In [46]:
def write_words_to_files(word_list, data_dir='words', letter='a', lengths=[]):
    """Save the words of word_list into csv files according to their lengths.

    For every requested length a directory data_dir/<length> is created (if it
    does not exist yet) and the words with exactly that many characters are
    written as a single comma separated row into <letter>.csv inside it.
    An existing file with the same name is overwritten.

    Inputs:
        word_list: list of strings.
        data_dir: directory where the folders for the different lengths are
            going to be created.
        letter: initial letter of the words in word_list (also the name of
            the csv files).
        lengths: specified lengths of words which need to be saved. The
            default list is only read, never modified.
    Returns:
        None. A confirmation message is printed when all files are written.
    """
    file_name = letter + '.csv'

    for length in lengths:
        # current directory to save words with length:
        current_dir = os.path.join(data_dir, str(length))
        os.makedirs(current_dir, exist_ok=True)

        # extracting the words with length:
        words_of_length = [word for word in word_list if len(word) == length]

        # writing the csv file:
        # newline='' lets the csv module control the line endings (otherwise
        # rows end in '\r\r\n' on Windows), and the explicit utf-8 encoding
        # keeps accented letters intact regardless of the platform locale.
        csv_path = os.path.join(current_dir, file_name)
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(words_of_length)

    print('Writting the words with \'{}\' and lengths {} is done.'.format(letter, lengths))
 

In [47]:
# Save the 2 and 3 character long words of word_list as words/2/a.csv and words/3/a.csv.
write_words_to_files(word_list=word_list, data_dir='words', letter='a', lengths=[2, 3])

Writting the words with 'a' and lengths [2, 3] is done.
