In [1]:
import os
import re
import urllib
import urllib.request

import patoolib
import tqdm

In [2]:
def download_hook(t):
    """Build a ``reporthook``-style callback that advances progress bar ``t``.

    Args:
        t: Progress object exposing a writable ``total`` attribute and an
            ``update(n)`` method (the notebook passes a ``tqdm`` instance).

    Returns:
        A function ``update_to(b, bsize, tsize)`` that, on each call, advances
        ``t`` by the number of units received since the previous call.
    """
    blocks_seen = 0  # block count reported by the previous callback invocation

    def update_to(b=1, bsize=1, tsize=None):
        """Advance the bar.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of one block.
            tsize: Total size; when given it becomes the bar's ``total``.
        """
        nonlocal blocks_seen
        if tsize is not None:
            t.total = tsize
        # The hook reports a running block count, while the bar wants an increment.
        new_blocks = b - blocks_seen
        t.update(new_blocks * bsize)
        blocks_seen = b

    return update_to

In [3]:
def maybe_download(url, filename, data_folder, filesize=43467035):
    """Download ``url + filename`` into ``data_folder`` unless a good copy exists.

    An existing local file is accepted when its size in bytes equals
    ``filesize``; otherwise it is deleted and downloaded again.

    Args:
        url: Base URL. ``filename`` is appended to it verbatim, so it should
            end with a slash.
        filename: Name of the remote file; also used as the local file name.
        data_folder: Local directory for the download. It is created,
            including missing parent directories, when absent.
        filesize: Expected size of the file in bytes. Pass ``None`` to skip
            the size check and accept any existing file.
    """

    def download_file(url, filename, save_path):
        """Fetch ``url + filename`` to ``save_path`` while showing a progress bar."""
        # Byte units with binary scaling make the bar readable for large files;
        # miniters=1 refreshes the bar on every hook callback.
        with tqdm.tqdm(unit="B", unit_scale=True, unit_divisor=1024,
                       miniters=1, desc=filename) as t:
            hook = download_hook(t)
            # Write to the path handed in by the caller instead of relying on
            # a variable captured from the enclosing scope.
            urllib.request.urlretrieve(url=url + filename, filename=save_path, reporthook=hook)

    file_save_path = os.path.join(data_folder, filename)

    # makedirs also handles nested folders and an already existing folder.
    os.makedirs(data_folder, exist_ok=True)

    if os.path.exists(file_save_path):
        print("File already exists. Checking data integrity...")
        if filesize is None or os.path.getsize(file_save_path) == filesize:
            print("File is OK.")
        else:
            print("File seems to be changed. Downloading again!")
            os.remove(file_save_path)
            download_file(url, filename, file_save_path)
    else:
        print("File couldn't been found. Downloading now!")
        download_file(url, filename, file_save_path)

In [4]:
# Base URL of the download; maybe_download() appends `filename` to it.
url = "http://www.kemik.yildiz.edu.tr/data/File/"
# Name of the remote archive, also used as the local file name.
filename = "42bin_haber.rar"
# Local folder that receives the downloaded archive and its extracted content.
data_folder = "raw_text_data"

In [5]:
# Fetch the archive unless a copy of the expected size is already on disk.
maybe_download(url=url, filename=filename, data_folder=data_folder)

File already exists. Checking data integrity...
File is OK.


In [23]:
# Unpack the downloaded archive into data_folder; the call's return value is
# the cell's displayed result.
patoolib.extract_archive(
    os.path.join(data_folder, filename),
    outdir=data_folder,
    verbosity=1,
)

patool: Extracting raw_text_data\42bin_haber.rar ...
patool: running "C:\Program Files\WinRAR\rar.EXE" x -- C:\Users\fatih.barmanbay\Documents\word2vec\raw_text_data\42bin_haber.rar
patool:     with cwd=raw_text_data
patool: ... raw_text_data\42bin_haber.rar extracted to `raw_text_data'.


'raw_text_data'

In [6]:
def clean_text(text):
    """Lowercase ``text`` and collapse every run of non-word characters to one space.

    Args:
        text: Raw text to normalize.

    Returns:
        The lowercased text with each run of characters matching ``\\W``
        replaced by a single space and surrounding whitespace removed.
    """
    # str.lower() turns U+0130 (capital I with dot above) into "i" followed by
    # the combining dot U+0307. That combining mark matches \W, so the regex
    # below would split the word ("İstanbul" -> "i stanbul"). Map it to a
    # plain "i" first.
    # NOTE(review): a plain "I" still lowercases to "i", not to the Turkish
    # dotless "ı" — confirm whether full Turkish casing is wanted.
    text = text.replace("\u0130", "i")
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text.strip()

In [7]:
# This takes some time to complete
# Read every file under <data_folder>/<archive name>/news/<sub_folder>/,
# clean it, and merge all cleaned texts into one space-separated string.
root_folder = os.path.join(data_folder, filename.partition('.rar')[0], 'news')
cleaned_texts = []
# os.listdir returns entries in arbitrary order; sorting keeps the
# concatenation order the same from run to run.
for sub_folder in sorted(os.listdir(root_folder)):
    sub_folder_path = os.path.join(root_folder, sub_folder)
    if not os.path.isdir(sub_folder_path):
        # Skip stray files sitting next to the sub-folders.
        continue
    for file_name in sorted(os.listdir(sub_folder_path)):
        # The context manager closes the file even when reading or cleaning fails.
        with open(os.path.join(sub_folder_path, file_name), 'r', encoding="utf-8") as f:
            cleaned_texts.append(clean_text(f.read()))
# A single join is linear in the corpus size; re-joining the growing string
# on every file copies the whole accumulated text each time.
all_text = " ".join(cleaned_texts).strip()

In [9]:
# Folder that receives the merged, cleaned corpus.
processed_file_path = "processed_text_data"
os.makedirs(processed_file_path, exist_ok=True)

# The source files are read as UTF-8, so write UTF-8 explicitly. Without the
# argument, open() uses the platform's locale encoding, which may be unable
# to represent some characters of the corpus.
with open(os.path.join(processed_file_path, "text"), 'w', encoding="utf-8") as f:
    f.write(all_text)