In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import os
%matplotlib inline

# Notebook-wide plotting defaults: ggplot style sheet and a 12x5 default figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

In [163]:
# Directory with the raw input pages; get_description() opens data_dir + '/' + filename
# and os.listdir(data_dir) supplies the list of files to process.
data_dir = 'content'

In [164]:
# Hand-maintained lists of input files that get_description() must treat specially.
#
# `garb` appears to be the union of `empty`, `broken` and `not_unique` below
# (50 names = 28 + 21 + 1). No code visible in this notebook reads it.
garb = ['7725.dat', '14790.dat', '11173.dat', '9864.dat', '11767.dat', '11772.dat', '15058.dat',
        '26525.dat', '19349.dat', '10423.dat', '12234.dat', '20244.dat', '12431.dat', '24140.dat',
        '14518.dat', '7553.dat', '2629.dat', '6301.dat', '11510.dat', '27883.dat', '8710.dat',
        '26989.dat', '8841.dat', '10143.dat', '21542.dat', '6323.dat', '1115.dat', '20266.dat',
        '14060.dat', '6279.dat', '27040.dat', '8862.dat', '26202.dat', '24173.dat', '22728.dat',
        '13655.dat', '453.dat', '8861.dat', '13222.dat', '3952.dat', '11408.dat', '16953.dat', '14229.dat',
        '13787.dat', '4579.dat', '3377.dat', '1210.dat', '22011.dat', '6972.dat', '2268.dat']

# Files for which get_description() writes an empty line instead of parsing the
# matched description tag.
empty = ['22728.dat', '11772.dat', '14790.dat', '11173.dat', '26525.dat', '12234.dat', '12431.dat', 
        '20244.dat', '14518.dat', '7553.dat', '2629.dat', '8710.dat', '1115.dat', '14060.dat', '6279.dat', 
        '27040.dat', '8862.dat' ,'26202.dat', '24173.dat', '13655.dat', '13222.dat', '11408.dat', '14229.dat', 
         '13787.dat', '4579.dat','22011.dat', '6972.dat', '2268.dat']

# Files for which get_description() does not look for a quoted content="..." value:
# it keeps everything that follows the word 'content' in the first matched tag.
broken = ['7725.dat', '9864.dat','11767.dat', '15058.dat' ,'19349.dat' ,'10423.dat' ,'24140.dat' ,
        '11510.dat','27883.dat' ,'8841.dat','10143.dat','21542.dat','6323.dat','20266.dat','453.dat',
        '8861.dat', '3377.dat','1210.dat','3952.dat','26989.dat','6301.dat']

# Files for which get_description() uses the second matched description tag
# instead of the first one.
not_unique = ['16953.dat']

In [165]:
# Ordered (pattern, replacement) rules applied to the lower-cased page text.
# The order is significant: every rule sees the output of the previous one.
# NOTE(review): the domain-suffix rules run before the URL rules, so a URL such
# as "http://site.ru/path" is cut at ".ru" and its path survives the URL rule.
# The order is kept as-is to avoid changing already generated descriptions.
_CLEANUP_RULES = [(re.compile(_pattern), _repl) for _pattern, _repl in [
    # Collapse whitespace runs and drop named HTML entities such as &quot;.
    (r'\s+', ' '),
    (r'\&[a-zA-Z]{1,8}\;', ' '),
    # Strip a fixed set of inline formatting tags.
    (r'<[ ]*br[ ]*>', ''),
    (r'<[ ]*br[ ]*/[ ]*>', ''),
    (r'<em>', ''),
    (r'<\/em>', ''),
    (r'<samp>', ''),
    (r'<\/div>', ''),
    (r'<[ ]*small[ ]*>', ''),
    (r'<[ ]*strong[ ]*>', ''),
    (r'<[ ]*\/small[ ]*>', ''),
    (r'<[ ]*\/strong[ ]*>', ''),
    (r'<[ ]*p[ ]*>', ''),
    (r'<[ ]*/p[ ]*>', ''),
    (r'<[ ]*b[ ]*>', ''),
    (r'<[ ]*\/b[ ]*>', ''),
    (r'<[ ]*i[ ]*>', ''),
    (r'<[ ]*\/i[ ]*>', ''),
    (r'<[ ]*u[ ]*>', ''),
    (r'<[ ]*\/u[ ]*>', ''),
    # Forum markup leftovers.
    (r'\[\/img\]', ' '),
    (r'\[attach\]', ' '),
    # Domain suffixes.
    (r'\.ru\b', ' '),
    (r'\.com\b', ' '),
    (r'\.ua\b', ' '),
    (r'\.ру\b', ' '),
    # Stand-alone "lt" / "gt" tokens.
    (r'\s+lt\s+', ' '),
    (r'\s+gt\s+', ' '),
    # URLs.
    (r'http\:\/\/[\w\-&\./?=\+;@#%]*', ' '),
    (r'https\:\/\/[\w\-&\./?=\+;@#%]*', ' '),
    (r'ftp\:\/\/[\w\-&\./?=\+;@#%]*', ' '),
    (r'www\.[\w\-&\./?=\+;@#%]*', ' '),
]]

# Matches a <meta ... name="description" ...> tag in the cleaned page text.
_META_DESCRIPTION_RE = re.compile(r'<meta [^><]*name[^><]*=[^><]*"description[^>]*>')

# Quoted attribute forms searched inside the tag, in priority order.
# 'pluginent' is matched exactly as the original code did; presumably it is a
# mangled spelling of 'content' that occurs in the data - TODO confirm.
_CONTENT_ATTR_PATTERNS = (
    r'content[ ]*=[ ]*["][^"]*["]',
    r'content[ ]*=[ ]*[\'][^\']*[\']',
    r'pluginent[ ]*=[ ]*["][^"]*["]',
    r'pluginent[ ]*=[ ]*[\'][^\']*[\']',
)


def _clean_page_text(text):
    """Lower-case ``text`` and apply every rule of ``_CLEANUP_RULES`` in order."""
    text = text.lower()
    for regex, replacement in _CLEANUP_RULES:
        text = regex.sub(replacement, text)
    return text


def _description_from_tags(meta_tags, filename):
    """Return the text to write for ``filename``, or None if no tag was matched."""
    if not meta_tags:
        return None

    if filename in broken:
        # Keep everything after the word 'content' (up to a literal '$', if any).
        tails = re.findall(r'content[^$]*', meta_tags[0])
        return re.sub('content', '', tails[0]) if tails else ''

    if filename in empty:
        return ''

    # For `not_unique` files the second tag is the wanted one; fall back to the
    # first tag instead of raising IndexError when only one tag was matched.
    use_second = filename in not_unique and len(meta_tags) > 1
    tag = meta_tags[1] if use_second else meta_tags[0]

    candidates = [re.findall(pattern, tag) for pattern in _CONTENT_ATTR_PATTERNS]
    # max() returns the first of equally long lists, like np.argmax did.
    best = max(candidates, key=len)
    if not best:
        # The tag has no quoted content attribute: write an empty description
        # instead of failing with IndexError.
        return ''

    value = re.sub('content[ ]*=', '', best[0])
    value = re.sub('pluginent[ ]*=', '', value)
    return value.strip()


def get_description(filename, out_dir='description'):
    """Extract the meta description of one page and write it to a text file.

    The page ``data_dir + '/' + filename`` is read, lower-cased and cleaned with
    ``_CLEANUP_RULES``; the first ``<meta name="description" ...>`` tag is then
    located and its content value is written (surrounding quotes included) to
    ``<out_dir>/<filename>_desc.txt``. Files listed in the module-level lists
    ``broken``, ``empty`` and ``not_unique`` are handled specially.

    If no description tag is found the output file is still created but left
    completely empty (no newline), as before.

    Parameters
    ----------
    filename : str
        Name of a file inside ``data_dir``. ``'.DS_Store'`` is ignored.
    out_dir : str, optional
        Directory for the output file; created if it does not exist.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the input file cannot be read or the output file cannot be written.
    """
    if filename == '.DS_Store':
        return

    # Files are read and written with the platform default encoding, as before.
    with open(data_dir + '/' + filename) as f:
        page = _clean_page_text(f.read())

    description = _description_from_tags(_META_DESCRIPTION_RE.findall(page), filename)

    # exist_ok makes this safe when several worker threads get here at once.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, '{}_desc.txt'.format(filename)), 'w') as out:
        if description is not None:
            print(description, file=out)

In [166]:
# Every entry of data_dir, in the order os.listdir() reports them.
filenames = os.listdir(data_dir)

## Последовательная обработка

In [116]:
# for filename in tqdm(filenames):
#     get_description(filename)

100%|██████████| 28026/28026 [13:28<00:00, 34.67it/s] 


In [117]:
# print(cnt)

21482


## Параллельная обработка

In [167]:
from multiprocessing.dummy import Pool, Queue
import os 
data_dir = 'content' 

# Shared work queue: one entry per input file, consumed by the worker threads.
queue = Queue()
for filename in filenames:
    queue.put(filename)
    
# Displayed as the cell output; False confirms that the queue was filled.
queue.empty()

False

In [168]:
def process_page_wrapper(i):
    """Worker loop: take file names from the shared ``queue`` until it is drained.

    Each file name is passed to ``get_description`` and the shared progress bar
    ``pbar`` is advanced by one under ``lock``.

    Parameters
    ----------
    i : int
        Worker index supplied by ``pool.map``; not used.

    Returns
    -------
    None
    """
    from queue import Empty  # local import: the global name `queue` is the work queue

    while True:
        # A non-blocking get avoids the check-then-act race of
        # `while not queue.empty(): queue.get()`: if another worker takes the
        # last item between the check and the get, a blocking get never returns.
        try:
            filename = queue.get_nowait()
        except Empty:
            break
        get_description(filename)

        with lock:
            pbar.update(1)
                
# Number of worker threads (multiprocessing.dummy.Pool is thread-based).
# Kept in a variable so the task count below does not have to read the
# private attribute pool._processes.
n_workers = 4

# Start one process_page_wrapper loop per worker; each loop pulls file names
# from the shared queue until it is empty. The progress bar total is the number
# of queued files, and its lock serialises updates coming from the workers.
# In the recorded runs this took 20:03 versus 13:28 for the sequential loop.
with Pool(processes=n_workers) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(n_workers))

pool.join()

100%|██████████| 28026/28026 [20:03<00:00, 23.29it/s] 


## Идеи 

In [None]:
'''
- &quot; 3434
- &nbsp; 65
- &rarr; 1787
- &amp - 5480
- '[/img]' -> ' ' 900
- ссылки "http://scg.civfanatics.ru/sav//shogun-start-small.jpg - 2716 
- .ру .ua .ru  70
- [архив] 399
- [attach] 6163
- &laquo;  253 
- lt gt

'meta name="keywords"'

- h1
- вычленить хоть какой-то текст страницы для документов, где нет ни title, ни остального.
    Можно же [а-яА-Я]* ТОЛЬКО где пусто
    
'''

In [127]:
# Count the pages whose lower-cased text contains the substring 'h1'.
# NOTE(review): this is a plain substring test, so it also counts pages where
# 'h1' occurs outside an <h1> tag.
cnt = 0
for filename in tqdm(filenames):
    if filename == '.DS_Store':
        # Skip the metadata entry only; `break` here would stop the scan and
        # silently ignore every file listed after it.
        continue
    with open(data_dir + '/' + filename) as f:
        dt = f.read().lower()
    if 'h1' in dt:
        cnt += 1

100%|██████████| 28026/28026 [02:43<00:00, 171.56it/s]
