In [1]:
# Libraries for scraping
from tqdm import tqdm as tqdm
import pandas as pd
import bs4 as bs
import requests
import time
import re
import os

In [2]:
class BTScraper:
    def __init__(self, page_from, page_to):
        self.page_from = page_from
        self.page_to = page_to
        
        self.base_url = 'https://bilimteknik.tubitak.gov.tr/arsiv/yillara-gore-arama?field_yil_ay_value%5Bvalue%5D%5Byear%5D=&page='
        self.page_urls = [self.base_url + str(i) for i in range(self.page_from, self.page_to)]
        self.issue_urls = []
        self.pdf_urls = []
        
        self.existing_files = []
        self.downloaded_files = []
        
        self.page_errors = []
        self.issue_errors = []
        self.download_errors = []
        
    ## Visits each page with number between page_from - page_to
    ## Finds issue urls in html eg. https://bilimteknik.tubitak.gov.tr/pdf/kasim-1968
    def get_issue_url(self):
        
        print('Extracting issue urls')
        time.sleep(1)
        
        for url in tqdm(self.page_urls):
            try:
                response = requests.get(url)
                soup = bs.BeautifulSoup(response.content, 'html.parser')
                divs = soup.find_all('div', class_='span4')
            
                adress_list = [div.a['href'] for div in divs if div.a != None]
            
                # Getting the links related with only pdf files
                match_list = [re.match(r'/pdf/', ref).string for ref in adress_list if re.match(r'/pdf/', ref) != None]
                issue_urls_temp = ['https://bilimteknik.tubitak.gov.tr' + match for match in match_list]
                
                self.issue_urls.extend(issue_urls_temp)
            except:
                self.page_errors.append(url)
                continue
            
            if len(self.page_errors) > 0:
                for error in self.page_errors:
                    self.page_urls.remove(error)
            
            time.sleep(0.01)
    
    ## Finds pdf urls in 
    def get_pdf_url(self):
        print('Extracting pdf urls')
        
        time.sleep(1)
        for issue in tqdm(self.issue_urls):
            try:
                response = requests.get(issue)
                soup = bs.BeautifulSoup(response.content, 'html.parser')
                main = soup.find(id='block-system-main')
                content = main.find_all('span', class_='field-content')
                
                self.pdf_urls.append(content[0].a.a['href'])
            except:
                self.issue_errors.append(issue)
                continue
        
        if len(self.issue_errors) > 0:    
            for error in self.issue_errors:
                self.issue_urls.remove(error)
            
            time.sleep(0.01)
    
    ## Creates a pandas DataFrame from url, year, month and filename data
    def create_data(self):
        for url in self.issue_urls:
            year_list = [re.findall('\d{4}', url)[0] for url in self.issue_urls]
            month_list = [re.findall('ocak|subat|mart|nisan|mayis|haziran|temmuz|agustos|eylul|ekim|kasim|aralik', url)[0] for url in self.issue_urls]
            path_list = ['pdf_files/' + year + '/' for year in year_list]
            filename_list = [month + '-' + year + '.pdf' for month, year in zip(month_list, year_list)]

        # Creating a DataFrame to keep it organized
        data_dict = {'year':year_list, 'month':month_list, 'path':path_list, 'filename':filename_list, 'url':self.pdf_urls}
        data = pd.DataFrame(data_dict)
        data.to_csv('url-data.txt', index=False) # Saving
        
        return data
    
    ## Downloads the pdfs and organizes them by year
    def download_pdf(self, data):
        print('Downloading')
        # pd.DataFrame.iterrows() does not work with tqdm
        # working around it
        row_list = []
        for row in data.iterrows():
            row_list.append(row)
            
        time.sleep(1)
        
        for row in tqdm(row_list):
        
            # Finding file and folder names from urls
            # to be organized
            filename = row[1][3]
            year = row[1][0]
            save_path = row[1][2]
            url = row[1][4]
            
            # Creating filepaths to save the pdfs
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
        
            save_file = save_path + filename
          
            if (os.path.isfile(save_file)):
                # Do nothing if file exists
                self.existing_files.append(filename)
            else:
                # Download if the file does not exist
                try:
                    response = requests.get(url.strip())
                    self.downloaded_files.append(filename)
                except:
                    self.download_errors.append(url)
                    continue
                    
                with open(save_file, 'wb') as f:
                    f.write(response.content)
            time.sleep(0.01)
        
        print(f'{len(self.existing_files)} files already exist.')
        print(f'{len(self.downloaded_files)} files downloaded.')
        print(f'{len(self.download_errors) + len(self.issue_errors) + len(self.page_errors)} errors occured while downloading.')
        

In [3]:
# the issues from year 1999 starts at page 31
# select 31-53 to download only the files currently used in the project (1999-2020)

s = BTScraper(31, 53) # select the page interval you want to download, 0-53 for all pdf files (1967-2020)
s.get_issue_url()
s.get_pdf_url()
data = s.create_data()

Extracting issue urls


100%|██████████| 22/22 [00:56<00:00,  2.56s/it]


Extracting pdf urls


100%|██████████| 259/259 [11:18<00:00,  2.62s/it]


In [4]:
data = pd.read_csv('url-data.txt')
data.head()

Unnamed: 0,year,month,path,filename,url
0,1998,kasim,pdf_files/1998/,kasim-1998.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
1,1998,aralik,pdf_files/1998/,aralik-1998.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
2,1999,ocak,pdf_files/1999/,ocak-1999.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
3,1999,subat,pdf_files/1999/,subat-1999.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
4,1999,mart,pdf_files/1999/,mart-1999.pdf,https://bilimteknik.tubitak.gov.tr/system/file...


In [5]:
data.tail()

Unnamed: 0,year,month,path,filename,url
254,2020,ocak,pdf_files/2020/,ocak-2020.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
255,2020,subat,pdf_files/2020/,subat-2020.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
256,2020,mart,pdf_files/2020/,mart-2020.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
257,2020,nisan,pdf_files/2020/,nisan-2020.pdf,https://bilimteknik.tubitak.gov.tr/system/file...
258,2020,mayis,pdf_files/2020/,mayis-2020.pdf,https://bilimteknik.tubitak.gov.tr/system/file...


In [6]:
s = BTScraper(31, 53)
s.download_pdf(data)

Downloading


100%|██████████| 259/259 [00:02<00:00, 92.88it/s]

259 files already exist.
0 files downloaded.
0 errors occured while downloading.



