# TORI WEB SCRAPER AND DATA ANALYZER V1.0

This notebook scrapes the price of every GPU listing on tori.fi and then analyzes the collected data.

## 1. Simple title and price scraping method

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import os

# Search-results URL for graphics-card listings. It ends with an open `o=`
# query parameter; the page number is appended to it when a page is requested.
base_url = 'https://www.tori.fi/uusimaa/tietokoneet_ja_lisalaitteet/komponentit?ca=18&cg=5030&c=5038&ps=1&st=s&st=k&st=u&st=h&st=g&com=graphic_card&w=1&o='
# CSV file that the scraped rows are appended to.
filename = 'tori_fi_GPUs_10pg.csv'

def scrape_description(url, timeout=10):
    """Fetch one listing page and return its description text.

    Args:
        url: Absolute URL of the listing page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the first element carrying
        ``itemprop="description"``, or ``'No description'`` when that element
        is missing, the request fails, or the response status is not 200.
    """
    try:
        # Without a timeout, requests will wait indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable listing must not abort the whole scrape.
        print(f"Failed to retrieve page {url}: {exc}")
        return 'No description'

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}: {response.status_code}")
        return 'No description'

    soup = BeautifulSoup(response.text, 'html.parser')
    description_tag = soup.find(itemprop='description')
    return description_tag.text.strip() if description_tag else 'No description'

def scrape_page(url, timeout=10):
    """Scrape one search-results page into (title, price, description) rows.

    Titles, prices and listing links are collected independently and then
    paired by position, so each row is only correct when the page yields the
    same number of each. A warning is printed when the counts differ.

    Args:
        url: URL of the search-results page.
        timeout: Seconds to wait for the results page before giving up.

    Returns:
        A list of ``(title, price, description)`` tuples, or an empty list
        when the page cannot be retrieved.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    try:
        # Without a timeout, requests will wait indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    titles = soup.find_all('div', class_='li-title')
    prices = soup.find_all('p', class_='list_price')
    links = [a['href'] for a in soup.find_all('a', href=True) if '/ilmoitus/' in a['href']]

    # zip() stops at the shortest list, so a count mismatch silently drops
    # or misaligns rows; make that visible.
    if not (len(titles) == len(prices) == len(links)):
        print(
            f"Warning: found {len(titles)} titles, {len(prices)} prices and "
            f"{len(links)} links on {url}; rows may be dropped or misaligned"
        )

    data = []
    for title, price, link in zip(titles, prices, links):
        # urljoin handles both site-relative and already-absolute hrefs,
        # where plain concatenation would produce a broken URL for the latter.
        description = scrape_description(urljoin('https://www.tori.fi', link))
        data.append((title.text.strip(), price.text.strip(), description))

    return data

# Check for the file before open() creates it, so the header is written once.
file_exists = os.path.isfile(filename)

with open(filename, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Only a freshly created CSV needs the column header.
    if not file_exists:
        writer.writerow(['Title', 'Price', 'Description'])

    # Scrape result pages 2 through 10, appending each page's rows in turn.
    for page_number in range(2, 11):
        print(f"Scraping page {page_number}")
        url = f"{base_url}{page_number}"
        writer.writerows(scrape_page(url))


Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10


In [4]:
import requests
from bs4 import BeautifulSoup
import csv
import os

# Search-results URL for graphics-card listings. It ends with an open `o=`
# query parameter; the page number is appended to it when a page is requested.
base_url = 'https://www.tori.fi/uusimaa/tietokoneet_ja_lisalaitteet/komponentit?ca=18&cg=5030&c=5038&ps=1&st=s&st=k&st=u&st=h&st=g&com=graphic_card&w=1&o='
# Output CSV; the name reflects that this cell scrapes only the first page.
filename = 'tori_fi_GPUs_1stPg.csv'

def scrape_description(url, timeout=10):
    """Fetch one listing page and return its description text, logging progress.

    Args:
        url: Absolute URL of the listing page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the first element carrying
        ``itemprop="description"``, or ``'No description'`` when that element
        is missing, the request fails, or the response status is not 200.
    """
    print(f"Scraping Description from {url}")
    try:
        # Without a timeout, requests will wait indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable listing must not abort the whole scrape.
        print(f"Failed to retrieve description page {url}: {exc}")
        return 'No description'

    if response.status_code != 200:
        print(f"Failed to retrieve description page {url}: {response.status_code}")
        return 'No description'

    soup = BeautifulSoup(response.text, 'html.parser')
    description_tag = soup.find(itemprop='description')
    if not description_tag:
        print(f"Description not found for {url}")
        return 'No description'
    return description_tag.text.strip()

def scrape_page(url, timeout=10):
    """Scrape one search-results page into (title, price, description) rows.

    Titles, prices and listing links are collected independently and then
    paired by position, so each row is only correct when the page yields the
    same number of each. Diagnostics are printed when any of the three is
    missing or when the counts differ.

    Args:
        url: URL of the search-results page.
        timeout: Seconds to wait for the results page before giving up.

    Returns:
        A list of ``(title, price, description)`` tuples, or an empty list
        when the page cannot be retrieved.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    print(f"Scraping Page: {url}")
    try:
        # Without a timeout, requests will wait indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    titles = soup.find_all('div', class_='li-title')
    if not titles:
        print(f"Titles not found on page {url}")

    prices = soup.find_all('p', class_='list_price')
    if not prices:
        print(f"Prices not found on page {url}")

    # NOTE(review): the recorded run of this cell printed "Links not found"
    # for page 1, so this '/ilmoitus/' filter may not match the site's listing
    # hrefs. Confirm the correct selector against the live markup.
    links = [a['href'] for a in soup.find_all('a', href=True) if '/ilmoitus/' in a['href']]
    if not links:
        print(f"Links not found on page {url}")

    # zip() stops at the shortest list, so a count mismatch silently drops
    # or misaligns rows; make that visible.
    if not (len(titles) == len(prices) == len(links)):
        print(
            f"Warning: found {len(titles)} titles, {len(prices)} prices and "
            f"{len(links)} links on {url}; rows may be dropped or misaligned"
        )

    data = []
    for title, price, link in zip(titles, prices, links):
        # urljoin handles both site-relative and already-absolute hrefs,
        # where plain concatenation would produce a broken URL for the latter.
        description = scrape_description(urljoin('https://www.tori.fi', link))
        data.append((title.text.strip(), price.text.strip(), description))

    return data

# Check for the file before open() creates it, so the header is written once.
file_exists = os.path.isfile(filename)

with open(filename, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Only a freshly created CSV needs the column header.
    if not file_exists:
        writer.writerow(['Title', 'Price', 'Description'])

    # This cell is limited to the first results page.
    print(f"Scraping page 1")
    url = f"{base_url}1"
    writer.writerows(scrape_page(url))


Scraping page 1
Scraping Page: https://www.tori.fi/uusimaa/tietokoneet_ja_lisalaitteet/komponentit?ca=18&cg=5030&c=5038&ps=1&st=s&st=k&st=u&st=h&st=g&com=graphic_card&w=1&o=1
Links not found on page https://www.tori.fi/uusimaa/tietokoneet_ja_lisalaitteet/komponentit?ca=18&cg=5030&c=5038&ps=1&st=s&st=k&st=u&st=h&st=g&com=graphic_card&w=1&o=1


## Description fetcher


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import time  # Importing the time module

# Search-results URL for graphics-card listings. Here it already ends in
# `o=1`, and it is passed to scrape_page() as-is.
base_url = 'https://www.tori.fi/uusimaa/tietokoneet_ja_lisalaitteet/komponentit?ca=18&cg=5030&c=5038&ps=1&st=s&st=k&st=u&st=h&st=g&com=graphic_card&w=1&o=1'
# CSV file that the fetched descriptions are appended to, one per row.
filename = 'tori_fi_GPUs_descriptions.csv'

def get_description(link, timeout=10):
    """Fetch one listing page and return its description text.

    Args:
        link: Absolute URL of the listing page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the first ``<div class="body"
        itemprop="description">`` element; ``'Description not found'`` when
        that element is missing; ``'Failed to retrieve description'`` when the
        request fails or the response status is not 200.
    """
    try:
        # Without a timeout, requests will wait indefinitely on a stalled server.
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable listing must not abort the whole scrape.
        print(f"Failed to retrieve description from {link}: {exc}")
        return 'Failed to retrieve description'

    if response.status_code != 200:
        print(f"Failed to retrieve description from {link}: {response.status_code}")
        return 'Failed to retrieve description'

    soup = BeautifulSoup(response.text, 'html.parser')
    description = soup.find('div', class_='body', itemprop='description')
    return description.text.strip() if description else 'Description not found'

def scrape_page(url, delay=3, timeout=10):
    """Fetch the description of every listing linked from a results page.

    Args:
        url: URL of the search-results page.
        delay: Seconds to pause between consecutive description fetches.
        timeout: Seconds to wait for the results page before giving up.

    Returns:
        A list with one description string per link matched by
        ``div.list_mode_thumb a[href]``, in page order, or an empty list when
        the results page cannot be retrieved.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    try:
        # Without a timeout, requests will wait indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.select('div.list_mode_thumb a[href]')]

    descriptions = []
    for index, link in enumerate(links):
        # Pause only between fetches; a sleep after the last one is wasted time.
        if index:
            time.sleep(delay)
        # urljoin handles both site-relative and already-absolute hrefs,
        # where plain concatenation would produce a broken URL for the latter.
        full_link = urljoin('https://www.tori.fi', link)
        descriptions.append(get_description(full_link))
    return descriptions

# Check for the file before open() creates it, so the header is written once.
file_exists = os.path.isfile(filename)

with open(filename, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Only a freshly created CSV needs the column header.
    if not file_exists:
        writer.writerow(['Description'])

    # Fetch every description from the results page, then write one per row.
    descriptions = scrape_page(base_url)
    writer.writerows([text] for text in descriptions)
