In [4]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
from time import sleep

In [5]:
# Common prefix of the paginated advanced-search result pages; a page number
# is appended to it to form each page URL.
base_url = 'https://www.coffeereview.com/advanced-search/page/'

In [6]:
# Number of search-result pages to visit (pages 1 through `pages`, inclusive).
# NOTE(review): hard-coded — presumably the page count on the site at scrape
# time; confirm before re-running.
pages = 127

In [7]:
# One URL per search-result page, numbered from 1 up to and including `pages`.
url_links = [f'{base_url}{page_number}' for page_number in range(1, pages + 1)]

# Preview a slice of the generated URLs as the cell output.
url_links[50:60]

['https://www.coffeereview.com/advanced-search/page/51',
 'https://www.coffeereview.com/advanced-search/page/52',
 'https://www.coffeereview.com/advanced-search/page/53',
 'https://www.coffeereview.com/advanced-search/page/54',
 'https://www.coffeereview.com/advanced-search/page/55',
 'https://www.coffeereview.com/advanced-search/page/56',
 'https://www.coffeereview.com/advanced-search/page/57',
 'https://www.coffeereview.com/advanced-search/page/58',
 'https://www.coffeereview.com/advanced-search/page/59',
 'https://www.coffeereview.com/advanced-search/page/60']

In [8]:
# Fetch the first search-result page without any custom headers.
# requests applies no timeout by default, so one is set explicitly to keep a
# stalled connection from hanging the kernel.
link = url_links[0]
response = requests.get(link, timeout=30)

In [9]:
# Show both the status code and the raw body. A notebook only displays the
# value of a cell's last expression, so the two are combined into one tuple
# instead of leaving the status code on a line of its own (where it would be
# evaluated and discarded).
response.status_code, response.content

b'<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body>\r\n<center><h1>403 Forbidden</h1></center>\r\n<hr><center>nginx</center>\r\n</body>\r\n</html>\r\n'

In [10]:
# Retry the same URL, this time sending an explicit User-agent header.
headers = {'User-agent': 'Chrome'}

# requests applies no timeout by default; set one so a stalled connection
# cannot hang the kernel.
response = requests.get(link, headers=headers, timeout=30)

# Display the status code as the cell output.
response.status_code

200

In [11]:
# Parse the fetched HTML using the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')
# Display the type of the parsed object as the cell output.
type(soup)

bs4.BeautifulSoup

In [12]:
# Show the text of the page's <title> element as a quick check of the parse.
soup.title.text

"Coffee Review - The World's Leading Coffee Guide"

In [13]:
# Count the <div class="entry-content"> elements on the parsed page.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
len(soup.find_all('div', {'class': 'entry-content'}))

21

In [14]:
# Visit every search-result page and collect the review links it lists.
headers = {'User-agent': 'Chrome'}
coffee_links = []

for link in tqdm(url_links):
    # requests applies no timeout by default; set one so a stalled connection
    # cannot hang the whole crawl.
    response = requests.get(link, headers=headers, timeout=30)

    # A blocked or failed page would otherwise be parsed as if it were a
    # result page and silently contribute zero links; report it instead.
    if response.status_code != 200:
        print(f'Server is down or blocked! (status {response.status_code}: {link})')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    boxes = soup.find_all('div', {'class': 'entry-content'})

    # The first 'entry-content' div on each page is skipped.
    # NOTE(review): presumably it is not a review entry — confirm against the page markup.
    for box in boxes[1:]:
        # find() returns None when a tag is missing; guard each step so one
        # malformed entry neither aborts the crawl nor appends None.
        roaster = box.find('p', {'class': 'review-roaster'})
        anchor = roaster.find('a') if roaster is not None else None
        href = anchor.get('href') if anchor is not None else None
        if href:
            coffee_links.append(href)

  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [02:53<00:00,  1.37s/it]


In [15]:
# Number of entries collected in `coffee_links`.
len(coffee_links)

2532

In [16]:
# Empty DataFrame that will hold the scraped reviews (one row per review).
df = pd.DataFrame()

In [17]:
# Scrape every review page listed in `coffee_links` and build `df` with one
# row per successfully fetched review.
#
# Each row holds the scrape date, roaster, coffee name, rating and link, plus
# one column per label found in the page's 'review-template-table' tables
# (consecutive <td> cells are read as label/value pairs).
#
# Rows are gathered in a plain list and converted to a DataFrame once, after
# the loop. This avoids the quadratic cost of calling pd.concat on every
# iteration and makes the cell idempotent: re-running it rebuilds `df` instead
# of appending duplicate rows to the previous result. If the cell is
# interrupted, the rows gathered so far are still available in `records`.
records = []

for link in tqdm(coffee_links):
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would hang this long-running loop indefinitely.
        response = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report the failing link and carry on rather than losing the run.
        print(f'Request failed for {link}: {exc}')
        sleep(1.2)
        continue

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # find() returns None when an element is missing; such fields are
        # stored as NaN instead of raising AttributeError on `.text`.
        rating_tag = soup.find('span', {'class': 'review-template-rating'})
        roaster_tag = soup.find('p', {'class': 'review-roaster'})
        title_tag = soup.find('h1', {'class': 'review-title'})

        record = {
            'date': datetime.now().date(),
            'review_roaster': roaster_tag.text if roaster_tag is not None else np.nan,
            'coffee': title_tag.text if title_tag is not None else np.nan,
            'rating': rating_tag.text if rating_tag is not None else np.nan,
            'link': link,
        }

        # Add the label/value pairs from the review tables. These are written
        # after the base fields, so a table label equal to a base key
        # overrides it, matching the previous dict.update() behaviour.
        for table in soup.find_all('table', {'class': 'review-template-table'}):
            cells = table.find_all('td')
            # Walk the cells two at a time: first is the label, second the
            # value. A trailing unpaired cell is ignored.
            for i in range(0, len(cells) - 1, 2):
                record[cells[i].text.strip()] = cells[i + 1].text.strip()

        records.append(record)
    else:
        # Say which link failed and with which status so it can be retried.
        print(f'Server is down or blocked! (status {response.status_code}: {link})')

    # Pause after every request, including failed ones, so a struggling or
    # blocking server is not retried at full speed.
    sleep(1.2)

# Build the DataFrame once from all collected rows; columns missing from a
# given review are filled with NaN.
df = pd.DataFrame(records)

  0%|          | 0/2532 [00:00<?, ?it/s]

100%|██████████| 2532/2532 [1:32:23<00:00,  2.19s/it]


In [19]:
# Destination of the exported CSV.
# NOTE(review): absolute, machine-specific path — this cell only works where
# that directory exists; consider a path relative to the notebook.
csv_file_path = 'C:/Users/dimit/OneDrive/Desktop/git_projects/Coffee-review/df_scrapped.csv'

# Save the DataFrame to a CSV file; index=False leaves the row index out.
df.to_csv(csv_file_path, index=False)

In [20]:
# Display the scraped DataFrame as the cell output.
df 

Unnamed: 0,date,review_roaster,coffee,rating,link,Roaster Location:,Coffee Origin:,Roast Level:,Agtron:,Est. Price:,Review Date:,Aroma:,Body:,Flavor:,Aftertaste:,With Milk:,Acidity/Structure:,Acidity:
0,2024-01-04,Utopian Coffee,Cold Brew,93,https://www.coffeereview.com/review/cold-brew-8/,"Fort Wayne, Indiana",Not disclosed,Light,64/85,$18.00/12 ounces,January 2024,8,9,9,8,9,,
1,2024-01-04,JBC Coffee Roasters,Nkanda #7 Burundi,92,https://www.coffeereview.com/review/nkanda-7-b...,"Madison, Wisconsin","Tangara, Ngozi, Burundi",Medium-Light,58/78,$21.00/12 ounces,January 2024,8,9,9,8,,8,
2,2024-01-04,Utopian Coffee,Pearl,91,https://www.coffeereview.com/review/pearl/,"Fort Wayne, Indiana",Not disclosed,Light,64/86,$18.00/12 ounces,January 2024,8,8,9,7,,9,
3,2024-01-04,Kafe Coffee Roastery,Peach-C Blend,94,https://www.coffeereview.com/review/peach-c-bl...,"Zhubei, Taiwan",Ethiopia; Colombia,Light,62/84,NT$ 366/8 ounces,January 2024,9,9,9,8,,9,
4,2024-01-04,Cafe Fugu Roasters,Kanazawa Light Roasted Espresso House Blend,92,https://www.coffeereview.com/review/kanazawa-l...,"Taipei City, Taiwan",Not disclosed,Light,65/86,NT$ 450/200 grams,January 2024,8,9,9,8,8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2527,2024-01-04,Café Grumpy,Honduras Las Flores Parainema,94,https://www.coffeereview.com/review/honduras-l...,"Brooklyn, New York","Santa Bárbara, Honduras",Medium-Light,56/80,$22.00/12 ounces,January 2019,9,9,9,8,,9,
2528,2024-01-04,JBC Coffee Roasters,Kivu DR Congo,94,https://www.coffeereview.com/review/kivu-dr-co...,"Madison, Wisconsin","Kalehe, South Kivu Province, Democratic Republ...",Medium-Light,56/78,$16.90/12 ounces,January 2019,9,9,9,8,,9,
2529,2024-01-04,Coava Coffee Roasters,Santa Luzia Brazil,93,https://www.coffeereview.com/review/santa-luzi...,"Portland, Oregon","Cerrado Mineiro growing region, Santa Luzia, B...",Medium-Light,54/78,$15.00/250 grams,January 2019,9,9,9,8,,8,
2530,2024-01-04,Coava Coffee Roasters,Porfirio Castellanos Honduras,93,https://www.coffeereview.com/review/porfirio-c...,"Portland, Oregon","Santa Bárbara, Honduras",Medium-Light,56/80,$15.00/250 grams,January 2019,9,8,9,8,,9,
