### Import


In [4]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import pandas as pd

In [2]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/981.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=ef3df2c26dd7a6d0df66e933e446ea454439be5528db227e46fd7763366b2ca4
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711

### URL

In [5]:
# base URL for the Trustpilot page

base_url = "https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency"
#base_url = "https://www.trustpilot.com/review/www.hotelsone.com"
#base_url = "https://www.trustpilot.com/review/www.galahotels.com"


# headers to mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

output_file = "./scrapped_reviews/trustpilot_reviews_1.csv"
#output_file = "./scrapped_reviews/trustpilot_reviews_2.csv"
#output_file = "./scrapped_reviews/trustpilot_reviews_3.csv"

### Functions

In [6]:
# function to scrape a single page
def scrape_page(url, page_num):
    print(f"Scraping page {page_num}: {url}")
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to load page {page_num}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # find all review containers
    reviews = soup.find_all("article", class_="paper_paper__1PY90")  # Update class name if structure changes

    # store extracted reviews
    scraped_reviews = []

    for review in reviews:
        # Extract review title
        title_tag = review.find("h2", attrs={"data-service-review-title-typography": "true"})
        title = title_tag.text.strip() if title_tag else "No title"

        # Extract review body
        body_tag = review.find("p", attrs={"data-service-review-text-typography": "true"})
        body = body_tag.text.strip() if body_tag else "No body text"

        # Extract review date
        date_tag = review.find("p", attrs={"data-service-review-date-of-experience-typography": "true"})
        date = date_tag.text.strip() if date_tag else "No date"

        # Append the review data
        scraped_reviews.append({
            "title": title,
            "body": body,
            "date": date
        })

    return scraped_reviews



# Main function to scrape multiple pages
def scrape_multiple_pages(base_url, max_pages):
    all_reviews = []

    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["title", "body", "date"])
        writer.writeheader()

        for page_num in range(1, max_pages + 1):
            page_url = f"{base_url}?page={page_num}"
            reviews = scrape_page(page_url, page_num)

            all_reviews.extend(reviews)

            # write to CSV after every page
            writer.writerows(reviews)

            if not reviews:
                print(f"No more reviews found. Stopping at page {page_num}.")
                break

            time.sleep(2)

    print(f"Scraping complete. {len(all_reviews)} reviews saved to {output_file}.")
    return all_reviews

# Scrape up to 250 pages
max_pages = 250
scraped_reviews = scrape_multiple_pages(base_url, max_pages)


Scraping page 1: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=1
Scraping page 2: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=2
Scraping page 3: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=3
Scraping page 4: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=4
Scraping page 5: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=5
Scraping page 6: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=6
Scraping page 7: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=7
Scraping page 8: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=8
Scraping page 9: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=9
Scraping page 10: https://www.trustpilot.com/review/www.hrs.de?languages=all&sort=recency?page=10
Scraping page 11: https://www.trustpil

### RSE multilanguage

In [7]:
import csv
from langdetect import detect

# Define RSE related keywords for multiple languages
keywords = {
    "en": {
        "positive": ["eco-friendly", "sustainable", "green", "energy", "fair", "wages", "renewable", "recycling", "ethical", "carbon", "neutral", "clean", "organic", "friendly", "low", "emissions", "initiatives", "diversity", "community", "local", "efficiency", "responsible"],
        "negative": ["greenwashing", "unethical", "unsustainable", "wasteful", "exploitation", "pollution", "bad", "conditions", "high", "emissions", "deforestation", "toxic", "violations", "child", "labor", "lack", "transparency", "damage"]
    },
    "fr": {
        "positive": ["éco", "responsable", "durable", "énergie", "verte", "salaires", "équitables", "renouvelable", "recyclage", "éthique", "carbone", "neutre", "propre", "bio", "respectueux", "environnement", "basses", "émissions", "initiatives", "diversité", "communauté", "locale", "efficacité", "tourisme"],
        "negative": ["écoblanchiment", "non éthique", "durable", "gaspillage", "exploitation", "pollution", "mauvaises", "conditions", "élevées", "émissions", "déforestation", "toxique", "violations", "droits", "enfants", "manque", "transparence", "dégradations"]
    },
    "es": {
        "positive": ["ecológico", "sostenible", "energía", "verde", "salarios", "justos", "renovable", "reciclaje", "ético", "carbono", "neutral", "limpia", "orgánico", "respetuoso", "medio", "ambiente", "bajas", "emisiones", "iniciativas", "diversidad", "comunidad", "local", "eficiencia", "turismo"],
        "negative": ["lavado", "verde", "no ético", "sostenible", "desperdicio", "explotación", "contaminación", "malas", "condiciones", "altas", "emisiones", "deforestación", "tóxico", "violaciones", "derechos", "infantil", "falta", "transparencia", "daños"]
    },
    "de": {
        "positive": ["umwelt", "freundlich", "nachhaltig", "grün", "energie", "faire", "löhne", "erneuerbar", "recycling", "ethisch", "klima", "neutral", "sauber", "bio", "freundlich", "niedrige", "emissionen", "initiativen", "vielfalt", "gemeinschaft", "lokal", "effizienz", "tourismus"],
        "negative": ["greenwashing", "unethisch",  "nachhaltig", "verschwendung", "ausbeutung", "verschmutzung", "schlechte", "bedingungen", "hohe", "emissionen", "abholzung", "giftig", "menschen", "rechte", "kinderarbeit", "mangel", "transparenz", "schäden"]
    },
    "pt": {
        "positive": ["ecológico", "sustentável", "energia", "verde", "salários", "justos", "renovável", "reciclagem", "ético", "carbono", "neutro", "limpa", "orgânico", "amigável", "meio", "ambiente", "baixas", "emissões", "iniciativas", "diversidade", "comunidade", "local", "eficiência", "turismo"],
        "negative": ["greenwashing", "antiético", "sustentável", "desperdício", "exploração", "poluição", "más", "condições", "altas", "emissões", "desmatamento", "tóxico", "violaciones", "direitos", "infantil", "falta", "transparência", "danos"]
    },
    "it": {
        "positive": ["ecologico", "sostenibile", "energia", "verde", "salari", "equi", "rinnovabile", "riciclaggio", "etico", "carbonio", "neutro", "pulita", "bio", "rispettoso", "ambiente", "basse", "emissioni", "iniziative", "diversità", "comunità", "locale", "efficienza", "turismo"],
        "negative": ["greenwashing", "non etico", "sostenibile", "spreco", "sfruttamento", "inquinamento", "cattive", "condizioni", "alte", "emissioni", "deforestazione", "tossico", "violazioni", "diritti", "minori", "mancanza", "trasparenza", "danni"]
    }
}


input_file = "./scrapped_reviews/trustpilot_reviews_1.csv"  # Input CSV file with scraped reviews
output_file = "./test_classification/trustpilot_rse_classified_reviews_multilingual_1.csv"  # Output CSV file with classifications

# Function to classify reviews based on RSE keywords
def classify_review(text, language):
    text_lower = text.lower()  # Convert to lowercase for case insensitive matching
    lang_keywords = keywords.get(language, None)  # Get keywords for the detected language

    if lang_keywords:
        if any(keyword in text_lower for keyword in lang_keywords["positive"]):
            return "Positive"

        if any(keyword in text_lower for keyword in lang_keywords["negative"]):
            return "Negative"

    return "Neutral"

# Read reviews from the input file and classify them
classified_reviews = []

with open(input_file, mode="r", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        review_body = row["body"]  # The review text column

        # Detect the language of the review
        try:
            language = detect(review_body)
        except:
            language = "unknown"

        # Classify the review based on the detected language
        classification = classify_review(review_body, language)
        row["language"] = language  # Add detected language to the row
        row["classification"] = classification  # Add classification to the row
        classified_reviews.append(row)

# Write the classified reviews to a new CSV file
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    fieldnames = ["title", "body", "date", "language", "classification"]  # Include language and classification columns
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(classified_reviews)

print(f"Multilingual RSE classification complete. Results saved to {output_file}.")


Multilingual RSE classification complete. Results saved to ./test_classification/trustpilot_rse_classified_reviews_multilingual_1.csv.


In [9]:
df = pd.read_csv("./test_classification/trustpilot_rse_classified_reviews_multilingual_1.csv")
df1 = df[df["classification"]=="Negative"]
df1["body"]

5       I found this site using ChatGPT (I've tried to...
14      Hotel didn't guarantee my reservation.  They d...
25      I found this site using ChatGPT (I've tried to...
34      Hotel didn't guarantee my reservation.  They d...
45      I found this site using ChatGPT (I've tried to...
                              ...                        
4954    Hotel didn't guarantee my reservation.  They d...
4965    I found this site using ChatGPT (I've tried to...
4974    Hotel didn't guarantee my reservation.  They d...
4985    I found this site using ChatGPT (I've tried to...
4994    Hotel didn't guarantee my reservation.  They d...
Name: body, Length: 500, dtype: object

In [11]:
df = pd.read_csv("./test_classification/trustpilot_rse_classified_reviews_multilingual_1.csv")
df1 = df[df["classification"]=="Positive"]
df1["body"]

3       This was a great hotel. very comfortable and p...
4       Friendly staff, good location, clean and spaci...
12      Initially changed to HRS to support a local bu...
16      This site has often special offers for high qu...
23      This was a great hotel. very comfortable and p...
                              ...                        
4976    This site has often special offers for high qu...
4983    This was a great hotel. very comfortable and p...
4984    Friendly staff, good location, clean and spaci...
4992    Initially changed to HRS to support a local bu...
4996    This site has often special offers for high qu...
Name: body, Length: 1000, dtype: object