# 0_Scrape_Cartier

## 1. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

## 2. Scrape Cartier Comments

We decided to scrape the watchuseek.com forum because the subreddit doesn't have enough comments.

### 2.1. Scrape All Threads From All Pages (59 Pages)

In [2]:
# Site root (prefix for the relative thread links) and the Cartier sub-forum listing page
root_url, base_url = (
    'https://www.watchuseek.com',
    'https://www.watchuseek.com/forums/cartier.481/',
)

In [3]:
# Check response
# requests has no default timeout, so pass one explicitly to keep this cell
# from hanging forever if the forum never answers.
response = requests.get(base_url, timeout=30)
response

<Response [200]>

In [4]:
# Parse the fetched HTML with the lxml parser
soup = BeautifulSoup(markup=response.text, features='lxml')

In [13]:
# CHANGE THIS - Set no of pages for main page
no_main_pages = 59

# Dictionary for thread links: listing page number -> list of relative thread urls
thread_urls_dict = {}

# Get all thread links
for i in range(1, no_main_pages + 1):

    # Page 1 is base_url itself; every other page uses the
    # https://www.watchuseek.com/forums/cartier.481/page-n format
    page_url = base_url if i == 1 else base_url + 'page-' + str(i)

    # Use a timeout so one stalled request cannot hang the whole loop, and
    # fail loudly on an HTTP error instead of silently storing an empty page
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    threads = soup.find_all('div', class_='california-thread-item')

    # Save urls in thread_urls
    thread_urls = []
    for thread_item in threads:
        link = thread_item.find('a', class_="thread-title--gtm")
        # Skip items that have no title link rather than failing with a TypeError
        if link is not None and link.has_attr('href'):
            thread_urls.append(link['href'])

    # Add thread urls to dict
    thread_urls_dict[i] = thread_urls

# Print thread_urls_dict
thread_urls_dict

{1: ['/threads/welcome-to-our-new-cartier-forum.749568/',
  '/threads/cartier-movement-calibre-de-cartier-1904-mc.750205/',
  '/threads/valuation-and-authenticity-posts-please-read.749974/',
  '/threads/fake-vintage-cartier.5554745/',
  '/threads/wtb-cartier-santos-galbee-ref-1566-spare-links.5554519/',
  '/threads/fake-tank.5554481/',
  '/threads/cartier-santos-quick-release-rubber-straps.5551063/',
  '/threads/liger-quick-release-straps.5510253/',
  '/threads/vintage-cartier-tank-original.5553948/',
  '/threads/let%E2%80%99s-see-your-cartier.4682793/',
  '/threads/tank-must-replaced-dial-authentic.5553659/',
  '/threads/authentic-or-not-cartier-santos-galbee.5257630/',
  '/threads/cartier-vendome-trinity-authentic.5553093/',
  '/threads/help-for-cartier-panthere.5553171/',
  '/threads/crappy-tanks-from-the-70s-and-80s.5553058/',
  '/threads/legit-check-on-cartier-tank.5550682/',
  '/threads/seeking-advice-on-a-vintage-cartier-santos-dial-bubbles-and-value-concerns.5552389/',
  '/thre

We collected all the thread links from the 59 forum pages. Now we will scrape the comments from each thread.

### 2.2. Scrape Comments From Each Thread

In [15]:
# List for comments data: one dict per comment with body, author, date and like flag
comments_data = []

# Go through each thread url (the page-number keys of thread_urls_dict are not needed here)
# NOTE(review): only the url stored for each thread is fetched; if long threads
# are paginated on the site, later pages are not scraped - confirm.
for thread_urls in thread_urls_dict.values():
    for thread_url in thread_urls:
        thread_page_url = root_url + thread_url
        # Timeout so a single stalled thread page cannot hang the whole scrape
        response = requests.get(thread_page_url, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')

        # Scrape all comments
        comments = soup.find_all('div', class_="message-cell message-info-block")

        # Go through each comment
        for comment in comments:
            # The caption has the form "<name> · <date and time>"; keep the name
            # and the first three tokens of the date part, ignoring the time
            parts = comment.find('div', class_="message-userContent lbContainer js-lbContainer")['data-lb-caption-desc'].split('·')
            name = parts[0].strip()
            date_time_part = parts[1].strip()
            date = ' '.join(date_time_part.split()[:3])

            # Save comment in comments_data
            comments_data.append({
                'body': comment.find('div', class_="bbWrapper", itemprop="text").text.replace('\n', ''),
                'author': name,
                'date': date,
                # More than one <bdi> element is treated as "this comment has likes".
                # The flag is computed for the CURRENT comment; previously it was
                # always written to comments_data[0], so only the first comment
                # could ever be marked as liked.
                'like': len(comment.find_all('bdi')) > 1
            })

comments_data

[{'body': 'Watchuseek is proud and honored to introduce our new Cartier Forum. We strongly believe that we have many members and visitors with a special interest in this fine watch brand. The forum will be moderated by Hugh aka Athram.We wish you a pleasant and informative stay at our Cartier forum.',
  'author': 'Ernie Romers',
  'date': 'Sep 15, 2012',
  'like': True},
 {'body': "I'm glad you opened this forum!",
  'author': 'leicamaster',
  'date': 'Sep 15, 2012',
  'like': False},
 {'body': 'Yay!!Glad to see this powerhouse getting their own subforum.I really admire how Cartier has made itself a force to be reckoned with in the horological world despite having its roots in jewelry!',
  'author': 'HPoirot',
  'date': 'Sep 17, 2012',
  'like': False},
 {'body': 'Step 1:  Cartier on my forums. Step 2:  Cartier on my wrist.Step 1 complete!  Now for Step 2 ....Best wishes,Packleader',
  'author': 'Packleader',
  'date': 'Sep 22, 2012',
  'like': False},
 {'body': 'Thanks for this!  I ju

We collected the comments from all threads across the 59 pages. Now we will save them to a JSON file for cleaning.

### 2.3. Save Comments To JSON File

In [16]:
# Serialise the scraped comments with 4-space indentation and write them to disk
serialized_comments = json.dumps(comments_data, indent=4)
with open('../datasets/cartierwatches_long.json', 'w') as out_file:
    out_file.write(serialized_comments)

# Confirm the write finished
print("Comments have been saved to cartierwatches_long.json")

Comments have been saved to cartierwatches_long.json


In [17]:
# Reload the saved file and count how many comments it contains
with open('../datasets/cartierwatches_long.json', 'r') as in_file:
    totalcomments = json.loads(in_file.read())

len(totalcomments)

14517

We collected around 14,500 comments from the Cartier forum.