In [44]:
from bs4 import BeautifulSoup
import requests
import tqdm
import pandas as pd
import datetime

# Fetch ONE listing page (24 games per page) so that `all_games` can be used as
# sample input while developing the parser. The page number was previously
# hard-coded inside the URL and wrapped in a loop that never iterated.
sample_page = 25
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the cell

all_games = []  # stays empty when the request fails or the page has no game grid
try:
    url = f'https://store.playstation.com/en-hk/category/29696e1b-a942-4832-935d-ebd11b163263/{sample_page}'
    result = requests.get(url, timeout=request_timeout).text
    doc = BeautifulSoup(result, 'html.parser')
    grid = doc.find('ul', {'class': 'psw-grid-list'})
    if grid is None:
        # A page without the grid element has no listings to parse.
        print(f'No game grid found on page {sample_page}')
    else:
        all_games = grid.find_all('li')  # list containing all 24 games
except requests.RequestException as err:
    # Report the failure instead of silently leaving `all_games` undefined.
    print(f'Could not fetch page {sample_page}: {err!r}')
    

In [47]:
# Smoke test: build a DiscountedGame from the first entry of `all_games` and
# display the scraped fields as a dict.
# NOTE: DiscountedGame is defined in a cell that appears AFTER this one, so that
# cell has to be executed first for this line to work on a fresh kernel.
DiscountedGame(all_games[0]).return_dict()

{'Title': 'Super Nero/Super Dante/Super Vergil (English/Chinese/Japanese Ver.)',
 'Publisher': 'CAPCOM ASIA',
 'Link': 'https://store.playstation.com/en-hk/product/JP0102-CUSA01599_00-ASIARDOBXXXX0002',
 'DiscountPCT': '50%',
 'OriginalPrice': 'HK$39.00',
 'DiscountPrice': 'HK$19.50',
 'Discount_Endtime': '14/2/2024 03:59 PM UTC',
 'Rating': '4.66',
 'Rating_count': '215',
 'Genre': 'Action',
 'ReleaseDate': '18/6/2015'}

In [46]:
class DiscountedGame():
    """One discounted store listing together with the details scraped for it.

    On construction the product link is read from the listing tile, the
    product page is downloaded and parsed, and every field is extracted once
    and stored as an attribute. Extraction is best-effort: a field is ``None``
    when the product page could not be fetched or the element is missing.

    Parameters
    ----------
    html
        Parsed listing tile. It must provide a BeautifulSoup-style
        ``find(name, attrs)`` method.

    Notes
    -----
    Only ordinary errors are absorbed. ``KeyboardInterrupt`` is allowed to
    propagate so a long crawl can actually be interrupted.
    TODO: a 'Discount_Startdate' (scrape date) field is planned but not
    recorded yet.
    """

    BASE_URL = 'https://store.playstation.com'
    REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection

    def __init__(self, html):
        self.html = html
        # The link must be resolved first: the product page is fetched from it,
        # and every remaining field is read from that page.
        self.link = self.get_link()
        self.pagehtml = self.get_pagehtml()
        self.title = self.get_title()
        self.publisher = self.get_publisher()
        self.discount_pct = self.get_discountpct()
        self.original_price = self.get_originalprice()
        self.discounted_price = self.get_discountedprice()
        self.discount_endtime = self.get_discountendtime()
        self.rating = self.get_rating()
        self.rating_num = self.get_ratingnum()
        self.genre = self.get_genre()
        self.releasedate = self.get_releasedate()

    def _find_text(self, tag, attrs):
        """Return the text of the first `tag` matching `attrs` on the product page.

        Returns ``None`` when there is no product page or no matching element.
        """
        page = self.pagehtml
        if page is None:
            return None
        node = page.find(tag, attrs)
        if node is None:
            return None
        return node.text

    def get_link(self):
        """Return the absolute product URL from the listing tile, or ``None``."""
        if self.html is None:
            return None
        anchor = self.html.find('a', {'class': 'psw-link'})
        if anchor is None:
            return None
        href = anchor.get('href')
        if not isinstance(href, str):
            # No usable href on the anchor.
            return None
        return self.BASE_URL + href

    def get_pagehtml(self):
        """Download and parse the product page; ``None`` if that is not possible."""
        if not self.link:
            # Nothing to fetch when the tile had no product link.
            return None
        try:
            result = requests.get(self.link, timeout=self.REQUEST_TIMEOUT).text
            return BeautifulSoup(result, 'html.parser')
        except Exception:
            # Best-effort: one unreachable product page must not abort a crawl.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            return None

    def get_title(self):
        """Return the product title text, or ``None``."""
        return self._find_text('h1', {'class': 'psw-t-title-l'})

    def get_publisher(self):
        """Return the publisher text, or ``None``."""
        return self._find_text('div', {'data-qa': 'mfe-game-title#publisher'})

    def get_discountpct(self):
        """Return the last whitespace-separated token of the discount label, or ``None``."""
        text = self._find_text('span', {'data-qa': 'mfeCtaMain#offer0#discountInfo'})
        words = text.split() if text else []
        return words[-1] if words else None

    def get_discountendtime(self):
        """Return the discount descriptor without its first two words, or ``None``.

        When the descriptor has two words or fewer the result is an empty string.
        """
        text = self._find_text('span', {'data-qa': 'mfeCtaMain#offer0#discountDescriptor'})
        if text is None:
            return None
        return ' '.join(text.split()[2:])

    def get_originalprice(self):
        """Return the original price text, or ``None``."""
        return self._find_text('span', {'data-qa': 'mfeCtaMain#offer0#originalPrice'})

    def get_discountedprice(self):
        """Return the final (discounted) price text, or ``None``."""
        return self._find_text('span', {'data-qa': 'mfeCtaMain#offer0#finalPrice'})

    def get_rating(self):
        """Return the average rating text, or ``None``."""
        return self._find_text('div', {'data-qa': 'mfe-game-title#average-rating'})

    def get_ratingnum(self):
        """Return the first whitespace-separated token of the rating count, or ``None``."""
        text = self._find_text('div', {'data-qa': 'mfe-game-title#rating-count'})
        words = text.split() if text else []
        return words[0] if words else None

    def get_genre(self):
        """Return the genre text, or ``None``."""
        return self._find_text('dd', {'data-qa': 'gameInfo#releaseInformation#genre-value'})

    def get_releasedate(self):
        """Return the release date text, or ``None``."""
        return self._find_text('dd', {'data-qa': 'gameInfo#releaseInformation#releaseDate-value'})

    def return_dict(self):
        """Return the scraped fields as a dict keyed by output column name."""
        dict_object = {
            'Title': self.title,
            'Publisher': self.publisher,
            'Link': self.link,
            'DiscountPCT': self.discount_pct,
            'OriginalPrice': self.original_price,
            'DiscountPrice': self.discounted_price,
            'Discount_Endtime': self.discount_endtime,
            'Rating': self.rating,
            'Rating_count': self.rating_num,
            'Genre': self.genre,
            'ReleaseDate': self.releasedate,
        }

        return dict_object

In [None]:
from bs4 import BeautifulSoup
import requests
import tqdm
import pandas as pd
from datetime import datetime
page = 0
metadata = []
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the crawl

# Crawl the listing pages one by one and collect one dict per game.
# The crawl stops at the first page without game tiles, on any error,
# or when the kernel is interrupted; rows collected so far are still saved.
while True:
    print(page, end=' ')
    try:
        # scrape each page (24 games per page)
        url = f'https://store.playstation.com/en-hk/category/29696e1b-a942-4832-935d-ebd11b163263/{page}'
        result = requests.get(url, timeout=request_timeout).text
        doc = BeautifulSoup(result, 'html.parser')

        # A page past the end has no grid element; check for it explicitly
        # rather than relying on an AttributeError from `None.find_all`.
        grid = doc.find('ul', {'class': 'psw-grid-list'})
        all_games = grid.find_all('li') if grid is not None else []  # parsed html of all 24 games

        # hit final page
        if len(all_games) == 0:
            break

        # scrape content in each page
        for html in all_games:
            game_data = DiscountedGame(html).return_dict()
            metadata.append(game_data)

    except KeyboardInterrupt:
        # Manual interrupt: stop crawling but still save what was collected.
        print(f'\nInterrupted on page {page}; keeping {len(metadata)} rows')
        break
    except Exception as err:
        # Report why the crawl stopped instead of ending silently.
        print(f'\nStopped on page {page}: {err!r}')
        break

    page += 1

df = pd.DataFrame(metadata)
df.to_csv("game_data.csv")

In [5]:
# Build a table from the collected `metadata` records and write it to
# game_data.csv. `index=False` is not passed, so the row index is written as
# the first CSV column.
df = pd.DataFrame(metadata)
df.to_csv("game_data.csv")

In [2]:
import pandas as pd
# Load the deals table from historic_deals.csv.
all_deals_df = pd.read_csv('historic_deals.csv')

In [3]:
# Display the loaded deals table.
all_deals_df

Unnamed: 0,Title,Publisher,Link,DiscountPCT,OriginalPrice,DiscountPrice,Discount_Endtime,Rating,Rating_count,Genre,ReleaseDate
0,"STAR WARS Jedi: Survivor™ (Simplified Chinese,...",Electronic Arts Inc.,https://store.playstation.com/en-hk/product/UP...,55%,HK$549.00,HK$247.05,14/2/2024 02:59 PM UTC,4.51,9.4k,Action,28/4/2023
1,EA SPORTS FC™ 24 Standard Edition PS4 & PS5 (S...,EA Swiss Sarl,https://store.playstation.com/en-hk/product/EP...,65%,HK$549.00,HK$192.15,14/2/2024 03:59 PM UTC,3.48,51k,Sport,28/9/2023
2,Hogwarts Legacy PS5 Version (Simplified Chines...,Warner Bros. Interactive,https://store.playstation.com/en-hk/product/UP...,40%,HK$548.00,HK$328.80,14/2/2024 03:59 PM UTC,4.50,29k,Unique,9/2/2023
3,Hogwarts Legacy PS4 Version (Simplified Chines...,Warner Bros. Interactive,https://store.playstation.com/en-hk/product/UP...,40%,HK$468.00,HK$280.80,14/2/2024 03:59 PM UTC,4.50,29k,Unique,4/5/2023
4,Hogwarts Legacy: Dark Arts Pack (English/Chine...,Warner Bros. Interactive,https://store.playstation.com/en-hk/product/UP...,40%,HK$160.00,HK$96.00,14/2/2024 03:59 PM UTC,5.00,7,Unique,9/2/2023
...,...,...,...,...,...,...,...,...,...,...,...
2598,"Vostok 2061 (Simplified Chinese, English, Kore...",BIG WAY GAMES,https://store.playstation.com/en-hk/product/HP...,30%,HK$78.00,HK$54.60,21/2/2024 03:59 PM UTC,3.71,14,,
2599,Wayward Strand (English),Ghost Pattern Pty Ltd,https://store.playstation.com/en-hk/product/EP...,30%,HK$148.00,HK$103.60,21/2/2024 03:59 PM UTC,4.55,20,,
2600,Whispike Survivors - Sword of the Necromancer ...,Grimorio of Games S.L.,https://store.playstation.com/en-hk/product/HP...,30%,HK$18.00,HK$12.60,21/2/2024 03:59 PM UTC,2.71,76,,
2601,Young Valkyries 2 (English),BIG WAY GAMES,https://store.playstation.com/en-hk/product/HP...,30%,HK$18.00,HK$12.60,21/2/2024 03:59 PM UTC,3.73,11,,


In [6]:
# Keep only the first 100 rows and write them back to historic_deals.csv,
# the same file the table was read from; index=False omits the row index.
# NOTE(review): this overwrites the file on disk with the 100-row subset —
# confirm that discarding the remaining rows is intended.
all_deals_df.iloc[:100,:].to_csv('historic_deals.csv',index=False)

In [10]:
from datetime import datetime
# Current date formatted as zero-padded day/month/year.
datetime.now().strftime("%d/%m/%Y")


'13/02/2024'

In [12]:
# Record a start timestamp for ad-hoc timing.
a = datetime.now()

In [16]:
# Seconds elapsed since the timestamp stored in `a`.
(datetime.now()-a).total_seconds()

42.615453