# Data fetching and preprocessing 

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [4]:
data = {'name': [], 'brand': [], 'model': [], 'memory size': [], 'memory type': [], 'price': []}

In [5]:
ramRegEx = re.compile(r' \d{1,2}GB?')
gddrRegEx = re.compile(r'(\wDDR\w*) | (\wDRR\w*) | (DDR\w*) | (DRR\w*)')
modelRegEx = re.compile(r'RTX ?™ ?[0-Z]+|RTX ?[0-Z]+|GTX ?[0-Z]+|GT[0-9]+|RX[0-Z]+|ARC A[0-9]+')

## Web scraper

This collects all graphics cards product information listed in the website of PC Express. Its web pages are saved locally.

In [6]:
for i in range(1, 3):
    if i == 1:
        with open(f'Graphics Cards - PC Express.html', 'r', encoding='utf8') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
            
    else:
        with open(f'Graphics Cards - PC Express - Page {i}.html', 'r', encoding='utf8') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
        
    gpu_items = soup.find_all(class_='product-grid-item')
    
    for gpu_item in gpu_items:
        name = gpu_item.find(class_='product-title').get_text()

        
        brand = gpu_item.find(class_='woodmart-product-brands-links').get_text()
        price = gpu_item.find(class_='price').get_text()
    
        price = price.replace('₱', '')
        price = price.replace(',', '')
        price = float(price)
        
        model = modelRegEx.search(name)
        model = model.group().strip()
        
        
        ram = ramRegEx.search(name)
        ram = ram.group().strip()
        
        gddr = gddrRegEx.search(name)
        gddr = gddr.group().strip()
        
        data['name'].append(name)
        data['brand'].append(brand)
        data['price'].append(price)
        data['memory size'].append(ram)
        data['memory type'].append(gddr)
        data['model'].append(model)

In [7]:
gpu_df = pd.DataFrame(data)
gpu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         72 non-null     object 
 1   brand        72 non-null     object 
 2   model        72 non-null     object 
 3   memory size  72 non-null     object 
 4   memory type  72 non-null     object 
 5   price        72 non-null     float64
dtypes: float64(1), object(5)
memory usage: 3.5+ KB


In [8]:
gpu_df.head()

Unnamed: 0,name,brand,model,memory size,memory type,price
0,INTEL ARC A750 (INTEL-21P02J00BA-99AM3D) 8GB G...,Intel,ARC A750,8GB,GDDR6,15700.0
1,ZOTAC RTX4070TI TRINITY OC 12GB GDDR6X 192-bit...,Zotac,RTX4070TI,12GB,GDDR6X,58450.0
2,ZOTAC RTX3060 TWIN EDGE 12GB GDDR6 192-bit Gra...,Zotac,RTX3060,12GB,GDDR6,20500.0
3,ZOTAC RTX3050 TWIN EDGE OC 8GB GDDR6 128-bit G...,Zotac,RTX3050,8GB,GDDR6,16500.0
4,ZOTAC GTX1650 AMP CORE 4GB GDDR6 128-bit Graph...,Zotac,GTX1650,4GB,GDDR6,9650.0


## Cleaning the data

A lot of these data contain unneccessary characters such as *™* or *®*. There are also some missing characters in the product information. Let's try to clean the columns `model`, `vram`, and `gddr`

In [9]:
gpu_df['model'].value_counts()

model
GTX 1650      8
RTX4070       8
RTX™ 3050     7
GTX1650       6
RTX3060TI     5
RTX3050       4
RTX4080       4
RTX3060       3
GTX 1050T     2
GTX1630       2
RTX4090       2
GTX 1660      2
RX6400        1
GT1030        1
RTX ™ 3050    1
RTX ™ 3090    1
RX6500XT      1
RTX 2060      1
GTX 1660T     1
ARC A750      1
RTX™ 4080     1
RTX™ 3060     1
RTX™ 3060T    1
RTX4070TI     1
RTX™ 4070T    1
GT730         1
RX560         1
GTX1660       1
GTX1050TI     1
RX6600XT      1
RTX™3060      1
Name: count, dtype: int64

In [10]:
gpu_df['model'] = gpu_df['model'].map({'GTX 1650': 'GTX 1650', 'GTX1650': 'GTX 1650', 'RTX4070': 'RTX 4070', 'RTX™ 3050': 'RTX 3050', 'RTX ™ 3050': 'RTX 3050',
                                       'RTX3050': 'RTX 3050', 'GTX1650': 'GTX 1650', 'GTX 1650': 'GTX 1650', 'RTX3060TI': 'RTX 3060 Ti', 'RTX3050': 'RTX 3050',
                                       'RTX4080': 'RTX 4080', 'RTX3060': 'RTX 3060', 'GTX 1050T': 'GTX 1050T', 'GTX1630': 'GTX 1630', 'RTX4090': 'RTX 4090',
                                       'GTX 1660': 'GTX 1660', 'RX6400': 'RX 6400', 'GT1030': 'GT 1030', 'RTX ™ 3090': 'RTX 3090', 'RX6500XT': 'RX 6500 XT',
                                       'RTX 2060': 'RTX 2060', 'GTX 1660T': 'GTX 1660T', 'ARC A750': 'ARC A750', 'RTX™ 4080': 'RTX 4080', 'RTX™ 3060': 'RTX 3060',
                                       'RTX™ 3060T': 'RTX 3060 Ti', 'RTX4070TI': 'RTX 4070 Ti', 'RTX™ 4070T': 'RTX 4070 Ti', 'GT730': 'GT 730', 'RX560': 'RX 560',
                                       'GTX1660': 'GTX 1660', 'GTX1050TI': 'GTX 1050 Ti', 'RX6600XT': 'RX 6600XT', 'RTX™3060': 'RTX 3060'})

In [11]:
gpu_df['model'].value_counts()

model
GTX 1650       14
RTX 3050       12
RTX 4070        8
RTX 3060 Ti     6
RTX 3060        5
RTX 4080        5
GTX 1660        3
GTX 1630        2
RTX 4090        2
RTX 4070 Ti     2
GTX 1050T       2
GTX 1660T       1
GT 1030         1
RTX 3090        1
RX 6500 XT      1
ARC A750        1
RX 6400         1
GT 730          1
RX 560          1
GTX 1050 Ti     1
RX 6600XT       1
RTX 2060        1
Name: count, dtype: int64

In [12]:
gpu_df['memory size'].value_counts()

memory size
4GB     20
8GB     19
12GB    14
16GB     5
6GB      5
24GB     3
4G       2
2GB      2
8G       1
12G      1
Name: count, dtype: int64

In [13]:
gpu_df['memory size'] = gpu_df['memory size'].map({'4GB': '4GB', '8GB': '8GB', '12GB': '12GB', '16GB': '16GB', '6GB': '6GB',
                                     '24GB': '24GB', '4G': '4GB', '2GB': '2GB', '8G': '8GB', '12G': '12GB'})

In [14]:
gpu_df['memory size'].value_counts()

memory size
8GB     20
4GB     20
12GB    15
16GB     5
6GB      5
24GB     3
4G       2
2GB      2
Name: count, dtype: int64

In [15]:
gpu_df['memory type'].value_counts()

memory type
GDDR6     37
GDDR6X    20
GDDR5     11
DDR6       3
GDRR6      1
Name: count, dtype: int64

In [16]:
gpu_df['memory type'] = gpu_df['memory type'].map({'GDDR6': 'GDDR6', 'GDDR6X': 'GDDR6X', 'GDDR5': 'GDDR5', 'DDR6': 'GDDR6', 'GDRR6': 'GDDR6'})

In [17]:
gpu_df['memory type'].value_counts()

memory type
GDDR6     41
GDDR6X    20
GDDR5     11
Name: count, dtype: int64

In [18]:
gpu_df.to_csv('../gpu_specs_prices.csv', index=False)