In [306]:
import json
from pathlib import Path

import lxml.html
import numpy as np
import pandas as pd
import regex
import requests
from bs4 import BeautifulSoup

### Get Headphones listed on rtings.com

In [348]:
url_listings = "https://www.rtings.com/headphones/1-5/graph"

# Fetch the listing page. A timeout keeps the cell from hanging forever and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
page = requests.get(url_listings, timeout=30)
page.raise_for_status()

# use beautiful soup to parse it
soup = BeautifulSoup(page.text, "html.parser")

# The products info is embedded in one of the <script> tags as an assignment.
# The regular expression does the following:
#   Look behind ( ?<= ):
#       "products_info"
#       followed by any number of whitespaces (\s*)
#       followed by an equal sign (=)
#   After this look behind, search for:
#       any number of any character (also newlines etc.) except the semicolon ( [^;]* )
# A raw string is used so that `\s` reaches the regex engine untouched instead
# of being an invalid string escape sequence.
# NOTE(review): because the capture stops at the first ';', a semicolon inside
# a JSON string value would truncate the payload and make json.loads fail.
products_info_pattern = regex.compile(r"(?<=products_info\s*=)[^;]*")

# Reset first so a failed re-run cannot silently reuse a stale `items` frame.
items = None

scripts = soup.find_all("script")
for script in scripts:
    items_json = products_info_pattern.search(script.text)

    # continue to next script if nothing was found
    if items_json is None:
        continue

    items_json = items_json.group().strip()

    items_dict = json.loads(items_json)

    # Only the values are needed; each one already carries its own 'id'.
    items_list = list(items_dict.values())

    # If several scripts match, the last one wins (same as before).
    items = pd.DataFrame(items_list)
    items = items.set_index('id')

if items is None:
    raise RuntimeError(
        f"no 'products_info' assignment found in any <script> of {url_listings}"
    )

# Make sure the cache directory exists before writing into it.
Path('dat').mkdir(parents=True, exist_ok=True)
items.to_parquet('dat/items.parquet')

# Flatten the per-item comp_ids lists into one sorted array of distinct ids.
unique_comp_ids = np.unique([aa for a in items.comp_ids.values for aa in a])
# Original (misspelled) name kept as an alias for any cell that still uses it.
uqique_comp_ids = unique_comp_ids

items

Unnamed: 0_level_0,name,brand,comp_ids
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
718,Piston Fit,1More,"[7913, 7914, 7943, 7966, 7981, 7988, 8001, 8010]"
468,Piston Classic,1More,"[7913, 7914, 7943, 7966, 7981, 7988, 8001, 8010]"
469,Quad Driver,1More,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."
432,Triple Driver,1More,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."
29111,Timeless,7HZ,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."
...,...,...,...
22722,Wireless Headset,Xbox,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."
27623,Stereo Headset,Xbox,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."
1693,True Wireless Earbuds,Ylife,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."
4325,Boostcare,iClever,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793..."


In [367]:
def to_columns(columns):
    """Turn raw header labels into unique, snake_case column names.

    Each label is lower-cased and its spaces are replaced by underscores.
    The first occurrence of a name is kept as is; every repeat gets a numeric
    suffix starting at 2 (``left``, ``left2``, ``left3``, ...). A suffix is
    skipped when the resulting name is already taken, so the returned names
    are always distinct.

    Parameters
    ----------
    columns : iterable of str
        Header labels, e.g. ``['Frequency', 'Left', 'Right', 'Left']``.

    Returns
    -------
    list of str
        One normalised name per input label, in the same order.
    """
    occurrences = {}   # normalised base name -> highest suffix number used
    taken = set()      # every name emitted so far
    result = []
    for label in columns:
        base = label.lower().replace(' ', '_')
        n = occurrences.get(base, 0) + 1
        name = base if n == 1 else f'{base}{n}'
        # Counting only exact matches of `base` (as the old list.count did)
        # repeats 'left2' for a third 'Left'; keep bumping until it is free.
        while name in taken:
            n += 1
            name = f'{base}{n}'
        occurrences[base] = n
        taken.add(name)
        result.append(name)
    return result

# Sanity check: normalise the header of the payload currently held in
# `data_json`.
# NOTE(review): `data_json` is only assigned inside the download loop further
# down, so this cell fails on a fresh kernel until that loop has run.
to_columns(data_json['header'])

['frequency', 'left', 'right', 'target_response', 'left2', 'right2']

In [360]:
# Count how often the label 'Left' occurs among the first five header entries
# (a value above 1 means the header contains duplicate labels).
# NOTE(review): relies on `data_json` left over from the download loop.
data_json['header'][:5].count('Left')

2

In [381]:
from pathlib import Path

In [388]:
# Paths of the data files already cached on disk.
# NOTE(review): `pths` is not read anywhere in this cell; it is kept only
# because it is a notebook-level name.
pths = [str(p) for p in Path('dat').glob('data*.parquet')]

# Graph/comparison id to download for every item.
# NOTE(review): 7903 presumably identifies the frequency-response graph (the
# payload header contains frequency/left/right columns) - confirm.
comp_id = 7903
for item_id, row in items.iterrows():
    pth = Path(f'dat/data_{item_id}_{comp_id}.parquet')

    print(f'downloading: {str(pth)}...')
    # skip items that do not offer this comp_id
    if comp_id not in row.comp_ids:
        print(f'   ...skipped because item {item_id} does not have comp_id {comp_id}')
        continue

    # skip if already downloaded
    if pth.exists():
        print(f'   ...skipped, because file already exists')
        continue

    # A timeout prevents one stalled request from blocking the whole loop, and
    # raise_for_status() reports HTTP errors directly instead of surfacing
    # later as a confusing JSON decode error on an error page.
    data_raw = requests.get(
        url=f"https://www.rtings.com/graph/data/{item_id}/{comp_id}",
        timeout=30,
    )
    data_raw.raise_for_status()
    data_json = json.loads(data_raw.text)

    # 'header' holds the column labels and 'data' the rows of the graph.
    data = pd.DataFrame(data_json['data'], columns=to_columns(data_json['header']))

    # Ensure the cache directory exists so the first run on a clean checkout works.
    pth.parent.mkdir(parents=True, exist_ok=True)
    data.to_parquet(pth)
    



downloading: dat\data_718_7903.parquet...
   ...skipped because item 718 does not have comp_id 7903
downloading: dat\data_468_7903.parquet...
   ...skipped because item 468 does not have comp_id 7903
downloading: dat\data_469_7903.parquet...
   ...skipped, because file already exists
downloading: dat\data_432_7903.parquet...
downloading: dat\data_29111_7903.parquet...
downloading: dat\data_368_7903.parquet...
   ...skipped because item 368 does not have comp_id 7903
downloading: dat\data_703_7903.parquet...
   ...skipped because item 703 does not have comp_id 7903
downloading: dat\data_6678_7903.parquet...
downloading: dat\data_707_7903.parquet...
downloading: dat\data_752_7903.parquet...
downloading: dat\data_761_7903.parquet...
downloading: dat\data_235_7903.parquet...
   ...skipped because item 235 does not have comp_id 7903
downloading: dat\data_327_7903.parquet...
downloading: dat\data_332_7903.parquet...
downloading: dat\data_326_7903.parquet...
downloading: dat\data_234_7903.par

In [443]:
def combine(l1, l2):
    """Average two scalar readings, treating NaN as "missing".

    Returns ``l2`` when ``l1`` is NaN, ``l1`` when only ``l2`` is NaN, and the
    arithmetic mean of both otherwise. If both are NaN the result is NaN.
    """
    if not np.isnan(l1):
        # l1 is usable: either it stands alone or it is averaged with l2.
        return l1 if np.isnan(l2) else (l1 + l2) / 2
    # l1 is missing, so l2 is all there is (l2 is deliberately not inspected).
    return l2


def grade_freq(data):
    """Grade a measurement as the standard deviation of its averaged amplitude.

    The two left columns (``left``, ``left2``) are averaged per row; if the
    frame has a ``right`` column, ``right``/``right2`` are averaged the same
    way and the two channel means are averaged again. NaN counts as
    "missing": a mean is taken over whatever values are present and is NaN
    only when all of them are missing. The result is therefore a mean of pair
    means, not a flat mean over all four columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``left`` and ``left2``; when ``right`` is present,
        ``right2`` must be present as well.

    Returns
    -------
    float
        Sample standard deviation of the per-row average (NaN rows are
        ignored); NaN when fewer than two rows carry a value.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    def pair_mean(first, second):
        # skipna (the pandas default) yields "use the value that exists".
        return data[[first, second]].mean(axis="columns")

    channel_means = [pair_mean('left', 'left2')]
    if 'right' in data.columns:
        channel_means.append(pair_mean('right', 'right2'))

    # Vectorised replacement for the former row-wise apply; also keeps the
    # result a Series (and the grade a scalar) when `data` has no rows.
    amplitude_avg = pd.concat(channel_means, axis="columns").mean(axis="columns")
    return amplitude_avg.std()


# grade frequency response of each item
pths = list(Path("dat").glob("data_*_7903.parquet"))

# Float NaN placeholder keeps the column numeric (float64), so sorting and
# Parquet export need no object-dtype handling.
items['freq_grade'] = np.nan
for pth in pths:
    data = pd.read_parquet(pth)
    # File names follow data_<item_id>_<comp_id>.parquet.
    item_id = int(pth.stem.split("_")[1])

    # .loc on an unknown label would silently append a new, mostly empty row.
    if item_id not in items.index:
        print(f'{item_id}: not in items, skipped')
        continue

    try:
        # Single .loc assignment instead of chained `items.freq_grade[...] = `,
        # which writes to a temporary Series under pandas copy-on-write.
        items.loc[item_id, 'freq_grade'] = grade_freq(data)
    except (AttributeError, KeyError, TypeError, ValueError) as err:
        # Frames with an unexpected layout: report which item and why, then
        # show the frame for inspection and carry on with the next file.
        print(f'{item_id}: {err!r}')
        display(data)

In [465]:
# The 50 best-graded (lowest freq_grade) items whose name does not mention
# "wireless" (case-insensitive). Vectorised string matching replaces the
# row-wise apply; na=False keeps rows whose name is missing instead of failing.
(
    items
    .loc[lambda df: ~df['name'].str.lower().str.contains('wireless', regex=False, na=False)]
    .sort_values('freq_grade')
    .head(50)
)

Unnamed: 0_level_0,name,brand,comp_ids,freq_grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14780,AKG Type-C,Samsung,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",2.689195
30204,KATO,MOONDROP,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",2.715932
18088,QuietComfort 35 II Gaming Headset​,Bose,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",2.776274
440,DT 770 PRO,Beyerdynamic,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",2.93519
237,QuietComfort 25/QC25,Bose,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.004572
295,ATH-M50x,Audio-Technica,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.026819
25784,Aria,MOONDROP,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.04797
29111,Timeless,7HZ,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.178738
432,Triple Driver,1More,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.210041
689,Mobius,Audeze,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.299962


In [452]:
# Positions 31 to 80 of all items (wireless included) when ordered by
# ascending freq_grade.
(
    items
    .sort_values(by='freq_grade')
    .iloc[30:80]
)

Unnamed: 0_level_0,name,brand,comp_ids,freq_grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
25383,Tour Pro+ TWS True Wireless,JBL,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",2.980435
33844,Barracuda Pro Wireless,Razer,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",2.998389
900,WI-C310 Wireless,Sony,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.003418
237,QuietComfort 25/QC25,Bose,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.004572
26805,Tune 760NC Wireless,JBL,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.012256
295,ATH-M50x,Audio-Technica,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.026819
567,Gear IconX Truly Wireless,Samsung,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.034675
25784,Aria,MOONDROP,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.04797
505,G533 Wireless,Logitech,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.074513
15396,T1X True Wireless,FIIL,"[7903, 7913, 7914, 7917, 7918, 7920, 7927, 793...",3.083024


In [430]:
import matplotlib.pyplot as plt



# plt.figure(figsize=(20, 10))
# plt.semilogx(data.frequency, data.dat-np.mean(data.dat), marker='.')


5.568637301489934

In [373]:
# Show the column labels of `data`.
# NOTE(review): `data` is whatever frame an earlier cell assigned last (the
# download and grading loops both rebind it), so this output depends on
# execution order.
data.columns

Index(['frequency', 'left', 'right', 'target_response', 'left2', 'right2'], dtype='object')

In [431]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(20, 10))
# for c in data.columns:
#     if 'frequency' not in c:
#         plt.plot(data.frequency, data[c])

# data.std()