In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

import re
import requests
from collections import Counter
from bs4 import BeautifulSoup
from time import sleep

%matplotlib inline

# Nikon Lens Data


Scraping lens specification data from Nikon's website using Selenium.

Nikon Lenses Data: http://www.nikonusa.com/en/nikon-products/camera-lenses/all-lenses/index.page


### Table of Contents

1. [Scrape Data](#1.-Scrape-Data)
2. [Preprocessing](#2.-Preprocessing)
3. [Feature Extraction](#3.-Feature-Extraction)
4. [Save Final Dataframe](#4.-Save-Final-Dataframe)


### 1. Scrape Data

In [2]:
lens_data_url = 'http://www.nikonusa.com/en/nikon-products/camera-lenses/all-lenses/index.page'

driver = webdriver.Firefox()
driver.get(lens_data_url)


close_cookies_banner_class = 'onetrust-close-btn-handler onetrust-close-btn-ui banner-close-button onetrust-lg close-icon'

try:
    close_banner_button = driver.find_element_by_class_name(close_cookies_banner_class)
    close_banner_button.click()
except NoSuchElementException:
    'Not found'
    pass

# Collect url for each lens
rows = driver.find_elements_by_xpath('//*/td[3]/h3')

lens_urls = []

for i,row in enumerate(rows):
    driver.execute_script("arguments[0].scrollIntoView();", row)
    row.click()

    button_path = '//*[@id="table-view-product-quick-view-target-{}"]/div/div[2]/div[2]/div[2]/a'.format(i)
    
    try:
        button = driver.find_element_by_xpath(button_path)

        name = row.text.strip()
        url = button.get_attribute('href')
        
    except NoSuchElementException:
        print("Failed URL: ",url)
        pass
    else:
        lens_urls.append(url)

# driver.close()

Failed URL:  https://www.nikonusa.com/en/nikon-products/product/camera-lenses/macro--portrait-2-lens-kit.html


In [3]:
exclude_urls = [
    'https://www.nikonusa.com/en/nikon-products/product/camera-lenses/af-s-teleconverter-tc-17e-ii.html',
    'https://www.nikonusa.com/en/nikon-products/product/camera-lenses/af-s-teleconverter-tc-20e-iii.html',
    'https://www.nikonusa.com/en/nikon-products/product/camera-lenses/af-s-teleconverter-tc-14e-iii.html',
    'https://www.nikonusa.com/en/nikon-products/product/camera-lenses/landscape--macro-2-lens-kit.html',
    'https://www.nikonusa.com/en/nikon-products/product/camera-lenses/macro--portrait-2-lens-kit.html'    
]

lens_urls = [url for url in lens_urls if url not in exclude_urls]

In [4]:
driver = webdriver.Firefox()

# Iterate through list of urls and collect specs

dfs = []

for url in lens_urls:    
    driver.get(url + '#tab-ProductDetail-ProductTabs-TechSpecs')
    sleep(5)

    # Extract price
    try:
        price = driver.find_element_by_class_name('product-price').text
    except:
        print('Missing price ',url)
        pass

    # Click on Tech Specs row to expand
    try:
        expand_specs = driver.find_element_by_xpath('//*[@id="selector-heading-0"]')
        
        driver.execute_script("arguments[0].scrollIntoView();", expand_specs)
        sleep(2)
        driver.execute_script("arguments[0].click();", expand_specs)
    except NoSuchElementException:
        print('Specs tab ',url)
        pass

    # Load into BeautifulSoup
    r = requests.get(driver.current_url)
    soup = BeautifulSoup(r.text, "lxml")

    # Create list of row elements containing data
    rows = soup.findAll('li', {'class' : 'spec-content row'})
    specs = {}
    specs['Price'] = price
    specs['Name'] = name
    specs['Url'] = url

    for row in rows:
        col_name = row.find('h4', {'class':'spec-title col-sm-6'}).string
        values = [r for r in row.findAll('span', {'class':'value'}) if r.string is not None]

        specs[col_name] = ','.join([v.string for v in values])

    # Save to list of DataFrames
    df = pd.DataFrame.from_dict(specs, orient='index')
    dfs.append(df.T)

driver.close()

lenses = pd.concat(dfs, ignore_index=True)

### 2. Preprocessing

In [5]:
cols = [
    'Approx. Dimensions (Diameter x Length)',
    'Approx. Weight',
    'Autofocus',
    'Aspherical Elements',
    'Diaphragm Blades',
    'ED Glass Elements',
    'Filter Size',
    'Focal Length',
    'Focal Length Range',
    'Format',
    'Lens Elements',
    'Lens Groups',
    'Maximum Aperture',
    'Maximum Reproduction Ratio',
    'Minimum Aperture',
    'Minimum Focus Distance',
    'Price',
    'VR (Vibration Reduction) Image Stabilization',
    'Url',
    'Zoom Ratio'
]


df = lenses[cols]

# Rename Columns
columns = {
    'Approx. Dimensions (Diameter x Length)': 'Dimensions',
    'Approx. Weight': 'Weight',
    'VR (Vibration Reduction) Image Stabilization':'VR'
}

df = df.rename(index=str, columns=columns)

In [6]:
# Convert to numeric
cols_to_numeric = [
    'Aspherical Elements',
    'Diaphragm Blades',
    'ED Glass Elements',
    'Filter Size',
    'Focal Length',
    'Lens Elements',
    'Lens Groups',
    'Zoom Ratio'
]

for col in cols_to_numeric:
    lenses.loc[:,col] = lenses.loc[:,col].apply(pd.to_numeric)
    
# Price
def price_to_float(s):
    s = s.replace('$','')
    s = s.replace(',','')

    return float(s)
    
df['Price'] = df['Price'].apply(price_to_float)

In [7]:
# Incorrect Dimension Values
df[df['Dimensions'].apply(lambda x: x.split(',')).str.len() != 4]

Unnamed: 0,Dimensions,Weight,Autofocus,Aspherical Elements,Diaphragm Blades,ED Glass Elements,Filter Size,Focal Length,Focal Length Range,Format,Lens Elements,Lens Groups,Maximum Aperture,Maximum Reproduction Ratio,Minimum Aperture,Minimum Focus Distance,Price,VR,Url,Zoom Ratio
23,8395,"13.6,385",Yes,3.0,7.0,2.0,77,,1835.0,FX/35mm,12,8,3.5-4.5,0.2x,22-29,"0.92,0.28",749.95,,https://www.nikonusa.com/en/nikon-products/pro...,1.9
27,"2.6,1.7","9.5,270",,,,,62,20.0,,FX/35mm,12,9,2.8,,22,0.85,674.75,,https://www.nikonusa.com/en/nikon-products/pro...,


In [8]:
dimension_fixes = [
    ('83,95','3.3,83,3.7,95'),
    ('2.6,1.7','2.6,66,1.7,43')
]

for old_value, new_value in dimension_fixes:
    df.loc[df['Dimensions']==old_value,'Dimensions'] = new_value

In [9]:
# Diameter and Length columns
df['Diameter (In.)'] = df['Dimensions'].apply(lambda x: float(x.split(',')[0]))
df['Length (In.)'] = df['Dimensions'].apply(lambda x: float(x.split(',')[2]))

In [10]:
df['Weight (oz.)'] = df['Weight'].apply(lambda x: float(x.split(',')[0]))

weight_fixes = [
    (620,21.87)
]

for old_value, new_value in weight_fixes:
    df.loc[df['Weight (oz.)']==old_value,'Weight (oz.)'] = new_value

In [11]:
# Autofocus: Set Null values to "No"
df.loc[df['Autofocus'].isnull(),'Autofocus'] = 'No'

In [12]:
df['focal_length_raw'] = df['Focal Length'].fillna(df['Focal Length Range']).astype(str)

def set_focal_length(s):
    f_lengths = [float(val) for val in s.split(',')]
    
    f_lengths.sort() 

    if len(f_lengths) == 1:
        min_len = int(f_lengths[0])
        max_len = int(f_lengths[0])
    elif len(f_lengths) == 2:
        min_len, max_len = [int(f_length) for f_length in f_lengths]
        
    return min_len, max_len


f_len_cols = [
    'Min. Focal Length (mm)',
    'Max. Focal Length (mm)'
]

df[f_len_cols] = df['focal_length_raw'].apply(set_focal_length).apply(pd.Series)

df['Focal Length Diff'] = df['Max. Focal Length (mm)'] - df['Min. Focal Length (mm)']

In [13]:
format_fixes = [
    ('FX/35mm','FX'),
    ('FX/35mm,DX','FX')
]

for old_value, new_value in format_fixes:
    df.loc[df['Format']==old_value,'Format'] = new_value

In [14]:
def set_aperture_bounds(s):
    a_bounds = [float(val) for val in s.split('-')]
    
    a_bounds.sort() 

    if len(a_bounds) == 1:
        min_apt = a_bounds[0]
        max_apt = a_bounds[0]
    elif len(a_bounds) == 2:
        min_apt, max_apt = a_bounds
        
    return min_apt, max_apt


# Upper and Lower Bounds for Maximum Aperture
max_aperture_cols = ['Max. Aperture (Upper Bound)','Max. Aperture (Lower Bound)']
df[max_aperture_cols] = df['Maximum Aperture'].apply(set_aperture_bounds).apply(pd.Series)

# Upper and Lower Bounds for Minimum Aperture
min_aperture_cols = ['Min. Aperture (Upper Bound)','Min. Aperture (Lower Bound)']
df[min_aperture_cols] = df['Minimum Aperture'].apply(set_aperture_bounds).apply(pd.Series)

# Max Aperture Range
df['Max. Aperture Diff'] = df['Max. Aperture (Upper Bound)'] - df['Max. Aperture (Lower Bound)']

In [15]:
# VR: Set Null values to "No"
df.loc[df['VR'].isnull(),'VR'] = 'No'

### 3. Feature Extraction

In [16]:
def focal_length_label(s):
    '''Concatenate focal length range with aperture range'''
    focal_len, focal_length_range = s
    
    focal_len = float(focal_len)
    
    if pd.isnull(focal_len):
        # Telephoto lens
        f_lengths = focal_length_range.split(',')
        
        fMin, fMax = f_lengths
        
        name = str(fMin) + ' - ' + str(fMax) + 'mm'
    else:
        # Prime lens
        if focal_len.is_integer():
            name = str(int(focal_len)) + 'mm'
        else:
            name = str(focal_len) + 'mm'
            
    return name

df['Focal Length Label'] = df[['Focal Length','Focal Length Range']].apply(focal_length_label,axis=1)

In [17]:
def aperture_label(s):
    '''Concatenate focal length range with aperture range'''
    max_apt_upper, max_apt_lower, max_apt_str = s
    
    if max_apt_upper.is_integer():
        max_apt_upper_label = str(int(max_apt_upper))
    max_apt_upper_label = str(max_apt_upper)
    
    if max_apt_lower.is_integer():
        max_apt_lower_label = str(int(max_apt_lower))
    max_apt_lower_label = str(max_apt_lower)
    
    if '-' in max_apt_str:
        # Telephoto lens
        name = max_apt_upper_label + ' - ' + max_apt_lower_label
    else:
        # Prime lens
        name = max_apt_upper_label
            
    return 'f/' + name

df['Maximum Aperture Label'] = df[['Max. Aperture (Upper Bound)','Max. Aperture (Lower Bound)','Maximum Aperture']].apply(aperture_label,axis=1)

In [18]:
# Concatenate focal length range with aperture range'''
df['name'] = df['Focal Length Label'] + ' ' + df['Maximum Aperture Label']

In [19]:
# Define lens type (Prime vs. Zoom)
def lens_type(s):
    # Return "Prime" lens type if Focal Length Range is Null
    if pd.isnull(s):
        return 'Prime'
    return 'Zoom'

df['Lens Type'] = df['Focal Length Range'].apply(lens_type)

In [20]:
bins = [0, 24, 35, 85, 135, 300, 1000]

labels = ['Ultra Wide Angle',
          'Wide Angle',
          'Standard',
          'Short Telephoto',
          'Medium Telephoto',
          'Super Telephoto']

df['Focal Length Group'] = pd.cut(df['Max. Focal Length (mm)'],
                                  bins=bins,
                                  include_lowest=True,
                                  right=False,
                                  labels=labels)

In [21]:
# Weight in Pounds
df['Weight (lbs.)'] = df['Weight (oz.)'] * 0.0625

In [22]:
price_range_bins = [0, 500, 1000, 2000, 5000, 10000, 50000]

price_range_labels = [
'< $500',
'$500 - $1k',
'$1 - 2k',
'$2 - 5k',
'$5 - 10k',
'$10k+'
]

df['Price Range'] = pd.cut(df['Price'],
                              bins=price_range_bins,
                              include_lowest=True,
                              right=False,
                              labels=price_range_labels)

### 4. Save Final Dataframe

In [23]:
drop_cols = [
    'Focal Length',
    'Focal Length Range',
    'Minimum Aperture',
    'Weight',
    'Dimensions',
    'focal_length_raw'
]

final_df = df.drop(drop_cols,axis=1)

In [24]:
final_df.to_csv('nikon_lenses_202009016.csv',index=False)