In [1]:
# Importing the necessary packages
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests
import time
import numpy as np

In [2]:
# Directory that holds the downloaded review pages
folder_name = 'squarelabs_site'
# exist_ok=True makes this a no-op when the directory is already there and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

In [3]:
# Squarelabs breakability review pages to download, one URL per review
# NOTE(review): the second slug reads 'iphone-xs-ks-max' -- confirm this is the live URL
# (the result tables displayed further down contain no iPhone XS rows).
durability_urls = [
    'https://labs.squaretrade.com/breakability/iphone-11-11-pro-and-11-pro-max-breakability',
    'https://labs.squaretrade.com/breakability/iphone-xs-ks-max-breakability',
    'https://labs.squaretrade.com/breakability/iphone-x-breakability',
    'https://labs.squaretrade.com/breakability/iphone-8-8-plus-breakability',
    'https://labs.squaretrade.com/breakability/iphone-se-vs-iphone-6s-6s-plus-breakability',
]

# Request headers sent with every download (a desktop-browser User-Agent string)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
}

In [4]:
# Download every review page listed in durability_urls
# ---
# Each page is saved as an .html file inside `folder_name`; the file name is the last path
# segment of the URL plus '.html'. Keeping a local copy also makes it possible to open the
# pages and read through their tags and classes when writing the parsing code.
# ---
for url in durability_urls:
    time.sleep(3)  # wait 3 seconds before each request so as not to trigger the site's security

    # A timeout keeps the cell from hanging indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)

    # Do not save error pages (4xx/5xx): they would later be parsed like a real review
    # and silently contribute no rows.
    if not response.ok:
        print('Skipping {}: HTTP {}'.format(url, response.status_code))
        continue

    file_path = os.path.join(folder_name, url.split('/')[-1]) + '.html'
    with open(file_path, mode='wb') as file:
        file.write(response.content)

In [5]:
# The dataframes are built from plain lists, so start with an empty list for each needed variable.
df_list = []  # not used by the cells shown here; kept in case other cells reference it
phone_models_os = []  # phone model for each overall score
overall_score = []  # overall score for each phone model reviewed
phone_models_ts = []  # phone model for each individual test result
test_names = []  # name of each individual test
test_scores = []  # score of each individual test (np.nan when the page has no value)

# There are two phone-model lists because two tables are built (overall scores and per-test
# scores) and merged at the end.

# Loop through the downloaded html files.
# sorted() makes the row order reproducible (os.listdir order is arbitrary), and the suffix
# filter skips anything else living in the folder (e.g. .ipynb_checkpoints).
for reviews in sorted(os.listdir(folder_name)):
    if not reviews.endswith('.html'):
        continue

    # Read bytes and let BeautifulSoup detect the encoding instead of relying on the
    # platform default text encoding. The parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    with open(os.path.join(folder_name, reviews), mode='rb') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Get the phone model & overall score: one 'score-info-row' element per phone
    high_scores = soup.find_all(class_='score-info-row')
    for d in high_scores:
        # str() turns the NavigableString into a plain string so the lists do not keep a
        # reference to the whole parse tree of every page.
        phone_model = str(d.find(class_='scorecard-title').find('h3').contents[0])
        phone_models_os.append(phone_model)
        overall_score.append(str(d.find(class_='score-col-wrap').find('p').contents[0]))

        # Get the detailed scores for the various tests
        detailed_scores = d.find_all(class_='scorecard-item table-wrap')
        for e in detailed_scores:
            phone_models_ts.append(phone_model)
            test_names.append(str(e.find(class_='table-cell-item').contents[0]))

            # On the iPhone XS page one Repairability score is empty; record np.nan when the
            # score cell is missing or has no content so the three lists stay aligned.
            score_cell = e.find(class_='table-cell-item score-value')
            if score_cell is not None and score_cell.contents:
                test_score = str(score_cell.contents[0])
                test_scores.append(test_score)
            else:
                test_scores.append(np.nan)

In [6]:
# Build the overall-score table: one row per (phone model, overall score) pair
overall_columns = {
    'phone_models': pd.Series(phone_models_os),
    'overall_score': pd.Series(overall_score),
}
df_overall_score = pd.DataFrame(overall_columns)
df_overall_score

Unnamed: 0,phone_models,overall_score
0,iPhone 11,73
1,iPhone 11 Pro,65
2,iPhone 11 Pro Max,85
3,iPhone 8,67
4,iPhone 8 Plus,74
5,Samsung Galaxy Note8,80
6,iPhone SE,55
7,iPhone 6s,40
8,iPhone 6s Plus,65
9,iPhone X,90


In [7]:
# Build the long-format table of individual test results: one row per (phone model, test) pair
vartest_columns = {
    'phone_models': pd.Series(phone_models_ts),
    'test_name': pd.Series(test_names),
    'test_score': pd.Series(test_scores),
}
df_vartest_score = pd.DataFrame(vartest_columns)

In [8]:
# Preview the first rows of the per-test table
df_vartest_score.head()

Unnamed: 0,phone_models,test_name,test_score
0,iPhone 11,Face Down Drop,80
1,iPhone 11,Back Down Drop,80
2,iPhone 11,Side Drop,10
3,iPhone 11,Water Test,60
4,iPhone 11,Bendability Test,35


In [9]:
# Work on a copy so the scraped table stays untouched, then list the distinct test names:
# the wording of the names varies between reviews.
df_vartest_score_clean = df_vartest_score.copy()
raw_test_names = df_vartest_score_clean['test_name']
print(raw_test_names.unique())
print(raw_test_names.nunique())

['Face Down Drop' 'Back Down Drop' 'Side Drop' 'Water Test'
 'Bendability Test ' 'Tumble Test' 'Materials ' 'Repairability '
 'Bendability Test' 'Materials' 'Repairability' 'Shot Bot' 'Shot Drop'
 'Gripability' 'Weight' 'Corner Drop' 'Bendability']
17


In [10]:
# Strip surrounding whitespace so 'Materials ', 'Repairability ' and 'Bendability Test '
# collapse into their un-padded variants
stripped_test_names = df_vartest_score_clean['test_name'].str.strip()
df_vartest_score_clean['test_name'] = stripped_test_names
print(df_vartest_score_clean['test_name'].unique())
print(df_vartest_score_clean['test_name'].nunique())

['Face Down Drop' 'Back Down Drop' 'Side Drop' 'Water Test'
 'Bendability Test' 'Tumble Test' 'Materials' 'Repairability' 'Shot Bot'
 'Shot Drop' 'Gripability' 'Weight' 'Corner Drop' 'Bendability']
14


In [11]:
# 'Bendability Test' and 'Bendability' are the same test - store both as 'Bendability'
test_name_aliases = {'Bendability Test': 'Bendability'}
df_vartest_score_clean['test_name'] = df_vartest_score_clean['test_name'].replace(test_name_aliases)
print(df_vartest_score_clean['test_name'].unique())
print(df_vartest_score_clean['test_name'].nunique())

['Face Down Drop' 'Back Down Drop' 'Side Drop' 'Water Test' 'Bendability'
 'Tumble Test' 'Materials' 'Repairability' 'Shot Bot' 'Shot Drop'
 'Gripability' 'Weight' 'Corner Drop']
13


In [12]:
# Turn the test names into snake_case identifiers: lower case, spaces replaced by underscores
lowered_test_names = df_vartest_score_clean['test_name'].str.lower()
df_vartest_score_clean['test_name'] = lowered_test_names.str.replace(' ', '_')

In [13]:
# Preview the cleaned per-test table
df_vartest_score_clean.head()

Unnamed: 0,phone_models,test_name,test_score
0,iPhone 11,face_down_drop,80
1,iPhone 11,back_down_drop,80
2,iPhone 11,side_drop,10
3,iPhone 11,water_test,60
4,iPhone 11,bendability,35


In [14]:
# Reshape from long to wide: one row per phone model, one column per test name.
# DataFrame.pivot raises ValueError if a (phone_models, test_name) pair occurs more than once.
scores_wide = df_vartest_score_clean.pivot(
    index='phone_models',
    columns='test_name',
    values='test_score',
)
df_vartest_score_piv = scores_wide.reset_index()
df_vartest_score_piv.head()

test_name,phone_models,back_down_drop,bendability,corner_drop,face_down_drop,gripability,materials,repairability,shot_bot,shot_drop,side_drop,tumble_test,water_test,weight
0,Samsung Galaxy Note8,90.0,60,,100,,80.0,70.0,70.0,,40.0,75,30,
1,iPhone 11,80.0,35,,80,,80.0,80.0,,,10.0,70,60,
2,iPhone 11 Pro,90.0,30,,100,,60.0,85.0,,,10.0,20,10,
3,iPhone 11 Pro Max,60.0,90,,100,,80.0,90.0,,,25.0,75,60,
4,iPhone 6s,,30,30.0,70,50.0,,,,,,10,60,50.0


In [15]:
# Take a separate copy so df_overall_score itself is left unchanged
df_overall_score_merge = df_overall_score.copy(deep=True)

In [16]:
# Merge the two tables to get the overall score and the individual test scores into a single table.
# The merge starts from df_overall_score rather than from df_overall_score_merge itself, so
# re-running this cell gives the same result instead of merging the test columns a second
# time (which would produce duplicated *_x / *_y columns).
# how='left' keeps every phone from the overall-score table, in its original order.
df_overall_score_merge = df_overall_score.merge(df_vartest_score_piv, how='left', on='phone_models')
df_overall_score_merge

Unnamed: 0,phone_models,overall_score,back_down_drop,bendability,corner_drop,face_down_drop,gripability,materials,repairability,shot_bot,shot_drop,side_drop,tumble_test,water_test,weight
0,iPhone 11,73,80.0,35,,80,,80.0,80.0,,,10.0,70,60,
1,iPhone 11 Pro,65,90.0,30,,100,,60.0,85.0,,,10.0,20,10,
2,iPhone 11 Pro Max,85,60.0,90,,100,,80.0,90.0,,,25.0,75,60,
3,iPhone 8,67,40.0,30,,70,,50.0,70.0,50.0,,80.0,80,30,
4,iPhone 8 Plus,74,90.0,60,,90,,90.0,70.0,,40.0,30.0,60,30,
5,Samsung Galaxy Note8,80,90.0,60,,100,,80.0,70.0,70.0,,40.0,75,30,
6,iPhone SE,55,,40,90.0,70,30.0,,,,,,20,100,30.0
7,iPhone 6s,40,,30,30.0,70,50.0,,,,,,10,60,50.0
8,iPhone 6s Plus,65,,20,40.0,70,70.0,,,,,,80,100,80.0
9,iPhone X,90,100.0,30,,100,,60.0,80.0,70.0,,100.0,100,30,


In [17]:
# Write the merged table to a CSV file in the working directory
csv_path = 'iphone_durability_score.csv'
df_overall_score_merge.to_csv(csv_path)

In [18]:
# Show column dtypes and non-null counts of the merged table.
# NOTE(review): the recorded output below lists 12 entries while the merged table displayed
# above has 10 rows, so the saved outputs appear to come from different runs - re-run the
# notebook top to bottom before relying on them.
df_overall_score_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 0 to 11
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   phone_models    12 non-null     object
 1   overall_score   12 non-null     object
 2   back_down_drop  9 non-null      object
 3   bendability     12 non-null     object
 4   corner_drop     3 non-null      object
 5   face_down_drop  12 non-null     object
 6   gripability     3 non-null      object
 7   materials       9 non-null      object
 8   repairability   8 non-null      object
 9   shot_bot        3 non-null      object
 10  shot_drop       1 non-null      object
 11  side_drop       9 non-null      object
 12  tumble_test     12 non-null     object
 13  water_test      12 non-null     object
 14  weight          3 non-null      object
dtypes: object(15)
memory usage: 1.5+ KB
