In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv('data for task 1 - test_shop.csv')
data.head()

Unnamed: 0,SubtreeLength_Bento,TreeLevel,allAbsoluteImageArea,allPseudoBG,allPseudoImageGlobal,allPseudoImageLocal,allRelativeImageArea,allSumNumBG,allSumNumImages,allTrueBG,...,video_container,wcNoneLinks,width,xpath_abs,xpath_rel,z_index,is_shop,href_shop,href_all,href_relative
0,19,21,164280.0,0,0,0,1.0,0,1,0,...,0,3,370,/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/...,"id(""post-3658"")/div[1]/div[3]/div[1]/div[1]/di...",310.0,1,2,2,1.0
1,17,17,60996.0,0,0,0,0.501056,0,1,0,...,0,6,251,/html/body/div[1]/main[1]/div[3]/div[1]/div[2]...,"id(""maincontent"")/div[3]/div[1]/div[2]/div[1]/...",526.0,1,0,2,0.0
2,15,11,65025.0,0,0,0,0.58502,0,1,0,...,0,2,285,/html/body/div[1]/div[1]/div[1]/main[1]/div[2]...,"id(""storecommerce_categorised_product-1"")/sect...",109.0,1,3,3,1.0
3,13,22,72900.0,0,0,0,0.771429,0,1,0,...,0,3,270,/html/body/div[1]/div[1]/div[1]/div[1]/main[1]...,"id(""post-466"")/div[1]/div[1]/div[1]/div[1]/sec...",215.0,1,2,2,1.0
4,10,9,32400.0,0,0,0,0.429116,0,1,0,...,0,0,242,/html/body/div[1]/div[1]/div[1]/div[2]/main[1]...,"id(""main"")/section[2]/div[1]/ul[1]/li[6]",133.0,1,1,2,0.5


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 211 entries, SubtreeLength_Bento to href_relative
dtypes: float64(109), int64(68), object(34)
memory usage: 3.2+ MB


Handling categorical columns

In [4]:
categorical_columns = data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    print(f"Column: {col}")
    print(data[col].value_counts(normalize=True, dropna=False))
    print()

Column: applied_score
applied_score
[]                                                                                                                                          0.7915
NaN                                                                                                                                         0.0175
['First prediction:', 9, 'Second prediction :', 16, 'Third prediction :', 26, 'Final prediction :', 16, 'dom_id:', 1, 'bento_id:', 242]     0.0010
['First prediction:', 9, 'Second prediction :', 26, 'Third prediction :', 10, 'Final prediction :', 26, 'dom_id:', 1, 'bento_id:', 152]     0.0010
['First prediction:', 9, 'Second prediction :', 26, 'Third prediction :', 16, 'Final prediction :', 26, 'dom_id:', 1, 'bento_id:', 144]     0.0010
                                                                                                                                             ...  
['First prediction:', 9, 'Second prediction :', 16, 'Third prediction :', 10, 'Fin

In [5]:
missing_values = data.isnull().mean()

for col in missing_values.index:
    if missing_values[col] > 0:
        print(f"{col}: {missing_values[col]}")

applied_score: 0.0175
builder: 0.9825
builder_flags: 0.4
builder_label_id: 0.9825
classListSubtree: 0.005
classes: 0.056
content: 0.137
contentAfter: 0.1475
contentBefore: 0.1265
extracted_data: 0.4
geometryActual: 0.586
geometryText: 0.012
has_columns: 0.997
href: 0.9385
id: 0.941
idListSubtree: 0.8255
isStatic: 0.4
label_id_before_reclassify: 0.406
note: 0.7835
numDiffSocLinks: 0.4
transform: 0.9695
z_index: 0.4


In [6]:
data['cssHasborderBottomWidth'] = data['cssHasborderBottomWidth'].str.extract('(\d+)').astype(float)
data['cssHasborderLeftWidth'] = data['cssHasborderLeftWidth'].str.extract('(\d+)').astype(float)
data['cssHasborderRightWidth'] = data['cssHasborderRightWidth'].str.extract('(\d+)').astype(float)
data['cssborderTopWidth'] = data['cssborderTopWidth'].str.extract('(\d+)').astype(float)
data['background-color'] = data['background-color'].replace('rgba(0, 0, 0, 0)', 'rgba(0, 0, 0)')

Reasons to drop the columns:

For applied_score, category [] and nan are the same. They don't contain info and they form the majority. Same goes to builder_flags.

Column content does not give info.

'contentAfter', 'contentBefore', 'builder','has_columns', 'href', 'id', 'idListSubtree', 'note', 'transform' have majority missing value

'isStatic' is either False or missing, so not informative

'prediction_data' - one value


In [7]:
data.drop(['applied_score', 'builder', 'builder_flags', 'builder_label_id', 'content', 'contentAfter', 'contentBefore', 'has_columns', 'href', 'id', 'idListSubtree', 'isStatic', 'note', 'prediction_data', 'transform', 'cssBackgroundImageLink'], axis = 1, inplace=True)

In [8]:
missing_values = data.isnull().mean()

for col in missing_values.index:
    if missing_values[col] > 0:
        print(f"{col}: {missing_values[col]}")

classListSubtree: 0.005
classes: 0.056
extracted_data: 0.4
geometryActual: 0.586
geometryText: 0.012
label_id_before_reclassify: 0.406
numDiffSocLinks: 0.4
z_index: 0.4


In [9]:
data['classListSubtree'] = data['classListSubtree'].str.split(',')

Extracting info from geometryActual and geometryText

In [10]:
data['geometryActual'] = data['geometryActual'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else np.nan)

In [11]:
data['height_actual'] = data['geometryActual'].apply(lambda x: x.get('height') if pd.notnull(x) else None)
data['left_actual'] = data['geometryActual'].apply(lambda x: x.get('left') if pd.notnull(x) else None)
data['top_actual'] = data['geometryActual'].apply(lambda x: x.get('top') if pd.notnull(x) else None)
data['width_actual'] = data['geometryActual'].apply(lambda x: x.get('width') if pd.notnull(x) else None)

In [12]:
data['geometryText'] = data['geometryText'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else np.nan)

In [13]:
data['height_text'] = data['geometryText'].apply(lambda x: x.get('height') if pd.notnull(x) else None)
data['left_text'] = data['geometryText'].apply(lambda x: x.get('left') if pd.notnull(x) else None)
data['top_text'] = data['geometryText'].apply(lambda x: x.get('top') if pd.notnull(x) else None)
data['width_text'] = data['geometryText'].apply(lambda x: x.get('width') if pd.notnull(x) else None)

In [14]:
data['extracted_data'].value_counts(normalize=True, dropna=False)

extracted_data
NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

Apparently the information in other columns are extracted from this column or match with the values, so we can drop this (this is an intuitive decision but in case of some more investigation can be checked)

In [15]:
numeric_columns = data.select_dtypes(exclude=['object']).columns

for col in numeric_columns:
    unique_values = data[col].nunique()
    if unique_values < 3:
        print(f"Column: {col}")
        print(data[col].value_counts(normalize=True, dropna=False))
        print()

Column: allPseudoImageLocal
allPseudoImageLocal
0    1.0
Name: proportion, dtype: float64

Column: cssBackgroundImage
cssBackgroundImage
0    0.976
1    0.024
Name: proportion, dtype: float64

Column: cssHasAnimation
cssHasAnimation
0    1.0
Name: proportion, dtype: float64

Column: cssHasBorder
cssHasBorder
0    0.9015
1    0.0985
Name: proportion, dtype: float64

Column: cursor
cursor
1    0.932
0    0.068
Name: proportion, dtype: float64

Column: hasContBOrA
hasContBOrA
0    0.836
1    0.164
Name: proportion, dtype: float64

Column: id_14_liknes
id_14_liknes
0.0    0.9995
0.1    0.0005
Name: proportion, dtype: float64

Column: id_17_liknes
id_17_liknes
0    1.0
Name: proportion, dtype: float64

Column: id_21_liknes
id_21_liknes
0.000000    0.9975
0.076923    0.0025
Name: proportion, dtype: float64

Column: id_22_liknes
id_22_liknes
0.000000    0.999
0.021978    0.001
Name: proportion, dtype: float64

Column: id_23_liknes
id_23_liknes
0    1.0
Name: proportion, dtype: float64

Column

'allPseudoImageLocal', 'cssHasAnimation', 'id_17_liknes', 'id_23_liknes', 'largePseudoImageLocal', 'modified', 'pseudoImageLocal', 'video_container',  - one class

'cssBackgroundImage', 'cssHasBorder', 'id_14_liknes', 'id_21_liknes', 'id_22_liknes', 'id_24_liknes', 'id_25_liknes', 'id_2_liknes', 'isLeafNodeInHTML', 'localWordCount', 'localCharCount', 'numVideos'  - one class dominates

'numDiffSocLinks' - 40% missing, 59.7% one class

In [16]:
data.drop(columns = ['geometryActual', 'geometryText', 'extracted_data', 'allPseudoImageLocal', 'cssHasAnimation', 'id_17_liknes', 'id_23_liknes', 'largePseudoImageLocal', 'modified', 'pseudoImageLocal', 'video_container', 'cssBackgroundImage', 'cssHasBorder', 'id_14_liknes', 'id_21_liknes', 'id_22_liknes', 'id_24_liknes', 'id_25_liknes', 'id_2_liknes', 'isLeafNodeInHTML', 'localWordCount', 'localCharCount', 'numVideos', 'numDiffSocLinks'], axis = 1, inplace=True)

In [17]:
missing_values = data.isnull().mean()

for col in missing_values.index:
    if missing_values[col] > 0:
        print(f"{col}: {missing_values[col]}")

classListSubtree: 0.005
classes: 0.056
label_id_before_reclassify: 0.406
z_index: 0.4
height_actual: 0.586
left_actual: 0.586
top_actual: 0.586
width_actual: 0.586
height_text: 0.012
left_text: 0.012
top_text: 0.012
width_text: 0.012


In [18]:
data['classes']

0                                   product-inner-wrapper
1       item,product,product-item,slick-slide,slick-ac...
2             col-xs-6,col-sm-6,col-md-3,product-ful-widt
3       ast-article-single,product,type-product,post-7...
4       product,type-product,post-8480,status-publish,...
                              ...                        
1995                                             foot_top
1996    page-section,pt-1,js-handle-pageSection,js-id-...
1997                                         text-content
1998           new-rnm-marketing-benefit-main-content-div
1999                                blog-post,astrid-3col
Name: classes, Length: 2000, dtype: object

In [19]:
data['classListSubtree']

0       [product-inner-wrapper, product-inner, product...
1       [item, product, product-item, slick-slide, sli...
2       [col-xs-6, col-sm-6, col-md-3, product-ful-wid...
3       [ast-article-single, product, type-product, po...
4       [product, type-product, post-8480, status-publ...
                              ...                        
1995    [foot_top, container, col-md-3, f_eproducts, f...
1996    [page-section, pt-1, js-handle-pagesection, js...
1997    [text-content, inner-container, titulo, whymvc...
1998    [new-rnm-marketing-benefit-main-content-div, n...
1999    [blog-post, astrid-3col, post-thumb, attachmen...
Name: classListSubtree, Length: 2000, dtype: object

Assumption that classes and classListSubtree are highly correlated

In [20]:
data['classListSubtree'][data['classListSubtree'].isna()].index

Index([612, 1112, 1216, 1234, 1416, 1450, 1699, 1728, 1767, 1944], dtype='int64')

In [21]:
data['classes'][data['classListSubtree'][data['classListSubtree'].isna()].index]

612                                                   NaN
1112    products,row,row-small,large-columns-4,medium-...
1216                                                  NaN
1234                                                  NaN
1416                                                  NaN
1450                                                  NaN
1699                                                  NaN
1728                                                  NaN
1767                                                  NaN
1944                                                  NaN
Name: classes, dtype: object

In [22]:
data.loc[1112, 'classListSubtree'] = data.loc[1112, 'classes']


In [23]:
data.drop('classes', axis = 1, inplace= True)

Fill missing values based on the correlation of columns

In [24]:
def fill_missing_correlated_values(df, column_pairs):
    filled_df = df.copy()
    for pair in column_pairs:
        column1, column2 = pair
        
        subset = filled_df[[column1, column2]].dropna()

        X = subset[column1].values.reshape(-1, 1)
        y = subset[column2].values

        model = LinearRegression()
        model.fit(X, y)

        for index, row in filled_df.iterrows():
            if pd.isnull(row[column2]) and not pd.isnull(row[column1]):
                predicted_value = model.predict([[row[column1]]])
                filled_df.loc[index, column2] = predicted_value
    
    filled_df.dropna(subset=[column_pairs[0][0], column_pairs[0][1]], inplace=True)
    
    return filled_df

In [25]:
columns = ['height_actual', 'left_actual', 'top_actual', 'width_actual', 'height_text', 'left_text', 'top_text', 'width_text']

pairing_dict = {}
for column in columns:
    first_word = column.split('_')[0]
    if first_word not in pairing_dict:
        pairing_dict[first_word] = []
    pairing_dict[first_word].append(column)

for key, values in pairing_dict.items():
    selected_columns = data[values]
    correlation_matrix = selected_columns.corr()
    print(f"Pair: {key}")
    print(correlation_matrix)
    print()

Pair: height
               height_actual  height_text
height_actual       1.000000     0.980495
height_text         0.980495     1.000000

Pair: left
             left_actual  left_text
left_actual     1.000000   0.944931
left_text       0.944931   1.000000

Pair: top
            top_actual  top_text
top_actual    1.000000  0.997806
top_text      0.997806  1.000000

Pair: width
              width_actual  width_text
width_actual      1.000000    0.856792
width_text        0.856792    1.000000



In [26]:
column_pairs = [(col1, col2) for col1, col2 in pairing_dict.values()]
column_pairs


[('height_actual', 'height_text'),
 ('left_actual', 'left_text'),
 ('top_actual', 'top_text'),
 ('width_actual', 'width_text')]

In [27]:
data_filled = fill_missing_correlated_values(data,column_pairs)

In [28]:
missing_values_after_filled = data_filled.isnull().mean()

for col in missing_values_after_filled.index:
    if missing_values_after_filled[col] > 0:
        print(f"{col}: {missing_values_after_filled[col]}")

classListSubtree: 0.006038647342995169
label_id_before_reclassify: 0.27294685990338163
z_index: 0.2669082125603865


Impute missing values with KNNImputer

In [29]:
imputer = KNNImputer(n_neighbors=5)

In [30]:
label_id_before_reclassify_filled = imputer.fit_transform(data_filled[['label_id_before_reclassify']])

data_filled['label_id_before_reclassify'] = label_id_before_reclassify_filled

In [31]:
z_index_filled = imputer.fit_transform(data_filled[['z_index']])

data_filled['z_index'] = z_index_filled

In [32]:
data_filled['classListSubtree'].value_counts()

classListSubtree
[woocommerce-loopproduct-link, woocommerce-loop-product__link, product-image-container, product-loop-image, woocommerce-loop-product__title, qala-wishlist-button, qala-wishlist-button__icon, heart, artwork-artist, price, woocommerce-price-amount, amount, woocommerce-price-currencysymbol]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

These are CSS classes assosiated with the web element. There are many classes and keeping in a list and one-hot encode them makes no sense. It requires a lot more domain knowledge to make a decision. As I can't understand better what to do, I will drop it to continue with the model. 

In [33]:
data_filled.drop('classListSubtree', axis= 1, inplace = True)

In [34]:
data_filled.select_dtypes(include='object')

Unnamed: 0,background-color,cssColor,cssPosition,global_href_Bento,html_tag,similarSiblingsTagsList_Bento,tag,tagsSubtree,xpath_abs,xpath_rel
0,"rgba(0, 0, 0)","rgb(28, 28, 28)",relative,['https://hortisem.com/producto/pepinillo-eun-...,div,"[['div', 'div', 'div', 'a', 'img', 'div', 'h3'...",div_list,"['div', 'div', 'div', 'a', 'img', 'div', 'h3',...",/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/...,"id(""post-3658"")/div[1]/div[3]/div[1]/div[1]/di..."
1,"rgba(0, 0, 0)","rgb(51, 51, 51)",static,['https://storage.googleapis.com/10webx/2021-0...,li,"[['li', 'div', 'div', 'div', 'a', 'span', 'spa...",item_list,"['li', 'div', 'div', 'div', 'a', 'span', 'span...",/html/body/div[1]/main[1]/div[3]/div[1]/div[2]...,"id(""maincontent"")/div[3]/div[1]/div[2]/div[1]/..."
6,"rgba(0, 0, 0)","rgb(102, 102, 102)",relative,['https://www.goingreen.co.uk/product/renault-...,div,"[['div'], ['div', 'span', 'a', 'img', 'span', ...",div_list,"['div', 'span', 'a', 'img', 'span', 'a', 'div'...",/html/body/div[1]/div[6]/div[2]/div[1]/div[1]/...,"id(""wa_chpc_slider"")/div[2]"
9,"rgba(0, 0, 0)","rgb(34, 34, 34)",absolute,['https://petpricelist.com/shop/funny-i-do-wha...,div,"[['div', 'div', 'div', 'div', 'div', 'a', 'img...",div_list,"['div', 'div', 'div', 'div', 'div', 'a', 'img'...",/html/body/div[2]/main[1]/div[1]/div[6]/div[1]...,"id(""content"")/div[6]/div[1]/div[1]/div[2]"
11,"rgba(0, 0, 0)","rgb(102, 102, 102)",static,['https://storage.googleapis.com/10webx/2021-0...,div,"[['div', 'div', 'a', 'div', 'img', 'div', 'div...",div_list,"['div', 'div', 'a', 'div', 'img', 'div', 'div'...",/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/...,"id(""sw_woo_slider_widget_2"")/div[2]/div[1]/div..."
...,...,...,...,...,...,...,...,...,...,...
1977,"rgb(255, 255, 255)","rgb(92, 98, 100)",relative,['/News/ipa-estonia-host-international-womens-...,a,"[['a', 'div', 'div', 'span', 'img', 'div', 'h4...",link_list,"['a', 'div', 'div', 'span', 'img', 'div', 'h4'...",/html/body/form[1]/div[1]/div[1]/div[4]/div[1]...,"id(""ctl00_ctl00_CorePlaceHolder_DisplayPagePla..."
1978,"rgba(0, 0, 0)","rgb(103, 103, 103)",relative,['https://storage.googleapis.com/10webx/2019-0...,div,"[['div', 'div', 'div', 'div', 'div', 'video', ...",div_list,"['div', 'div', 'div', 'div', 'div', 'video', '...",/html/body/div[3]/div[1]/div[1]/div[1],"id(""ajax-content-wrap"")/div[1]/div[1]/div[1]"
1983,"rgba(0, 0, 0)","rgb(26, 26, 26)",static,['amenities.aspx'],div,"[['div'], ['div', 'div', 'div', 'div', 'div', ...",div_list,"['div', 'div', 'div', 'div', 'div', 'div', 'ul...",/html/body/div[1]/div[1]/div[1]/section[6]/div...,"id(""wrapper"")/section[6]/div[1]/div[2]"
1986,"rgba(0, 0, 0)","rgb(51, 51, 51)",static,[],div,"[['div', 'div', 'div', 'div', 'span', 'span', ...",div_list,"['div', 'div', 'div', 'div', 'span', 'span', '...",/html/body/div[1]/div[4]/div[1]/div[1]/div[2],"id(""home"")/div[1]/div[4]/div[1]/div[1]/div[2]"


In [35]:
(data_filled['similarSiblingsTagsList_Bento'].apply(lambda x: ast.literal_eval(x)[0][0] if pd.notnull(x) else None) == data_filled['html_tag']).mean()

0.9577294685990339

The first element of similarSiblingsTagsList_Bento arrays is 95.8% matching with 'html_tag'

In [36]:
data_filled.drop(columns=['similarSiblingsTagsList_Bento', 'global_href_Bento', 'tagsSubtree', 'xpath_abs', 'xpath_rel'], axis= 1, inplace=True)

In [37]:
data_filled.select_dtypes(include='object')

Unnamed: 0,background-color,cssColor,cssPosition,html_tag,tag
0,"rgba(0, 0, 0)","rgb(28, 28, 28)",relative,div,div_list
1,"rgba(0, 0, 0)","rgb(51, 51, 51)",static,li,item_list
6,"rgba(0, 0, 0)","rgb(102, 102, 102)",relative,div,div_list
9,"rgba(0, 0, 0)","rgb(34, 34, 34)",absolute,div,div_list
11,"rgba(0, 0, 0)","rgb(102, 102, 102)",static,div,div_list
...,...,...,...,...,...
1977,"rgb(255, 255, 255)","rgb(92, 98, 100)",relative,a,link_list
1978,"rgba(0, 0, 0)","rgb(103, 103, 103)",relative,div,div_list
1983,"rgba(0, 0, 0)","rgb(26, 26, 26)",static,div,div_list
1986,"rgba(0, 0, 0)","rgb(51, 51, 51)",static,div,div_list


One-Hot Encoding

In [38]:
cat_cols = ['background-color', 'cssColor', 'cssPosition', 'html_tag', 'tag']
encoded_data = pd.get_dummies(data_filled, columns=cat_cols, drop_first=False)

Model

In [39]:
X = encoded_data.drop('is_shop', axis=1)
y = encoded_data['is_shop']

In [40]:
y.value_counts(normalize=True)

is_shop
1    0.718599
0    0.281401
Name: proportion, dtype: float64

Target variable is inbalanced therefore we cannot evaluate the model with accuracy. I selected f1_score for evaluation as a balanced scale of precision and recall.

Random forest is is an essemble method which helps not to overfit, robust to noise and outliers, and can handle inbalanced classes

In [45]:
rf_classifier = RandomForestClassifier()

In [46]:
cv_scores = cross_val_score(rf_classifier, X, y, cv=5, scoring='f1')

print("Cross-Validation Scores:", cv_scores)
print("Mean F1 Score:", cv_scores.mean())

Cross-Validation Scores: [0.85920578 1.         1.         1.         0.87203791]
Mean F1 Score: 0.9462487381730457


F1 score indicated high-level performance and good balance between precision and recall.

There are many assumptions and not well-proved decisions made in the notebook but in case of more time to improve domain knowledge and to try different methods, it is possible to improve the preprocessing of the data and thus the model performance overall. 