In [3]:
import numpy as np
import pandas as pd
from skrebate.vlsrelief import VLSRelief

In [4]:
train = pd.read_csv('../processed-datasets/small-train.csv', index_col=False)

In [5]:
def process_df(df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
    df = df.drop('Color', axis=1)
    df['Rating'] = df['Rating'].str.replace(',', '.')
    df['Rating'] = df['Rating'].astype(float)

    # if is_train:
    #     df['Brand weight'] = ""
    #     for category in df['Brand'].unique():
    #         total_rating = df.loc[df['Brand'] == category].count() / df.shape[0]
    #         df.loc[df['Brand'] == category, 'Brand weight'] = total_rating

    df = df.drop([
        'Name',
        'Category',
        'Max price',
        'Min price',
        'Final price',
        'Average price',
        'full_category',
        'Basic Sale Price',
        'Seller',
        'Base price'
    ], axis=1)
    df['Days in stock/sales'] = df['Days with sales'].div(df['Days in stock'], axis=0).apply(
        lambda x: 1.0 if x >= 1.0 else x)
    df['Comments-Rating'] = df['Comments'] * df['Rating']
    df['Rating-Days-Comments'] = np.exp(df['Days with sales']) * df['Rating'] * df['Comments']

    return df.dropna() if is_train else df

In [6]:
origin_train = process_df(train, True)

origin_train

Unnamed: 0,Brand,Comments,Sales,Days in stock,Days with sales,Rating,Basic Sale,Days in stock/sales,Comments-Rating,Rating-Days-Comments
0,Bosch,0,2,23,2,0.0,0,0.086957,0.0,0.000000e+00
1,Red Beard,5,4,4,3,5.0,50,0.750000,25.0,5.021384e+02
2,Данилов Иван Валерьевич ИП,0,2,5,2,0.0,0,0.400000,0.0,0.000000e+00
3,RIPOMA,1,1,27,1,2.0,70,0.037037,2.0,5.436564e+00
4,ARC,4,47,20,18,5.0,25,0.900000,20.0,1.313199e+09
...,...,...,...,...,...,...,...,...,...,...
112,BIRONI,0,5,31,1,0.0,0,0.032258,0.0,0.000000e+00
113,BASEUS,0,0,9,0,0.0,20,0.000000,0.0,0.000000e+00
114,Sid&Ko,0,2,31,2,0.0,30,0.064516,0.0,0.000000e+00
115,TM SAKURA,20,6,30,4,3.0,33,0.133333,60.0,3.275889e+03


In [7]:
origin_train.shape

(110, 10)

In [11]:
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder

features = origin_train.drop(['Sales'], axis=1).apply(LabelEncoder().fit_transform)

poly = PolynomialFeatures(degree=2)
transformed_features = poly.fit_transform(features)
target = origin_train['Sales']

poly_feature_headers = poly.get_feature_names_out(features.columns.values)
print(poly_feature_headers)

vls = VLSRelief("surfstar", n_features_to_select=5, verbose=True)
res: VLSRelief = vls.fit(X=transformed_features, y=target.values, headers=poly_feature_headers)

res

['1' 'Brand' 'Comments' 'Days in stock' 'Days with sales' 'Rating'
 'Basic Sale' 'Days in stock/sales' 'Comments-Rating'
 'Rating-Days-Comments' 'Brand^2' 'Brand Comments' 'Brand Days in stock'
 'Brand Days with sales' 'Brand Rating' 'Brand Basic Sale'
 'Brand Days in stock/sales' 'Brand Comments-Rating'
 'Brand Rating-Days-Comments' 'Comments^2' 'Comments Days in stock'
 'Comments Days with sales' 'Comments Rating' 'Comments Basic Sale'
 'Comments Days in stock/sales' 'Comments Comments-Rating'
 'Comments Rating-Days-Comments' 'Days in stock^2'
 'Days in stock Days with sales' 'Days in stock Rating'
 'Days in stock Basic Sale' 'Days in stock Days in stock/sales'
 'Days in stock Comments-Rating' 'Days in stock Rating-Days-Comments'
 'Days with sales^2' 'Days with sales Rating' 'Days with sales Basic Sale'
 'Days with sales Days in stock/sales' 'Days with sales Comments-Rating'
 'Days with sales Rating-Days-Comments' 'Rating^2' 'Rating Basic Sale'
 'Rating Days in stock/sales' 'Rating C

In [12]:
np.array(res.feat_score)[res.top_features_]

array([[ 4.10000000e+01,  1.01254042e-01],
       [ 2.90000000e+01,  5.00813453e-02],
       [ 4.80000000e+01,  2.55878705e-02],
       [ 0.00000000e+00,  0.00000000e+00],
       [ 4.00000000e+01, -1.81818182e-02],
       [ 4.70000000e+01, -2.16900642e-02],
       [ 2.30000000e+01, -3.80386635e-02],
       [ 3.60000000e+01, -4.27135025e-02],
       [ 4.50000000e+01, -4.49082798e-02],
       [ 4.60000000e+01, -5.50975471e-02],
       [ 3.00000000e+01, -5.77817248e-02],
       [ 1.50000000e+01, -5.79036754e-02],
       [ 3.30000000e+01, -6.00807538e-02],
       [ 2.70000000e+01, -9.26582740e-02],
       [ 3.00000000e+00, -9.89490641e-02],
       [ 6.00000000e+00, -1.07949157e-01],
       [ 1.40000000e+01, -1.11019166e-01],
       [ 5.00000000e+00, -1.22315475e-01],
       [ 1.20000000e+01, -1.33002498e-01],
       [ 1.60000000e+01, -1.54540076e-01],
       [ 2.80000000e+01, -1.59670505e-01],
       [ 9.00000000e+00, -1.59726160e-01],
       [ 1.00000000e+01, -1.60054142e-01],
       [ 3.

In [13]:
# print(res.header_top_features_)
# print(np.array(res.feat_score)[res.top_features_][:,1], np.array(res.headers)[res.top_features_])
reports = pd.DataFrame({'Score': np.array(res.feat_score)[res.top_features_][:,1], 'Name': np.array(res.headers)[res.top_features_]})
# {'Score': np.array(res.feat_score)[res.top_features_], 'Name': np.array(res.headers)[res.top_features_]}
# reports['Score'] = np.array(res.feat_score)[res.top_features_]

reports.to_csv('../processed-datasets/feature_socres.csv')
reports

Unnamed: 0,Score,Name
0,0.101254,Rating^2
1,0.050081,Days in stock Rating
2,0.025588,Basic Sale Comments-Rating
3,0.0,1
4,-0.018182,Days with sales Rating-Days-Comments
5,-0.02169,Basic Sale Days in stock/sales
6,-0.038039,Comments Basic Sale
7,-0.042714,Days with sales Rating
8,-0.044908,Rating Rating-Days-Comments
9,-0.055098,Basic Sale^2
