## Developing and Testing Name Similarity Score Feature

In [11]:
# !pip install --upgrade pip
# !pip install nltk

# !pip install --upgrade pip
# !pip install --upgrade mxnet autogluon

# import nltk
# nltk.download('stopwords')

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

Collecting pip
  Using cached pip-21.0.1-py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3.3
    Uninstalling pip-20.3.3:
      Successfully uninstalled pip-20.3.3
Successfully installed pip-21.0.1
Collecting mxnet
  Downloading mxnet-1.7.0.post2-py2.py3-none-manylinux2014_x86_64.whl (54.7 MB)
[K     |████████████████████████████████| 54.7 MB 19 kB/s s eta 0:00:01     |████████████████████████        | 40.9 MB 16.8 MB/s eta 0:00:01
[?25hCollecting autogluon
  Downloading autogluon-0.0.15-py3-none-any.whl (622 kB)
[K     |████████████████████████████████| 622 kB 71.2 MB/s eta 0:00:01
Collecting openml
  Downloading openml-0.11.0.tar.gz (110 kB)
[K     |████████████████████████████████| 110 kB 79.9 MB/s eta 0:00:01
Collecting ConfigSpace<=0.4.10
  Downloading ConfigSpace-0.4.10.tar.gz (882 kB)
[K     |████████████████████████████████| 882 kB 50.2 MB/s eta 0:00:01
[?25hCollecting gluoncv<0.9.0,>=0.5.0
 

In [3]:
training_data = pd.read_csv('../../data/final_project/training.csv')
test_data = pd.read_csv('../../data/final_project/public_test_features.csv')

print('The shape of the training dataset is:', training_data.shape)
print('The shape of the test dataset is:', test_data.shape)

The shape of the training dataset is: (36803, 228)
The shape of the test dataset is: (15774, 227)


In [None]:
text_features = ["key_item_name", "cand_item_name"]

df_train = training_data[text_features].astype('str')
df_test = test_data[text_features].astype('str')

In [None]:
pd.set_option("display.max_colwidth", 200)
training_data[text_features+["label"]].head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, min_df=1)
similarity = []
for i in range(df_train.shape[0]):
    train_corpus = df_train.iloc[i,:].values
    X = vectorizer.fit_transform(train_corpus)
    X = X.toarray()
    similarity += [np.dot(X,X.T)[0,1]]

In [None]:
training_data["name_similarity_score"] = similarity

In [None]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, min_df=1)
similarity = []
for i in range(df_test.shape[0]):
    test_corpus = df_test.iloc[i,:].values
    X = vectorizer.fit_transform(test_corpus)
    X = X.toarray()
    similarity += [np.dot(X,X.T)[0,1]]

In [None]:
test_data["name_similarity_score"] = similarity

In [None]:
training_data[["label","name_similarity_score"]].corr()

In [None]:
training_data["name_similarity_score"].value_counts(bins=10)

## Developing and testing imputing package weight using regressive models

In [50]:
numerical_features = ["key_fma_qualified_price_max"]

categorical_features = ["key_Product Group Description","key_is_conveyable","key_Is Sortable",
                        "key_binding_description","key_classification_description","key_item_package_quantity"]

model_features = numerical_features + categorical_features
label = ["key_pkg_length"]

df_train = training_data[model_features + label + ["ID"]]
df_test = test_data[model_features + label + ["ID"]]

df_train[categorical_features] = df_train[categorical_features].astype('str')
df_train[categorical_features] = df_train[categorical_features].astype('str')

print(df_train.shape,df_test.shape)

(36803, 9) (15774, 9)


In [51]:
df = pd.concat([df_train,df_test])
df_train_new = df[df["key_pkg_length"].notna()][model_features + label + ["ID"]]
df_test_new = df[df["key_pkg_length"].isna()][model_features + ["ID"]]
print(df_train_new.shape, df_test_new.shape)

(47682, 9) (4895, 8)


In [52]:
df_train_new.isna().sum()

key_fma_qualified_price_max       1735
key_Product Group Description        0
key_is_conveyable                    0
key_Is Sortable                      0
key_binding_description           1330
key_classification_description       0
key_item_package_quantity          793
key_pkg_length                       0
ID                                   0
dtype: int64

In [53]:
df_test_new.isna().sum()

key_fma_qualified_price_max       1743
key_Product Group Description        0
key_is_conveyable                    0
key_Is Sortable                      0
key_binding_description            687
key_classification_description       0
key_item_package_quantity          444
ID                                   0
dtype: int64

In [54]:
df_train_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47682 entries, 0 to 15772
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   key_fma_qualified_price_max     45947 non-null  float64
 1   key_Product Group Description   47682 non-null  object 
 2   key_is_conveyable               47682 non-null  object 
 3   key_Is Sortable                 47682 non-null  object 
 4   key_binding_description         46352 non-null  object 
 5   key_classification_description  47682 non-null  object 
 6   key_item_package_quantity       46889 non-null  object 
 7   key_pkg_length                  47682 non-null  float64
 8   ID                              47682 non-null  int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 3.6+ MB


In [55]:
df_train_new.corr()

Unnamed: 0,key_fma_qualified_price_max,key_pkg_length,ID
key_fma_qualified_price_max,1.0,0.376536,-0.00815
key_pkg_length,0.376536,1.0,-0.010887
ID,-0.00815,-0.010887,1.0


In [56]:
from autogluon import TabularPrediction as task

metric = 'root_mean_squared_error'

predictor = task.fit(train_data=df_train_new, 
                     label='key_pkg_length',
                     eval_metric=metric,
                    excluded_model_types=["NN"],
                    id_columns=["ID"])

No output_directory specified. Models will be saved in: AutogluonModels/ag-20210214_202047/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20210214_202047/
AutoGluon Version:  0.0.15
Train Data Rows:    47682
Train Data Columns: 8
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (94.0, 0.0, 12.82366, 10.92852)
	If 'regression' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Dropping ID columns: ['ID']
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6481.2 MB
	Train Data (Original)  Memory Usage: 18.86 MB (0.3% of available memory)
	Inferring data type of each feature based on column values. Set

[1000]	train_set's rmse: 1.77796	valid_set's rmse: 1.81066
[2000]	train_set's rmse: 1.40055	valid_set's rmse: 1.443
[3000]	train_set's rmse: 1.21122	valid_set's rmse: 1.25391
[4000]	train_set's rmse: 1.08295	valid_set's rmse: 1.12065
[5000]	train_set's rmse: 1.01185	valid_set's rmse: 1.04788
[6000]	train_set's rmse: 0.969095	valid_set's rmse: 1.00392
[7000]	train_set's rmse: 0.938919	valid_set's rmse: 0.97062
[8000]	train_set's rmse: 0.9178	valid_set's rmse: 0.948317
[9000]	train_set's rmse: 0.903266	valid_set's rmse: 0.933001
[10000]	train_set's rmse: 0.893524	valid_set's rmse: 0.922984


	-0.923	 = Validation root_mean_squared_error score
	40.93s	 = Training runtime
	4.92s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-1.0052	 = Validation root_mean_squared_error score
	95.52s	 = Training runtime
	0.04s	 = Validation runtime
Fitting model: LightGBMRegressorCustom ...
	-0.8937	 = Validation root_mean_squared_error score
	6.23s	 = Training runtime
	0.26s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.3617	 = Validation root_mean_squared_error score
	0.49s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 178.61s ...


In [57]:
predictor.leaderboard(extra_info=True, silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,num_features,...,child_model_type,hyperparameters,hyperparameters_fit,AG_args_fit,features,child_hyperparameters,child_hyperparameters_fit,child_AG_args_fit,ancestors,descendants
0,ExtraTreesRegressorMSE,-0.361663,0.208006,6.319477,0.208006,6.319477,0,True,2,6,...,,"{'n_estimators': 300, 'n_jobs': -1, 'random_st...",{'n_estimators': 300},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[key_fma_qualified_price_max, key_Product Grou...",,,,[],[weighted_ensemble_k0_l1]
1,weighted_ensemble_k0_l1,-0.361663,0.208952,6.804983,0.000946,0.485506,1,True,9,1,...,GreedyWeightedEnsembleModel,"{'max_models': 25, 'max_models_per_type': 5}",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[ExtraTreesRegressorMSE],{'ensemble_size': 100},{'ensemble_size': 7},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[ExtraTreesRegressorMSE],[]
2,RandomForestRegressorMSE,-0.362021,0.207907,8.847162,0.207907,8.847162,0,True,1,6,...,,"{'n_estimators': 300, 'n_jobs': -1, 'random_st...",{'n_estimators': 300},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[key_fma_qualified_price_max, key_Product Grou...",,,,[],[]
3,LightGBMRegressorCustom,-0.893733,0.262995,6.22969,0.262995,6.22969,0,True,8,6,...,,"{'num_boost_round': 10000, 'num_threads': -1, ...",{'num_boost_round': 795},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[key_fma_qualified_price_max, key_Product Grou...",,,,[],[]
4,LightGBMRegressor,-0.899736,0.114644,2.974011,0.114644,2.974011,0,True,5,6,...,,"{'num_boost_round': 10000, 'num_threads': -1, ...",{'num_boost_round': 772},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[key_fma_qualified_price_max, key_Product Grou...",,,,[],[]
5,LightGBMRegressorXT,-0.922984,4.921404,40.934179,4.921404,40.934179,0,True,6,6,...,,"{'num_boost_round': 10000, 'num_threads': -1, ...",{'num_boost_round': 9999},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[key_fma_qualified_price_max, key_Product Grou...",,,,[],[]
6,CatboostRegressor,-1.005224,0.035657,95.519052,0.035657,95.519052,0,True,7,6,...,,"{'iterations': 10000, 'learning_rate': 0.1, 'r...",{'iterations': 3699},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[key_fma_qualified_price_max, key_Product Grou...",,,,[],[]
7,KNeighborsRegressorUnif,-4.41966,0.104081,0.024021,0.104081,0.024021,0,True,3,1,...,,"{'weights': 'uniform', 'n_jobs': -1}",{},"{'ignored_type_group_raw': ['category', 'objec...",[key_fma_qualified_price_max],,,,[],[]
8,KNeighborsRegressorDist,-4.41966,0.104417,0.026436,0.104417,0.026436,0,True,4,1,...,,"{'weights': 'distance', 'n_jobs': -1}",{},"{'ignored_type_group_raw': ['category', 'objec...",[key_fma_qualified_price_max],,,,[],[]


In [58]:
predictor.feature_importance(df_train_new)

Computing raw permutation importance for 8 features on weighted_ensemble_k0_l1 ...
	3.08s	= Expected runtime
	2.1s	= Actual runtime


key_is_conveyable                 9.870161
key_Is Sortable                   5.809846
key_fma_qualified_price_max       5.204661
key_Product Group Description     4.164964
key_binding_description           3.536981
key_item_package_quantity         0.704571
ID                                0.000000
key_classification_description    0.000000
dtype: float64

In [59]:
predictor.get_model_names()

['RandomForestRegressorMSE',
 'ExtraTreesRegressorMSE',
 'KNeighborsRegressorUnif',
 'KNeighborsRegressorDist',
 'LightGBMRegressor',
 'LightGBMRegressorXT',
 'CatboostRegressor',
 'LightGBMRegressorCustom',
 'weighted_ensemble_k0_l1']

In [81]:
test_predictions = predictor.predict(df_test_new)
df_test_new["key_pkg_length"] = test_predictions
print(df_train_new.shape, df_test_new.shape)

(47682, 9) (4895, 10)


In [83]:
df = pd.concat([df_train_new,df_test_new])[["ID","key_pkg_length"]]
df.shape

Unnamed: 0,ID,key_pkg_length
0,34016,20.0
1,3581,4.8
2,36025,7.2
4,14628,9.2
5,12882,51.6
7,29187,10.0
8,40670,6.377953
9,8364,7.4
10,38295,7.7
11,21947,4.7


In [90]:
df_train = df_train.drop(columns=["key_pkg_length"])
df_train = df_train.merge(df,
         on=["ID"],
         how='inner'
         )
df_train.head(10)

Unnamed: 0,key_fma_qualified_price_max,key_Product Group Description,key_is_conveyable,key_Is Sortable,key_binding_description,key_classification_description,key_item_package_quantity,ID,key_pkg_length
0,111.96,gl_home,Y,N,Kitchen,Base Product,1.0,34016,20.0
1,15.71,gl_office_product,Y,Y,Office Product,Base Product,6.0,3581,4.8
2,43.37,gl_wireless,Y,Y,Electronics,Base Product,1.0,36025,7.2
3,648.63,gl_pc,Y,Y,Personal Computers,Base Product,1.0,42061,14.04
4,23.85,gl_pet_products,Y,Y,Misc.,Base Product,1.0,14628,9.2
5,1496.73,gl_home_entertainment,N,N,Electronics,Base Product,1.0,12882,51.6
6,42.16,gl_wireless,Y,Y,Electronics,Base Product,2.0,49055,4.308378
7,246.79,gl_camera,Y,Y,Electronics,Base Product,1.0,29187,10.0
8,21.1,gl_home,Y,Y,Kitchen,Base Product,,40670,6.377953
9,29.76,gl_office_product,Y,Y,Office Product,Base Product,1.0,8364,7.4


In [88]:
df_train.shape

(36803, 8)

In [87]:
df.shape

(52577, 2)

In [86]:
df_new.shape

(36803, 9)