In [1]:
# import libraries
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn import metrics

In [2]:
# Load the cleaned product listings from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source.
products = pd.read_pickle(filepath_or_buffer="image_product.pkl")
products

Unnamed: 0,id,product_name,category,product_description,price,location,create_time,image_array
0,7cdb36be-888e-4dc8-81ed-0b98e9e6a1c9,LUSH 1.6-1.8m Kentia palm 🌴 Indoor plant | in ...,Home & Garden / Garden & Patio / Garden Buildi...,LUSH 1.6-1.8m Kentia palm 🌴 Indoor plant\r Lus...,75.00,"Kennington, London",2022-02-26,"[[[144, 120, 82], [145, 123, 86], [149, 131, 9..."
1,369cbb17-75f4-43a3-9f67-cce821f0ed5e,Genuine Xiaomi Redmi Power Bank 10000-mAh 2.4A...,"Phones, Mobile Phones & Telecoms / Mobile Phon...",Xiaomi Redmi Power Bank 10000 mAh - WhiteMicro...,25.00,"Watford, Hertfordshire",2022-02-27,"[[[125, 136, 132], [141, 150, 145], [145, 148,..."
2,0f80c7cf-4c45-47df-a00d-0be822506933,"Mamas and Papas Kato 2 pushchair | in Swindon,...",Baby & Kids Stuff / Prams & Strollers,Pushchair c/w rain cover. Small hole in hood s...,20.00,"Swindon, Wiltshire",2022-02-26,"[[[171, 169, 157], [163, 161, 149], [155, 153,..."
3,a5886822-a581-4ab3-9efc-959d1b58f3a8,Air Jordan 4 Retro 'Taupe Haze' Size UK 6.5 - ...,"Clothes, Footwear & Accessories / Men's Shoes ...",Bought these couple weeks ago but I want to ge...,290.00,"Kettering, Northamptonshire",2022-02-26,"[[[179, 137, 121], [176, 139, 120], [172, 141,..."
4,4a877282-182e-47f5-b4b6-24f0a88ab5fd,Sony PlayStation 3 Slim Console 320GB Black | ...,Video Games & Consoles / Consoles / PS3 (Sony ...,Sony PlayStation 3 Slim Console 320GB Black. U...,140.00,"Twickenham, London",2022-02-28,"[[[160, 164, 147], [152, 159, 141], [192, 199,..."
...,...,...,...,...,...,...,...,...
11099,39026ffe-9994-4a0f-94a9-5123b3432065,CHEAPEST WARDROBE == ALINA BED ROOM SET with ...,Home & Garden / Beds & Bedroom Furniture / War...,More Information 0️⃣2️⃣0️⃣8️⃣0️⃣0️⃣4️⃣7️⃣7️⃣7️...,189.00,"Southall, London",2022-02-26,"[[[248, 236, 196], [230, 218, 180], [226, 213,..."
11100,2c353fc7-d78a-4b54-bea1-269708511a54,"Mobile Catering Trailer | in Northampton, Nort...",Office Furniture & Equipment / Restaurant & Ca...,Great condition catering trailer - newly renov...,6950.00,"Northampton, Northamptonshire",2022-02-27,"[[[157, 181, 155], [122, 146, 120], [106, 129,..."
11101,bf23f42a-9c32-4dff-ac5c-21ebe29b2a31,"Cabinet handles | in East End, Glasgow | Gumtree",DIY Tools & Materials / Doors & Windows,10 x brushed steel cabinet handles plus 6 x hi...,10.00,"East End, Glasgow",2022-02-27,"[[[181, 172, 165], [182, 173, 166], [185, 176,..."
11102,53b597af-093b-4492-8f15-cd89ea953c49,"JINBERYL KIDS BACKPACK GIRLS SCHOOL BAG, FITS ...","Clothes, Footwear & Accessories / Women's Acce...",SOLD OUT ITEM ON AMAZON - 28 AVAILABLE\r BRAND...,14.99,"Bearsden, Glasgow",2022-02-27,"[[[240, 255, 251], [237, 255, 247], [239, 255,..."


In [3]:
# Build the model inputs: the three free-text columns form the feature
# matrix X, and the listing price is the response vector y.
X = products[["product_name", "product_description", "location"]]
y = products["price"]

# Preview the first rows of each to confirm the selection.
print("feature matrix", X.head())
print("response vector", y.head())

feature matrix                                         product_name  \
0  LUSH 1.6-1.8m Kentia palm 🌴 Indoor plant | in ...   
1  Genuine Xiaomi Redmi Power Bank 10000-mAh 2.4A...   
2  Mamas and Papas Kato 2 pushchair | in Swindon,...   
3  Air Jordan 4 Retro 'Taupe Haze' Size UK 6.5 - ...   
4  Sony PlayStation 3 Slim Console 320GB Black | ...   

                                 product_description  \
0  LUSH 1.6-1.8m Kentia palm 🌴 Indoor plant\r Lus...   
1  Xiaomi Redmi Power Bank 10000 mAh - WhiteMicro...   
2  Pushchair c/w rain cover. Small hole in hood s...   
3  Bought these couple weeks ago but I want to ge...   
4  Sony PlayStation 3 Slim Console 320GB Black. U...   

                      location  
0           Kennington, London  
1       Watford, Hertfordshire  
2           Swindon, Wiltshire  
3  Kettering, Northamptonshire  
4           Twickenham, London  
response vector 0     75.0
1     25.0
2     20.0
3    290.0
4    140.0
Name: price, dtype: float64


In [4]:
# split into training and testing sets (70% train / 30% test)
# A fixed random_state makes the shuffle reproducible, so the partition --
# and therefore the metric reported at the end of the notebook -- is the same
# on every Restart & Run All. The value 42 is arbitrary; any integer works.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# sanity-check the sizes of the four resulting pieces
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7772, 3)
(7772,)
(3332, 3)
(3332,)


In [5]:
# Vectorize every text column with its own, independently fitted TF-IDF
# vocabulary. Each column is named by a plain string (not a list) so that the
# vectorizer is handed a 1-D sequence of documents.
column_trans = make_column_transformer(
    *[
        (TfidfVectorizer(), text_column)
        for text_column in ("product_name", "product_description", "location")
    ]
)

In [6]:
# Chain the TF-IDF column transformer with an ordinary least-squares
# regressor so that vectorization and fitting happen in a single estimator.
# NOTE(review): the fit is unregularized and the TF-IDF feature space is
# presumably much wider than the number of training rows -- confirm whether
# a regularized linear model would be a better baseline.
pipe = make_pipeline(
    column_trans,
    LinearRegression(),
)

# Fit the vectorizers and the regressor on the training split only.
pipe.fit(X=X_train, y=y_train)

In [7]:
# Predict prices for the held-out test rows; the pipeline applies the
# already-fitted TF-IDF transforms before the regressor.
y_pred = pipe.predict(X=X_test)

In [8]:
# calculate rmse of predictions (same units as the price column)
# The square root is taken explicitly instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in later
# releases, whereas sqrt(MSE) works on every version.
print("RMSE of model = {}".format(
    metrics.mean_squared_error(y_test, y_pred) ** 0.5))

RMSE of model = 101695.29352303351
