In [39]:
# import libraries
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn import metrics

In [2]:
# Load the cleaned product listings, restricted to the three text feature
# columns and the numeric price target.
# lineterminator="\n" makes the parser split rows on newline only —
# presumably so stray carriage returns inside the free-text fields are not
# treated as row breaks (TODO confirm against the raw file).
products = pd.read_csv(
    "Products_cleaned.csv",
    usecols=["product_name", "product_description", "location", "price"],
    lineterminator="\n",
)

In [3]:
# Separate the predictors from the target: X keeps every loaded column
# except the price, y is the price column on its own.
X = products.drop(columns="price")
y = products["price"]

# Quick look at the first rows of each to confirm the split.
print("feature matrix", X.head())
print("response vector", y.head())

feature matrix                                         product_name  \
0  Mirror wall art | in Wokingham, Berkshire | Gu...   
1  Stainless Steel Food Steamer | in Inverness, H...   
2  Sun loungers | in Skegness, Lincolnshire | Gum...   
3  Coffee side table from Ammunition ammo box hai...   
4  Modern Shannon Sofa for sale at low cost | in ...   

                                 product_description                location  
0  Mirror wall art. Posted by Nisha in Dining, Li...    Wokingham, Berkshire  
1  Morphy Richard’s (model no 48755)Stainless ste...     Inverness, Highland  
2  I have 2 of these - collection only as I don’t...  Skegness, Lincolnshire  
3  Great reclaimed army ammunition box used as co...      Radstock, Somerset  
4  New Design Shannon Corner sofa  5 Seater Avail...       Delph, Manchester  
response vector 0      5.0
1     20.0
2     20.0
3    115.0
4    450.0
Name: price, dtype: float64


In [4]:
# Split into training and testing sets (70% train / 30% test).
# random_state pins the shuffle so that Restart & Run All reproduces the
# same partition — and therefore the same fitted model and RMSE — on
# every run; without it each execution evaluates a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Sanity-check the sizes of the four resulting pieces.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5009, 3)
(5009,)
(2147, 3)
(2147,)


In [29]:
# Build a column transformer that gives every text column of the feature
# matrix its own, independently fitted TF-IDF vectorizer.
# Each column is selected by a plain string (not a list) so the vectorizer
# receives a 1-D sequence of documents rather than a 2-D frame.
column_trans = make_column_transformer(
    *[
        (TfidfVectorizer(), text_column)
        for text_column in ("product_name", "product_description", "location")
    ]
)

In [30]:
# Chain the TF-IDF column transformer with an ordinary least-squares
# regressor so that vectorising and fitting happen in a single estimator.
# NOTE(review): unregularised least squares on a wide, sparse TF-IDF matrix
# can be numerically unstable; a regularised regressor may be worth
# evaluating — TODO confirm against the reported test error.
pipe = make_pipeline(column_trans, LinearRegression())

# Fit the vectorizers and the regressor on the training split only.
pipe.fit(X=X_train, y=y_train)

In [36]:
# Run the fitted pipeline on the held-out rows to get predicted prices.
y_pred = pipe.predict(X=X_test)

In [41]:
# Calculate the RMSE of the predictions on the test set.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly
# from the plain MSE; this form works on every scikit-learn version.
print("RMSE of model = {}".format(
    metrics.mean_squared_error(y_test, y_pred) ** 0.5))

RMSE of model = 137120.96773567252
