In [11]:
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import pandas as pd

df = pd.read_csv("clean_products.csv", lineterminator='\n')

cvec = CountVectorizer()
X = df.product_name

y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test, test_size = 0.5)

cvec = CountVectorizer(ngram_range=(1,3)).fit(X_train)


df_train = pd.DataFrame(cvec.transform(X_train).todense(), columns=cvec.get_feature_names())
df_test = pd.DataFrame(cvec.transform(X_test).todense(), columns=cvec.get_feature_names())


ValueError: too many values to unpack (expected 2)

In [3]:

model = LinearRegression()
model.fit(df_train, y_train)

In [4]:
model.score(df_test, y_test)

-8.750641721559657e+20

In [5]:
y_train_pred = model.predict(df_train)
y_test_pred = model.predict(df_test)
print(y_train_pred)
print(y_test_pred)

[4.95238477e+03 1.57494141e+02 1.19337891e+02 ... 5.89472656e+01
 5.50195312e+00 1.75781250e-02]
[ 4.47019801e+14  5.73616046e+14 -6.40747536e+12 ...  4.64477781e+14
  2.51220681e+14 -5.39595830e+13]


In [8]:
import numpy as np
train_loss = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
print(train_loss)
test_loss = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
print(test_loss)

304.3251701951875
1113809310459003.5


In [10]:
feats = cvec.get_feature_names_out()
print(feats)

['00' '00 20' '00 20 meters' ... 'zz top collection' 'zz top pincushion'
 'zz top sharp']


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [35]:
df = pd.read_csv("clean_products.csv", lineterminator='\n')

In [36]:
X = df[['product_name', 'product_description', 'location']]
y = df['price']

In [37]:
# Applies transforer to panda data columns
# Here our transforrmer is TfidfVectorizer - this is equivalemt to CountVectorizer then TfidfTransformer being applied
# CountVectorizer is a count of words whereas TfidfVectorizer also considers word frequency
# The features extracted will be ?
# CountVectorizer makes each word a feature?
# TfidfVectorizer makes what a feature?
# Questions - there are parameters in TfidfVectorizer to make text lower case, strip non-ascii etc
# I have done this in the cleaning routines - which is better?

transformer = ColumnTransformer(
    [('vect1', TfidfVectorizer(), 'product_name'),
     ('vect2', TfidfVectorizer(), 'product_description'),
     ('vect3', TfidfVectorizer(), 'location')],
    remainder='passthrough'
)

In [38]:
# pipeline is the estimator object?
# we are saying here we want to perform a linear regression on the columns we have transformed above?
pipeline = Pipeline(
    [
        ("colt", transformer),
        ("lr", LinearRegression())
    ]
)

In [39]:
# n-gram is a contiguous sequence of n items from a given sample of text or speech. 
# could be phonemes, syllables, letters, words or base pairs according to the application
# are we looking for common sets of words here?  or something with likelihood of words being together e.g. dining table in a bi-gram?
# and creating features based on n-grams?
# so colt__vect1__ngram_range': ((1, 1), (1, 2)) - does this mean we will look for unigrams and bigrams in product name
# and colt__vect2__ngram_range': ((1, 1), (1, 2), (1, 3)) means we will look also for trigrams?
# and so we don't do this for location presumably as this doesn't seem an applicable thing here 
# since something like Belfast, Antrim would always be together
# min_df - ignore terms with a frequency lower than specified - why 3 values here and why small values? why not just 1?
# as above for location assume frequency not so relevant in town / county names?

parameters = {
    'colt__vect1__ngram_range': ((1, 1), (1, 2)),
    'colt__vect1__min_df': (0.005, 0.008, 0.01),
    'colt__vect2__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'colt__vect2__min_df': (0.005, 0.008, 0.01),
}

In [40]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [42]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)

In [43]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [44]:
grid_search.predict(X_test)

array([-4997.16528355,  -546.41851481, -7948.31478231, ...,
       13968.20501163, 14723.58231354, 26811.481346  ])

In [45]:
grid_search.predict(X_train)

array([-3269.92008848, -1012.18461858, -8596.53848605, ...,
        3952.72007395, -6935.17915401, -7453.43367289])

In [46]:
np.sqrt(metrics.mean_squared_error(y_test, grid_search.predict(X_test)))

305353.46543396456

In [47]:
grid_search.score(X_train, y_train)

0.5487193448455817

In [48]:
grid_search.score(X_test, y_test)

-0.0014390211271162467