In [1]:
#importing of dependencies
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score

## Analysis on Price vs Description

### Data Preprocessing

In [2]:
# Read the wine dataset CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(
    filepath_or_buffer="../output_data/final_wine_data_172k_test.csv"
)
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem


In [3]:
# Keep only the regression target (price) and the text feature (description);
# every other column is irrelevant to this analysis.
price_columns = ["price", "description"]
price_data = data.filter(items=price_columns, axis="columns")
price_data.head(n=5)

Unnamed: 0,price,description
0,15.0,"This is ripe and fruity, a wine that is smooth..."
1,14.0,"Tart and snappy, the flavors of lime flesh and..."
2,13.0,"Pineapple rind, lemon pith and orange blossom ..."
3,65.0,"Much like the regular bottling from 2012, this..."
4,15.0,Blackberry and raspberry aromas show a typical...


In [4]:
# Feature (review text) and target (price) for the price models.
X, y = price_data['description'], price_data['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Linear Regression Model - Price

In [5]:
%%time

price_linear = make_pipeline(
    CountVectorizer(stop_words='english'),
    LinearRegression(),
    verbose = True
)

price_linear.fit(X_train, y_train)

# Use our model to predict a value
predicted = price_linear.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {price_linear.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

[Pipeline] ... (step 1 of 2) Processing countvectorizer, total=   6.3s
[Pipeline] .. (step 2 of 2) Processing linearregression, total= 2.1min
Accuracy: 0.07710227583994823
Mean Squared Error (MSE): 995.0904139567624
R-squared (R2): 0.4138847493666692
CPU times: user 8min 22s, sys: 15.8 s, total: 8min 38s
Wall time: 2min 23s


### Linear SVR Model - Price

In [6]:
%%time

price_svr = make_pipeline(
    CountVectorizer(stop_words='english'),
    LinearSVR(),
    verbose = True
)

price_svr.fit(X_train, y_train)

# Use our model to predict a value
predicted = price_svr.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {price_svr.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

[Pipeline] ... (step 1 of 2) Processing countvectorizer, total=   5.9s
[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=   2.8s
Accuracy: 0.17360323820281465
Mean Squared Error (MSE): 1362.8005901295742
R-squared (R2): 0.1973008701079163
CPU times: user 19.8 s, sys: 284 ms, total: 20 s
Wall time: 18.9 s


## Analysis on Points vs Description

In [7]:
# Keep only the regression target (points) and the text feature (description);
# every other column is irrelevant to this analysis.
points_columns = ["points", "description"]
points_data = data.filter(items=points_columns, axis="columns")
points_data.head(n=5)

Unnamed: 0,points,description
0,87,"This is ripe and fruity, a wine that is smooth..."
1,87,"Tart and snappy, the flavors of lime flesh and..."
2,87,"Pineapple rind, lemon pith and orange blossom ..."
3,87,"Much like the regular bottling from 2012, this..."
4,87,Blackberry and raspberry aromas show a typical...


In [8]:
# Feature (review text) and target (points) for the points models.
X, y = points_data['description'], points_data['points']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Linear Regression Model - Points

In [9]:
%%time

points_linear = make_pipeline(
    CountVectorizer(stop_words='english'),
    LinearRegression(),
    verbose = True
)

points_linear.fit(X_train, y_train)

# Use our model to predict a value
predicted = points_linear.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {points_linear.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

[Pipeline] ... (step 1 of 2) Processing countvectorizer, total=   6.7s
[Pipeline] .. (step 2 of 2) Processing linearregression, total= 2.2min
Accuracy: 0.694569549798849
Mean Squared Error (MSE): 2.078629122564729
R-squared (R2): 0.798012010523068
CPU times: user 8min 41s, sys: 15.2 s, total: 8min 57s
Wall time: 2min 26s


### Linear SVR Model - Points

In [10]:
%%time

points_svr = make_pipeline(
    CountVectorizer(stop_words='english'),
    LinearSVR(max_iter = 5000),
    verbose = True
)

points_svr.fit(X_train, y_train)

# Use our model to predict a value
predicted = points_svr.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {points_svr.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

[Pipeline] ... (step 1 of 2) Processing countvectorizer, total=   5.4s
[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  24.0s
Accuracy: 0.7334843020068273
Mean Squared Error (MSE): 2.207642750926373
R-squared (R2): 0.7854752846949706
CPU times: user 38.7 s, sys: 235 ms, total: 38.9 s
Wall time: 37.8 s
