In [1]:
#importing of dependencies
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score

## Analysis on Price vs Decription

### Data Preprocessing

In [2]:
# import the data
data = pd.read_csv("../output_data/final_wine_data_172k_test.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem


In [3]:
# filter out all extraneaous columns
price_data = data.filter(["price", "description"], axis = 1)
price_data.head()

Unnamed: 0,price,description
0,15.0,"This is ripe and fruity, a wine that is smooth..."
1,14.0,"Tart and snappy, the flavors of lime flesh and..."
2,13.0,"Pineapple rind, lemon pith and orange blossom ..."
3,65.0,"Much like the regular bottling from 2012, this..."
4,15.0,Blackberry and raspberry aromas show a typical...


In [4]:
# split the dataset into training and testing datasets
X = price_data['description']
y = price_data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Simple Linear Regression Model - Price

In [5]:
#run model through pipeline
price_linear = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LinearRegression()
)
price_linear.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidftransformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                 

In [6]:
# Use our model to predict a value
predicted = price_linear.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {price_linear.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Accuracy: 0.08765114373376037
Mean Squared Error (MSE): 1002.207709822421
R-squared (R2): 0.4096926120577138


### Linear SVR Model - Price

In [7]:
#run model through pipeline
price_svr = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LinearSVR()
)
price_svr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidftransformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('linearsvr',
                 LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
                           intercept_scali

In [8]:
# Use our model to predict a value
predicted = price_svr.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {price_svr.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Accuracy: 0.1251978143755239
Mean Squared Error (MSE): 1464.8481038771001
R-squared (R2): 0.13719416698049747


## Analysis on Points vs Decription

In [9]:
# filter out all extraneaous columns
points_data = data.filter(["points", "description"], axis = 1)
points_data.head()

Unnamed: 0,points,description
0,87,"This is ripe and fruity, a wine that is smooth..."
1,87,"Tart and snappy, the flavors of lime flesh and..."
2,87,"Pineapple rind, lemon pith and orange blossom ..."
3,87,"Much like the regular bottling from 2012, this..."
4,87,Blackberry and raspberry aromas show a typical...


In [10]:
# split the dataset into training and testing datasets
X = points_data['description']
y = points_data['points']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Simple Linear Regression Model - Points

In [11]:
#run model through pipeline
points_linear = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LinearRegression()
)
points_linear.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidftransformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                 

In [12]:
# Use our model to predict a value
predicted = points_linear.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {points_linear.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Accuracy: 0.7121472485887046
Mean Squared Error (MSE): 1.9864033844341002
R-squared (R2): 0.8069739225932919


### Linear SVR Model - Points

In [13]:
#run model through pipeline
points_svr = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LinearSVR(max_iter = 5000)
)
points_svr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidftransformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('linearsvr',
                 LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
                           intercept_scali

In [14]:
# Use our model to predict a value
predicted = points_svr.predict(X)

# Score the prediction with mse and r2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f'Accuracy: {points_svr.score(X_test, y_test)}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Accuracy: 0.7401866504231369
Mean Squared Error (MSE): 2.4558891303369204
R-squared (R2): 0.7613522766375279
