In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import nltk
import time
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import KBinsDiscretizer


# Model selection: exactly one `neigh = ...` assignment should be active at a time.
# Uncomment the estimator to evaluate and comment out the others.

# Classifiers

#neigh = MultinomialNB(alpha = 0.5);

#neigh = SVC(gamma='auto')

#neigh = MLPClassifier(verbose=0, random_state=45, max_iter=600, solver="adam", learning_rate="adaptive", activation="relu", hidden_layer_sizes=(300,300,300))

# NOTE: 30000 trees makes every fit expensive; lower n_estimators for quick experiments.
neigh = RandomForestClassifier(n_estimators=30000, max_depth=25, random_state=0)


# Regressors

#neigh =  LinearRegression();

#neigh = SVR(gamma='auto');

#neigh = MLPRegressor(verbose=0, random_state=42, max_iter=800, solver="adam", learning_rate="adaptive", activation="tanh", hidden_layer_sizes=(300,300,300))

#neigh = RandomForestRegressor(n_estimators = 3000, max_depth=15, random_state=0);



# Import the dataset
wine = pd.read_csv("winemag-data-130k-v2.csv")

# Turn unusable values into NaN, then drop every row that has any missing field.
# The "?" placeholder has to be a str: a bytes literal (b'?') never compares equal
# to the text values pandas reads from the CSV, so it would silently match nothing.
wine = (
    wine
    .replace([np.inf, -np.inf], np.nan)
    .replace(["NaN", "?"], np.nan)
    .dropna()
)

# Manually feature select columns that will be dropped from the dataset as they are not useful to our project
#wine = wine.drop("region_2",axis=1);
#wine= wine.drop("taster_twitter_handle",axis=1);
#wine= wine.drop("title",axis=1);
#wine= wine.drop("id", axis=1);
#wine= wine.drop("designation", axis=1);
#wine= wine.drop("winery", axis=1);
#wine= wine.drop("taster_name", axis=1);
#wine= wine.drop("region_1", axis=1);

# Show the formatted dataset
wine.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
23,23,US,This wine from the Geneseo district offers aro...,Signature Selection,87,22.0,California,Paso Robles,Central Coast,Matt Kettmann,@mattkettmann,Bianchi 2011 Signature Selection Merlot (Paso ...,Merlot,Bianchi
25,25,US,Oak and earth intermingle around robust aromas...,King Ridge Vineyard,87,69.0,California,Sonoma Coast,Sonoma,Virginie Boone,@vboone,Castello di Amorosa 2011 King Ridge Vineyard P...,Pinot Noir,Castello di Amorosa
35,35,US,As with many of the Erath 2010 vineyard design...,Hyland,86,50.0,Oregon,McMinnville,Willamette Valley,Paul Gregutt,@paulgwine,Erath 2010 Hyland Pinot Noir (McMinnville),Pinot Noir,Erath


In [2]:
# Import the package for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# This is for binning the data, comment out for regression.
# fit_transform returns an (n_rows, 1) array, so it is flattened before being
# stored back as the 'points' column.
wine['points'] = (
    KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    .fit_transform(wine['points'].values.reshape(-1, 1))
    .ravel()
)

sia = SentimentIntensityAnalyzer()

# Score every description in one pass: the VADER 'compound' score shifted by +1.
# Mapping over the column yields a float Series aligned on the index, instead of
# filling an object-dtype column one row at a time.
sentiment = wine['description'].map(
    lambda passage: sia.polarity_scores(passage)['compound'] + 1
)

# Store the scores as the second column. Overwrite when the column already exists
# so the cell can be re-run (DataFrame.insert raises on a duplicate column name).
if 'sentiment_score' in wine.columns:
    wine['sentiment_score'] = sentiment
else:
    wine.insert(1, 'sentiment_score', sentiment)

# Show the dataset with the sentiment score in it
wine.head()

Unnamed: 0,id,sentiment_score,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
4,4,1.8176,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,1.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,10,1.8176,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,1.0,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
23,23,0.6818,US,This wine from the Geneseo district offers aro...,Signature Selection,1.0,22.0,California,Paso Robles,Central Coast,Matt Kettmann,@mattkettmann,Bianchi 2011 Signature Selection Merlot (Paso ...,Merlot,Bianchi
25,25,1.4019,US,Oak and earth intermingle around robust aromas...,King Ridge Vineyard,1.0,69.0,California,Sonoma Coast,Sonoma,Virginie Boone,@vboone,Castello di Amorosa 2011 King Ridge Vineyard P...,Pinot Noir,Castello di Amorosa
35,35,1.6293,US,As with many of the Erath 2010 vineyard design...,Hyland,1.0,50.0,Oregon,McMinnville,Willamette Valley,Paul Gregutt,@paulgwine,Erath 2010 Hyland Pinot Noir (McMinnville),Pinot Noir,Erath


In [3]:


# Label-encode the categorical columns, then build the feature matrix X and target y
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Set the numeric features aside so they are not label encoded; the free-text
# description is removed from the frame altogether.
z = wine['sentiment_score']
p = wine['price']
wine = wine.drop(columns=['sentiment_score', 'price', 'description'])

# Every column that still has object dtype is treated as categorical
categorical_feature_mask = wine.dtypes == object
categorical_cols = wine.columns[categorical_feature_mask].tolist()

# Replace each categorical column with integer codes (the encoder is refit per column)
wine[categorical_cols] = wine[categorical_cols].apply(le.fit_transform)

# The target is the points column; everything else is a feature
y = wine["points"]
X = wine.drop(columns="points")

# Append the untouched sentiment score and price as the last two feature columns
X = X.assign(sentiment_score=z, price=p)
X.head()

Unnamed: 0,id,country,designation,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,sentiment_score,price
4,4,0,8991,2,164,16,6,4,17492,111,2368,1.8176,65.0
10,10,0,5382,0,82,6,10,6,9958,19,1343,1.8176,19.0
23,23,0,7415,0,93,1,4,3,2114,74,274,0.6818,22.0
25,25,0,4234,0,132,12,10,6,3424,111,462,1.4019,69.0
35,35,0,3866,2,67,16,6,4,6528,111,851,1.6293,50.0


In [4]:
# Classification experiment: 10-fold cross-validation of the selected estimator.
# Comment out this cell's code (from the optional X = ... drops through the final
# print) when running a regression method; leave it active for classification.

# Optional feature ablation: enable a line to remove that column from X
#X = X.drop('country', axis=1);
#X = X.drop('province', axis=1);
#X = X.drop('variety', axis=1);
#X = X.drop('price', axis=1);

# Start the wall-clock timer; the elapsed time is reported in a later cell
start_time = time.time()

# One score per fold
scores = cross_val_score(estimator=neigh, X=X, y=y, cv=10)

# Report the average score across the folds
print(np.mean(scores))

0.6123108835363038


In [5]:
# Seconds of wall-clock time since start_time was recorded. This runs as its own
# cell, so any idle time before it is executed is included in the figure.
end_time = time.time() - start_time
print(end_time)

15309.920785665512


In [6]:
# Comment out the remaining lines for classification, uncomment them for regression
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

# Start the wall-clock timer; the elapsed time is reported in the next cell
start_time = time.time()

# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore the reported score, is the same on every run.
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, random_state=0)

# `reg` is another name for the estimator selected earlier (same object, not a copy)
reg = neigh
reg.fit(trainX, trainY)

prediction = reg.predict(testX)

# R^2 of the predictions on the held-out rows (shown as the cell output)
r2_score(testY, prediction)

0.307961418617275

In [7]:
# Seconds of wall-clock time since start_time was recorded. This runs as its own
# cell, so any idle time before it is executed is included in the figure.
end_time = time.time() - start_time
print(end_time)

1241.837759256363
