In [1]:
#import required packages
#basics
import pandas as pd 
import numpy as np

import warnings


#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image

import string
import re    #for regex
import nltk
from nltk.corpus import stopwords

#import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize


#FeatureEngineering
#!pip install lightgbm
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, decomposition, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer,LabelEncoder
from sklearn.model_selection import train_test_split

import textblob
import xgboost
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from textblob import TextBlob
from nltk.stem import PorterStemmer
import nltk
nltk.download('wordnet')
from textblob import Word


# Plot styling: default seaborn palette and the "dark" axes style.
color = sns.color_palette()
sns.set_style("dark")
# English stop words from NLTK, held as a set.
eng_stopwords = set(stopwords.words("english"))
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation and SettingWithCopy warnings.
warnings.filterwarnings("ignore")

# WordNet lemmatizer instance.
lem = WordNetLemmatizer()

# Render matplotlib figures inline in the notebook.
%matplotlib inline

Using TensorFlow backend.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.metrics import  roc_auc_score
from sklearn.metrics import classification_report

In [2]:
import category_encoders as ce

In [4]:
# Create Data audit Report for categorical variables
def categorical_var_summary(x):
    """Summarise one categorical column for the data-audit report.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise.

    Returns
    -------
    pandas.Series
        Indexed by:
        N       - number of non-null values
        NMISS   - percentage (0-100) of null values
        MODE    - most frequent non-null value
        FREQ    - number of occurrences of MODE
        PERCENT - FREQ as a percentage of N, rounded to 2 decimals
        UNIQUE  - number of distinct values (a null counts as one value)

        When the column holds no non-null value, MODE and PERCENT are NaN and
        FREQ is 0 instead of the function raising IndexError.
    """
    n_valid = x.count()
    pct_missing = (x.isnull().sum() / x.size) * 100

    # value_counts() ignores nulls, so this is empty for an all-null column.
    counts = x.value_counts().sort_values(ascending=False)
    if counts.empty:
        mode_value, mode_freq, mode_pct = float("nan"), 0, float("nan")
    else:
        mode_value = counts.index[0]
        mode_freq = counts.iloc[0]
        mode_pct = round(mode_freq * 100 / n_valid, 2)

    return pd.Series(
        [n_valid, pct_missing, mode_value, mode_freq, mode_pct, x.unique().size],
        index=['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT', 'UNIQUE'],
    )

In [6]:
# Create Data audit Report for continuous variables
def continuous_var_summary(x):
    """Summarise one numeric column for the data-audit report.

    Returns a pandas.Series with the non-null count (N), the percentage of
    null values (NMISS), sum, mean, median, standard deviation, variance,
    minimum, the 1/5/10/25/50/75/90/95/99th percentiles and the maximum.
    """
    percentiles = [
        ('P1', 0.01), ('P5', 0.05), ('P10', 0.10), ('P25', 0.25), ('P50', 0.50),
        ('P75', 0.75), ('P90', 0.90), ('P95', 0.95), ('P99', 0.99),
    ]

    summary = {
        'N': x.count(),
        'NMISS': (x.isnull().sum() / x.size) * 100,
        'SUM': x.sum(),
        'MEAN': x.mean(),
        'MEDIAN': x.median(),
        'STD': x.std(),
        'VAR': x.var(),
        'MIN': x.min(),
    }
    for label, q in percentiles:
        summary[label] = x.quantile(q)
    summary['MAX'] = x.max()

    return pd.Series(list(summary.values()), index=list(summary.keys()))

In [7]:
def onehot_features(train, test, features, full=False, sparse=False, dummy_na=True):
    """One-hot encode `features` so train and test end up with the same dummy columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode. They are copied; the caller's frames are not modified.
    features : iterable of str
        Candidate column names. Names that are not columns of `train` are ignored.
    full : bool, default False
        If True the category set is taken from train and test together,
        otherwise from train only (values seen only in test then become NaN).
    sparse : bool, default False
        Forwarded to ``pandas.get_dummies``.
    dummy_na : bool, default True
        Forwarded to ``pandas.get_dummies``; adds a ``<col>_nan`` indicator.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The encoded train and test frames.
    """
    # Work on copies so the column re-typing below does not leak to the caller.
    train = train.copy()
    test = test.copy()

    features = [f for f in features if f in train.columns]
    for column in features:
        if full:
            categories = pd.concat([train[column], test[column]]).dropna().unique()
        else:
            categories = train[column].dropna().unique()

        # `astype("category", categories=...)` is no longer accepted by pandas;
        # a shared CategoricalDtype fixes the same category set on both frames.
        shared_dtype = pd.api.types.CategoricalDtype(categories=categories)
        train[column] = train[column].astype(shared_dtype)
        test[column] = test[column].astype(shared_dtype)

    train = pd.get_dummies(train, columns=features, dummy_na=dummy_na, sparse=sparse)
    test = pd.get_dummies(test, columns=features, dummy_na=dummy_na, sparse=sparse)
    return train, test

In [5]:
# Load the train and test sets from CSV files under the relative data/ directory.
train = pd.read_csv('data/train.csv')
test =  pd.read_csv('data/test.csv')

In [9]:
# Remove rows that repeat the same (review_description, review_title) pair
# and renumber the index from 0.
train = train.drop_duplicates(['review_description','review_title']).reset_index(drop=True)

In [10]:
# Keep only the object-dtype columns of train for the categorical audit.
data_cat_vars = train.loc[:, (train.dtypes == 'object')]

In [11]:
# One audit row per categorical column (columns: N, NMISS, MODE, FREQ, PERCENT, UNIQUE).
# NMISS is a percentage, not a count.
cat_info = data_cat_vars.apply(categorical_var_summary).T
cat_info

Unnamed: 0,N,NMISS,MODE,FREQ,PERCENT,UNIQUE
user_name,59105,23.874,@vossroger,16482,27.89,16
country,77608,0.0425033,US,36269,46.73,39
review_title,77641,0.0,Segura Viudas NV Extra Dry Sparkling (Cava),7,0.01,76983
review_description,77641,0.0,"Stalky aromas suggest hay and green herbs, wit...",2,0.0,77628
designation,55458,28.5712,Reserve,1301,2.35,26425
province,77608,0.0425033,California,24275,31.28,359
region_1,65680,15.4055,Napa Valley,3150,4.8,1020
region_2,33814,56.4483,Central Coast,7053,20.86,18
winery,77641,0.0,Testarossa,165,0.21,13786
variety,77641,0.0,Pinot Noir,9966,12.84,28


## REMOVING ALL THE CATEGORICAL COLUMNS WITH 15% OR MORE MISSING VALUES OR 100 OR MORE UNIQUE VALUES (EXCEPT review_description)

In [12]:
# Drop categorical columns that are at least 15% missing or have at least 100
# distinct values (too many levels to one-hot encode usefully).
# review_description is exempt: it is the text that is vectorised later on.
# NOTE(review): the in-place drop makes this cell non-idempotent -- running it
# a second time raises KeyError because the listed columns are already gone.
train.drop(columns = cat_info[((cat_info.NMISS>=15) | (cat_info.UNIQUE>=100)) & (cat_info.index!='review_description')].index.values, inplace=True) 
# Refresh the categorical view so it reflects the remaining columns.
data_cat_vars = train.loc[:, (train.dtypes == 'object')]

In [13]:
# Re-run the categorical audit on the columns that survived the drop.
data_cat_vars.apply(categorical_var_summary).T

Unnamed: 0,N,NMISS,MODE,FREQ,PERCENT,UNIQUE
country,77608,0.0425033,US,36269,46.73,39
review_description,77641,0.0,"Stalky aromas suggest hay and green herbs, wit...",2,0.0,77628
variety,77641,0.0,Pinot Noir,9966,12.84,28


In [14]:
# Numeric (float64 / int64) columns of train for the continuous audit.
wine_conti_vars = train.loc[:, (train.dtypes == 'float64') | (train.dtypes == 'int64')]

In [15]:
# One audit row per numeric column, rounded to 2 decimals for display.
wine_conti_vars.apply(continuous_var_summary).T.round(2)

Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
points,77641.0,0.0,6874429.0,88.54,88.0,3.14,9.87,80.0,82.0,84.0,84.0,86.0,88.0,91.0,93.0,94.0,95.0,100.0
price,72356.0,6.81,2687315.0,37.14,27.0,44.63,1991.41,4.0,8.0,10.0,13.0,17.0,27.0,45.0,67.0,89.0,169.0,3300.0


## Dropping all the rows where price is NULL from train (NULL prices in test are only counted, not dropped)

In [16]:
# Discard, in place, every training row whose price is missing.
train.dropna(subset=['price'], inplace=True)

In [17]:
# Number of test rows with a missing price.
# NOTE(review): no visible cell drops or imputes these rows, so NaN prices
# reach the test feature matrix built near the end of the notebook.
test.price.isna().sum()

1394

In [18]:
# Rebuild the numeric view after the missing-price rows were removed.
wine_conti_vars = train.loc[:, (train.dtypes == 'float64') | (train.dtypes == 'int64')]

In [19]:
# Continuous audit again; price should now report NMISS = 0.
wine_conti_vars.apply(continuous_var_summary).T.round(2)

Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
points,72356.0,0.0,6403941.0,88.51,88.0,3.15,9.9,80.0,82.0,83.0,84.0,86.0,88.0,91.0,93.0,94.0,95.0,100.0
price,72356.0,0.0,2687315.0,37.14,27.0,44.63,1991.41,4.0,8.0,10.0,13.0,17.0,27.0,45.0,67.0,89.0,169.0,3300.0


In [20]:
# Mean price and points per country, most expensive country first.
# Several columns must be selected from a groupby with a list (double
# brackets); the bare tuple form ['price','points'] is deprecated and
# rejected by newer pandas.
z = (
    train.groupby(['country'])[['price', 'points']]
    .mean()
    .reset_index()
    .sort_values('price', ascending=False)
)
z[['country','price']].head(n=10)

Unnamed: 0,country,price
33,Switzerland,94.75
10,England,51.538462
18,Italy,46.738333
13,Germany,44.724688
11,France,44.122072
35,US,37.568786
29,Serbia,34.666667
5,Canada,33.470199
17,Israel,32.914676
2,Austria,31.97049


In [23]:
# Columns remaining in train after the drops above.
train.columns

Index(['country', 'review_description', 'points', 'price', 'variety'], dtype='object')

In [24]:
# Remove, in place, the training rows that have no country, so the one-hot
# encoding that follows sees no missing value in that column.
train.dropna(subset=['country'], inplace=True)

In [25]:

# One-hot encode `country` with category_encoders; use_cat_names=True names
# the new columns country_<value>.
OHE = ce.OneHotEncoder(cols=['country'],use_cat_names=True)
# encode the categorical variables
# NOTE(review): train is overwritten by its encoded version, so re-running
# this cell fails -- the 'country' column no longer exists after the first run.
train = OHE.fit_transform(train)


In [26]:
# Columns after encoding: the country_* indicators followed by the untouched columns.
train.columns

Index(['country_Australia', 'country_US', 'country_Italy', 'country_France',
       'country_Argentina', 'country_New Zealand', 'country_Austria',
       'country_Chile', 'country_Portugal', 'country_Germany', 'country_Spain',
       'country_England', 'country_Brazil', 'country_South Africa',
       'country_Romania', 'country_Slovenia', 'country_Greece',
       'country_Canada', 'country_Israel', 'country_Turkey', 'country_Lebanon',
       'country_Mexico', 'country_Uruguay', 'country_Bulgaria',
       'country_Hungary', 'country_Moldova', 'country_Morocco',
       'country_Georgia', 'country_Ukraine', 'country_Croatia', 'country_Peru',
       'country_Switzerland', 'country_Luxembourg', 'country_Cyprus',
       'country_Czech Republic', 'country_Macedonia', 'country_Serbia',
       'country_India', 'review_description', 'points', 'price', 'variety'],
      dtype='object')

In [27]:
# Names of every column whose name contains 'country', in frame order
# (i.e. the one-hot indicator columns).
encode_cols = [name for name in train.columns if 'country' in name]

In [28]:
# Every column except the target is a feature.
# NOTE(review): Index.difference may return the names in sorted order rather
# than frame order -- later cells select columns by name, so this is harmless here.
features = train.columns.difference(['variety'])
X = train[features]
# Target: the grape variety to predict.
y = train['variety']

In [29]:
# Hold out 10% of the rows for validation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.10, random_state=42)

In [30]:
# Punctuation marks plus the tokens 'cab' and '%' that are added to the stop-word set.
extras = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'cab',"%"]
from nltk.corpus import stopwords
# NLTK's English stop words extended with the extras above.
stop = set(stopwords.words('english')).union(extras)

In [31]:
def pre_process(text):
    """Clean, stem and lemmatise a Series of review texts.

    For each document: remove '/', collapse one level of double spaces, strip
    the listed special characters and all digits, keep only purely alphabetic
    tokens, Porter-stem every token and finally lemmatise it with
    textblob's ``Word.lemmatize``.

    Parameters
    ----------
    text : pandas.Series
        Raw review texts. Entries that are not strings (e.g. NaN for a
        missing review) are turned into the empty string instead of raising.

    Returns
    -------
    pandas.Series
        Cleaned texts, same index as the input, tokens joined by single spaces.
    """
    # Build the stemmer and the translation table once, not once per word.
    stemmer = PorterStemmer()
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(document):
        """Apply the whole cleaning pipeline to one document."""
        if not isinstance(document, str):
            return ""
        document = document.replace('/', '')                                  # drop slashes
        document = re.sub("  ", " ", document)                                # double space -> single space
        document = re.sub(r'''[-()\"#/@;:{}`+=~|.!?,']''', "", document)      # drop special characters
        document = re.sub(r'[0-9]+', '', document)                            # drop digits
        tokens = [tok.translate(strip_punctuation) for tok in document.split() if tok.isalpha()]
        tokens = [stemmer.stem(tok) for tok in tokens]                        # Porter stemming
        tokens = [Word(tok).lemmatize() for tok in tokens]                    # lemmatisation
        return " ".join(tokens)

    return text.apply(_clean)

In [32]:
# Replace the raw review text with its cleaned/stemmed form in both splits.
X_train['review_description'] = pre_process(X_train['review_description'])
X_val['review_description'] = pre_process(X_val['review_description'])

In [33]:
from scipy.sparse import hstack

# vect = CountVectorizer(analyzer='word', 
#                         token_pattern=r'\w{1,}', 
#                         ngram_range=(1, 2 ), 
#                         min_df=5, 
#                         encoding='latin-1' ,
#                         max_features=800,
#                         stop_words = stop)

# Learn the TF-IDF vocabulary on the training reviews only, then reuse it
# unchanged for the validation reviews.
tfidf_vect = TfidfVectorizer()
X_train_dtm = tfidf_vect.fit_transform(X_train.review_description)
X_val_dtm = tfidf_vect.transform(X_val.review_description)

# Side features for the training rows; single columns become (n_rows, 1)
# arrays so they can be stacked next to the text matrix.
country_train = X_train[encode_cols].values
price_train = X_train['price'].values.reshape(-1, 1)
points_train = X_train['points'].values.reshape(-1, 1)

# Same side features for the validation rows.
country_val = X_val[encode_cols].values
price_val = X_val['price'].values.reshape(-1, 1)
points_val = X_val['points'].values.reshape(-1, 1)

# Final column layout: TF-IDF terms, country indicators, price, points.
X_train_dtm = hstack([X_train_dtm, country_train, price_train, points_train])
X_val_dtm = hstack([X_val_dtm, country_val, price_val, points_val])

In [None]:
# FIXME: unfinished draft, left commented out on purpose.
# As written it was a SyntaxError (dangling `tfidf_vect.`), which stops a
# "Restart & Run All". Completing it under this name would also shadow the
# text-cleaning pre_process(text) defined earlier, which other cells call
# with a single argument -- give it a different name when it is finished.
# def pre_process(X, y, tfidf_vect):
#     dtm = tfidf_vect.

In [34]:
# Show the shape / sparsity summary of the stacked training matrix.
X_train_dtm

<65094x23234 sparse matrix of type '<class 'numpy.float64'>'
	with 2371326 stored elements in COOrdinate format>

In [35]:
# Encode the variety names as integer class ids; the mapping is learned from
# the training labels only and then applied to the validation labels.
# NOTE(review): y_train / y_val are overwritten, so re-running this cell
# refits the encoder on integers and le.classes_ loses the variety names
# that the classification report and inverse_transform rely on.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [51]:
import xgboost as xgb

In [54]:
# Hyper-parameters of the multi-class XGBoost model, gathered in one place.
xgb_settings = dict(
    max_depth=16,
    min_child_weight=10,
    learning_rate=0.2,
    n_estimators=1000,
    objective='multi:softmax',
    num_class=28,      # one class per grape variety
    nthread=4,
    n_jobs=4,
    seed=42,
)
estimator_function = xgb.XGBClassifier(**xgb_settings)

In [55]:
# Train the XGBoost classifier on the stacked training matrix and encoded labels.
estimator_function.fit(X_train_dtm, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=0, max_depth=16,
              min_child_weight=10, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=4, nthread=4, num_class=28,
              num_parallel_tree=1, objective='multi:softprob', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, seed=42,
              subsample=1, tree_method=None, validate_parameters=False,
              verbosity=None)

In [58]:
# Predicted class ids for the validation rows.
# NOTE(review): the LightGBM section later rebinds `pred`; cells that score
# the XGBoost model must run before that happens.
pred = estimator_function.predict(X_val_dtm)

In [61]:
import pickle
# Persist the fitted XGBoost model to xgb.pkl in the working directory.
with open('xgb.pkl', 'wb') as file:
    pickle.dump(estimator_function, file)

In [59]:
# Per-class precision / recall / F1 on the validation split.
# `labels` is passed explicitly so that target_names always lines up with the
# encoder's classes, even when a class is absent from both y_val and pred
# (otherwise the two lengths can disagree).
print(classification_report(y_val, pred,
                            labels=np.arange(len(le.classes_)),
                            target_names=le.classes_))

                            precision    recall  f1-score   support

  Bordeaux-style Red Blend       0.76      0.75      0.75       444
Bordeaux-style White Blend       0.51      0.42      0.46        50
            Cabernet Franc       0.71      0.39      0.50        96
        Cabernet Sauvignon       0.67      0.72      0.69       719
           Champagne Blend       0.63      0.68      0.65        72
                Chardonnay       0.79      0.86      0.82       844
                     Gamay       0.77      0.61      0.68        67
            Gewürztraminer       0.70      0.55      0.62        80
          Grüner Veltliner       0.80      0.92      0.86        76
                    Malbec       0.70      0.62      0.65       190
                    Merlot       0.63      0.46      0.53       246
                  Nebbiolo       0.81      0.86      0.83       178
              Pinot Grigio       0.68      0.61      0.65        67
                Pinot Gris       0.66      0.49

In [60]:
# Validation accuracy, as a percentage, for whatever model last produced `pred`.
accuracy_score(y_val, pred)*100

73.30291718512373

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [66]:
#Choose best parameters for randomforest

def best_params(train_x, train_y, random_state=42):
    """Grid-search RandomForestClassifier settings with 5-fold cross-validation.

    Parameters
    ----------
    train_x : array-like or sparse matrix
        Training features.
    train_y : array-like
        Training labels.
    random_state : int, default 42
        Seed for the forest so repeated searches give the same answer.

    Returns
    -------
    dict
        The best parameter combination found (``GridSearchCV.best_params_``).
    """
    rfc = RandomForestClassifier(random_state=random_state)
    param_grid = {
        'n_estimators': [50, 200],
        # 'auto' is left out: for classifiers it meant the same as 'sqrt'
        # (a duplicated grid point) and newer scikit-learn rejects it.
        'max_features': ['sqrt', 'log2'],
    }

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    # Fit on the arguments; the previous version silently used the notebook
    # globals X_train_dtm / y_train and ignored what the caller passed in.
    CV_rfc.fit(train_x, train_y)
    return CV_rfc.best_params_

print(best_params(X_train_dtm, y_train))

In [64]:
# 'sqrt' is what 'auto' meant for classifiers and is accepted by every
# scikit-learn version; random_state makes the reported score reproducible.
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', n_jobs=-1,
                            random_state=42).fit(X_train_dtm, y_train)
# NOTE: despite the label printed below, this is the accuracy on the single
# hold-out validation split, not a cross-validated score.
print('Cross Validation for RandomForestClassifier')
print(rf.score(X_val_dtm, y_val))


Cross Validation for RandomForestClassifier
0.6817364855523296


In [None]:
import pickle
# Persist the fitted random forest to rf.pkl in the working directory.
with open('rf.pkl', 'wb') as file:
    pickle.dump(rf, file)

## LGBMC CLASSIFIER

In [36]:
import lightgbm as lgb

In [37]:
# Hand LightGBM the sparse matrix itself (converted from COO to CSR).
# .toarray() would materialise every zero of the TF-IDF matrix as a dense
# float64 array -- roughly 12 GB for the 65094 x 23234 shape recorded above --
# while lgb.Dataset accepts scipy sparse input directly.
d_train = lgb.Dataset(X_train_dtm.tocsr(), label=y_train)

In [38]:
# LightGBM training parameters for the 28-class problem.
params = {
    'learning_rate': 0.2,
    'boosting_type': 'gbdt',
    'n_estimators': 500,          # also a LightGBM alias for the number of boosting rounds
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'sub_feature': 0.5,           # LightGBM alias of feature_fraction
    'num_leaves': 150,
    'min_data': 50,               # LightGBM alias of min_data_in_leaf
    'max_depth': 10,
    'num_class': 28,
    'n_jobs': -1,
}

In [39]:
# Train the LightGBM booster; the third positional argument asks for 200 boosting rounds.
# NOTE(review): params also sets n_estimators=500, which LightGBM treats as
# another name for the number of rounds -- confirm which of the two values
# the installed version actually uses, and keep only one.
clf = lgb.train(params, d_train, 200)

In [43]:
# Per-class scores for the validation rows. The sparse matrix is passed as CSR
# instead of being densified with .toarray(), which avoids allocating a full
# dense copy; Booster.predict accepts scipy sparse input.
pred = clf.predict(X_val_dtm.tocsr())

In [46]:
# Turn each row of per-class scores into the id of its highest-scoring class.
ans = np.array([row_scores.argmax() for row_scores in pred])

In [47]:
# Peek at the predicted class ids.
ans

array([10, 14,  5, ..., 14, 20, 21])

In [48]:
# Validation accuracy of the LightGBM model, as a percentage.
accuracy_score(y_val, ans)*100

74.18775058758467

In [50]:
import pickle
# Persist the trained LightGBM booster to LGBMC.pkl in the working directory.
with open('LGBMC.pkl', 'wb') as file:
    pickle.dump(clf, file)

## Working on test set

In [None]:
# Apply the same text cleaning to the test reviews.
# NOTE(review): this needs `pre_process` to be the one-argument text cleaner;
# it also overwrites the raw text, so running the cell twice cleans twice.
test.review_description = pre_process(test.review_description)

In [None]:
# Vectorise the test reviews with the vocabulary fitted on the training data.
X_test_dtm = tfidf_vect.transform(test.review_description)

# The models were trained on [TF-IDF terms | country indicators | price | points].
# The test matrix must have exactly the same columns in the same order;
# stacking price alone gives a different width and mis-aligned features.
# Each indicator is rebuilt from the raw country value using the training
# column names (country_<value>); rows with an unseen or missing country get
# all zeros.
# NOTE(review): assumes test still has its raw 'country' column and a 'points'
# column -- confirm against data/test.csv. Missing prices are passed on as NaN.
country_test = np.column_stack([
    (test['country'] == col[len('country_'):]).astype(int).values
    for col in encode_cols
])
price_test = test['price'].values[:,None]
points_test = test['points'].values[:,None]
X_test_dtm = hstack((X_test_dtm, country_test, price_test, points_test))

In [None]:
# Class ids predicted for the test rows by the XGBoost model.
# NOTE(review): X_test_dtm must have the same column layout as the matrix the model was fitted on.
test_pred = estimator_function.predict(X_test_dtm)

In [None]:
# Map the predicted class ids back to variety names with the fitted label encoder.
test_var_pred = le.inverse_transform(test_pred)

In [None]:
# Sanity check: number of predictions (should equal the number of test rows).
test_var_pred.size

In [None]:
# Rows and columns of the test frame, to compare with the prediction count.
test.shape

## BAYESIAN OPTIMIZATION WAS TAKING TOO LONG, SO IT WASN'T PERFORMED

In [None]:
def xgbc_cv(max_depth,learning_rate,n_estimators,reg_alpha):
    """Objective function for the Bayesian optimiser.

    Trains an XGBoost classifier with the proposed hyper-parameters on the
    notebook's training matrix (X_train_dtm, y_train) and scores it on the
    hold-out validation split (X_val_dtm, y_val).

    Parameters
    ----------
    max_depth, n_estimators : float
        Proposed by the optimiser as floats; truncated to int before use.
    learning_rate, reg_alpha : float
        Passed to XGBClassifier unchanged.

    Returns
    -------
    float
        Accuracy on the validation split (the value to be maximised).
        Despite the function name, no cross-validation is performed.
    """
    model = xgb.XGBClassifier(max_depth=int(max_depth),
                              learning_rate=learning_rate,
                              #min_child_weight = 10,
                              n_estimators=int(n_estimators),
                              reg_alpha=reg_alpha,
                              nthread=-1,
                              objective='multi:softmax',
                              num_class=28,
                              n_jobs=-1,
                              seed=42)

    # Fit on the training split, then score once on the validation split.
    model.fit(X_train_dtm, y_train)
    pred = model.predict(X_val_dtm)
    return accuracy_score(y_val, pred)

In [None]:
from bayes_opt import BayesianOptimization
# Extra settings forwarded to the optimiser's Gaussian-process regressor.
gp_params = {"alpha": 1e-10}



# Search ranges (lower, upper) for each hyper-parameter.
hyperparameter_space = {
    'max_depth': (5, 20),
    # The lower bound is kept above 0: a learning rate of 0 trains a model
    # that never updates, so sampling it wastes a full (and slow) fit.
    'learning_rate': (0.01, 0.3),
    'n_estimators' : (500,1000),
    'reg_alpha': (0,1)
}

xgbcBO = BayesianOptimization(f = xgbc_cv, 
                             pbounds =  hyperparameter_space,
                             random_state = 42,
                             verbose = 10)

# Finally we call the .maximize method of the optimizer with the appropriate arguments.
# kappa is a measure of 'aggressiveness' of the bayesian optimization process.
# The algorithm will randomly choose 3 points to establish a 'prior', then will perform
# 10 iterations to maximize the value of the estimator function.
# NOTE(review): passing acq / kappa / GP settings to maximize() is the older
# bayes_opt calling convention -- confirm it matches the installed version.
xgbcBO.maximize(init_points=3,n_iter=10,acq='ucb', kappa= 3, **gp_params)