In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score
from prettytable import PrettyTable

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer 
import category_encoders

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler
import category_encoders

from sklearn.metrics import r2_score

## PREPROCESSING

In [1]:
def loadData(directory):
    df = pd.read_csv(directory,sep="\t")
    return df

def getSentimentLabel(X): 
    median = np.median(X['quality'])
    X['sentiment'] = X['quality'].apply(lambda x: 1 if x >= median else 0)
    return X
    
## Label Encoder + Normalize
def doPreproc(X_d,X_e,labels):
    
    X_d = X_d.fillna("nan")
    X_e = X_e.fillna("nan")
    df = pd.concat([X_d,X_e])
    
    for label in labels:
        encoder = LabelEncoder()
        scaler = StandardScaler()
        
        df[label] = encoder.fit_transform(df[label])
        df[label] = scaler.fit_transform(np.array(df[label]).reshape(-1,1))
        
    
    ev = df[df['quality'].isna()].drop(columns=['quality'])
    dev = df.dropna(subset=['quality'])

    return dev, ev

In [3]:
## LOADING - ENCODING - SENTIMENT LABELING
X_dev = loadData('Dataset/dev.tsv')
X_eval = loadData('Dataset/eval.tsv')
X_dev = X_dev.drop_duplicates()
X_dev = X_dev.drop(columns=['region_2'])
X_eval = X_eval.drop(columns=['region_2'])


X_prep_v, X_eval_v = doPreproc(X_dev,X_eval,['country','province','variety',
                                                  'winery','region_1','designation'])

X_prep_v = getSentimentLabel(X_prep_v)

In [10]:
X_dev[X_dev['quality'] == 75]

Unnamed: 0,country,description,designation,province,region_1,variety,winery,quality
1118,France,"Explosively rich and unctuous, this verges on ...",Le Méal,Rhône Valley,Hermitage,Marsanne,M. Chapoutier,75.0
1796,Australia,A single-vineyard wine that contains about 85%...,Exile,South Australia,Barossa Valley,Shiraz,The Colonial Estate,75.0
2715,Germany,"Viscous, almost oily in texture, this is sever...",Wehlener Sonnenuhr Vat 10 Riesling Beerenauslese,Mosel-Saar-Ruwer,,Riesling,S.A. Prüm,75.0
2811,Italy,From Angelo Gaja's beautiful estate in coastal...,Camarcanda,Tuscany,Bolgheri,Red Blend,Ca' Marcanda,75.0
3986,France,"A glorious Champagne, now at the peak of its p...",Cuvée des Enchanteleurs Brut,Champagne,Champagne,Champagne Blend,Henriot,75.0
...,...,...,...,...,...,...,...,...
115082,France,Strength and elegance go together in this wine...,,Bordeaux,Saint-Émilion,Bordeaux-style Red Blend,Château Ausone,75.0
116611,US,"Rich and powerful, with a molten intensity, th...",Gravelly Meadow,California,Diamond Mountain District,Cabernet Sauvignon,Diamond Creek,75.0
118513,France,This is a powerful wine with complex structure...,,Burgundy,Corton,Pinot Noir,Chanson Père et Fils,75.0
120189,France,"96-98 Barrel sample. Ripe, sweet but also inte...",Barrel sample,Bordeaux,Saint-Julien,Bordeaux-style Red Blend,Château Léoville Barton,75.0


## Sentiment Analysis

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords 
import Stemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [16]:
text = X_dev['description'][0]

In [17]:
tokens_without_sw = [word for word in word_tokenize(text) if not word in stopwords.words()]

In [20]:
from textblob import TextBlob

TextBlob(tokens_without_sw[0]).sentiment.polarity

0.0

In [28]:
TextBlob('').sentiment.polarity

-0.2

In [62]:
def doMultinomialNB(x,y):
    
    model = MultinomialNB()
    
    parameters =  {
        'alpha': np.linspace(0.5, 1.5, 6),
        'fit_prior': [True, False]
    }
    
    gs = GridSearchCV(estimator=model,  
                         param_grid=parameters,
                         scoring='accuracy',
                         cv=5,
                         n_jobs=-1)

    gs.fit(x, y)
    print("\nMULTINOMIAL NB: ")
    print(f"Best parameters = {gs.best_params_}")
    print(f"Best score = {gs.best_score_ }")
    
    return gs

In [89]:
def doSGDClassifier(x,y):
    
    model = SGDClassifier()
    
    params = {
    "loss" : ["hinge", "log", "squared_hinge", "modified_huber"],
    "alpha" : [0.0001, 0.001, 0.01, 0.1],
    "penalty" : ["l2", "l1", "none"],
}
    
    gs = GridSearchCV(   estimator=model,  
                         param_grid=params,
                         cv=5,
                         n_jobs=-1)

    gs.fit(x, y)
    print("\nSGDClassifier: ")
    print(f"Best parameters = {gs.best_params_}")
    print(f"Best score = {gs.best_score_ }")
    
    return gs

In [6]:
X_descriptions = pd.concat([X_dev,X_eval])['description']
y_sentiment_train = X_prep_v['sentiment']

stop_words_list = sw.words('english') + ["'d", "'ll", "'re", "'s", "'ve", 
                                         'could', 'might', 'must', "n't", 
                                         'need', 'sha', 'wo', 'would']

In [168]:
tfidf_naive = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None,ngram_range=(1,3))

X_tfidf = tfidf_naive.fit_transform(X_descriptions)

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf[:X_dev.shape[0]], y_sentiment_train, test_size=0.25, random_state=42)

In [170]:
model = SGDClassifier(loss='squared_hinge')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7641718022298537


In [63]:
gs = doMultinomialNB(X_tfidf[:X_dev.shape[0]], y_sentiment_train)


MULTINOMIAL NB: 
Best parameters = {'alpha': 0.5, 'fit_prior': False}
Best score = 0.7513407167672769


In [171]:
from sklearn.linear_model import SGDClassifier

gs_sgd = doSGDClassifier(X_tfidf[:X_dev.shape[0]], y_sentiment_train)


SGDClassifier: 
Best parameters = {'alpha': 0.0001, 'loss': 'modified_huber', 'penalty': 'l2'}
Best score = 0.7682057884822842


## TEST

In [98]:
y_sentiment_predicted = gs_sgd.predict(X_tfidf[X_dev.shape[0]:])

In [107]:
X_eval_v['sentiment'] = y_sentiment_predicted
X_eval_v = X_eval_v.drop(columns=['description'])
X_prep_v = X_prep_v.drop(columns=['description'])

In [None]:
y = X_prep_v.quality
X = X_prep_v.drop(columns=['quality'])

In [178]:
X_train, X_test, y_train, y_test = train_test_split(X.drop(columns=['sentiment']),y, test_size=0.25, random_state=42)

In [180]:
X_train.shape, X_test.shape

((63771, 6), (21257, 6))

In [118]:
rf = RandomForestRegressor(max_features='sqrt', n_estimators=700)
rf.fit(X_train,y_train)
y_aa = rf.predict(X_test)
r2_score(y_test,y_aa)

0.818034933223023

In [120]:
def doGridSearch(model,model_name,hyperparams,X,y):
    gs = GridSearchCV(estimator=model,  
                         param_grid=hyperparams,
                         scoring='r2',
                         cv=5,
                         n_jobs=4,
                         verbose=True)

    gs.fit(X, y)
    return gs

In [121]:
hyperparams_RF = {
    "n_estimators": [500,700],
    #"criterion": ["mse", "mae"],
    "max_features": ["auto", "sqrt", "log2"],
    "random_state": [42],# always use the samet random seed
    "n_jobs": [4],# for parallelization
}

In [122]:
gs = doGridSearch(RandomForestRegressor(verbose=True), "RandomForestRegressor",hyperparams_RF,X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  8.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   31.5s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:   49.7s finished


In [123]:
print(f"Best params:\t{gs.best_params_}")
print(f"Best score:\t{gs.best_score_}")

Best params:	{'max_features': 'auto', 'n_estimators': 700, 'n_jobs': 4, 'random_state': 42}
Best score:	0.829227777928935


In [127]:
y_pred = gs.predict(X_eval_v)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    2.3s finished


In [129]:
def get_csv(ids, y):
    pd.DataFrame(dict(Id = ids,Predicted = y)).to_csv("submission_sentiment.csv",sep=",",index=False)

In [130]:
get_csv(list(X_eval.index),y_pred)

In [None]:
### 0.702