In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score
from prettytable import PrettyTable

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer 
import category_encoders

from sklearn.metrics import r2_score

## PREPROCESSING

In [8]:
def loadData(directory):
    """Read a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    directory : str or file-like
        Despite its name this is the location of a single TSV file; it is
        handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(directory, sep="\t")

def getSentimentLabel(X): 
    """Add a three-level ``sentiment`` column derived from ``quality``.

    Each label is ``int(quality / 30)`` capped at 2, so for non-negative
    scores: 0 for quality < 30, 1 for 30 <= quality < 60, 2 for quality >= 60.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a numeric ``quality`` column. It is modified in place
        (the ``sentiment`` column is added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``sentiment``.

    Raises
    ------
    ValueError
        If a ``quality`` value is NaN (``int(nan)`` is not defined).
    """
    # Builtin min on two scalars; no need to build a NumPy array per row.
    X['sentiment'] = X['quality'].apply(lambda q: min(2, int(q / 30)))

    return X
    
## Binary encoding
def doPreproc(X_d,X_e,labels):
    """Binary-encode categorical columns over two frames, then split them again.

    The two frames are stacked (``X_d`` rows first), the columns named in
    ``labels`` are encoded with ``category_encoders.BinaryEncoder`` fitted on
    the stacked data, and the result is split on whether ``quality`` is NaN.

    Parameters
    ----------
    X_d : pandas.DataFrame
        Frame whose rows are expected to carry a ``quality`` value.
    X_e : pandas.DataFrame
        Frame whose rows are expected to have no ``quality`` value
        (presumably the column is absent, so concat fills NaN - TODO confirm).
    labels : list of str
        Names of the columns to binary-encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``dev``: encoded rows with a non-null ``quality``.
        ``ev``: encoded rows with a null ``quality``, with that column dropped.
        Note the split depends only on ``quality``, not on which input frame
        a row came from.
    """
    stacked = pd.concat([X_d, X_e])

    binary_encoder = category_encoders.BinaryEncoder(cols=labels)
    encoded = binary_encoder.fit_transform(stacked)

    # Rows without a quality score form the evaluation part.
    quality_missing = encoded['quality'].isna()
    dev = encoded.dropna(subset=['quality'])
    ev = encoded[quality_missing].drop(columns=['quality'])

    return dev, ev

In [9]:
## LOADING - ENCODING - SENTIMENT LABELING
# Dev set: remove duplicated rows, then discard the 'region_2' column.
X_dev = (
    loadData('Dataset/dev.tsv')
    .drop_duplicates()
    .drop(columns=['region_2'])
)
# Eval set: only 'region_2' is discarded (no de-duplication here).
X_eval = loadData('Dataset/eval.tsv').drop(columns=['region_2'])

# Binary-encode the categorical columns over dev + eval together.
X_prep_v, X_eval_v = doPreproc(
    X_dev,
    X_eval,
    ['country', 'province', 'variety', 'winery', 'region_1', 'designation'],
)

# Attach the 'sentiment' label column to the encoded dev frame.
X_prep_v = getSentimentLabel(X_prep_v)

## Sentiment Analysis

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords as sw
import Stemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [20]:
# Free-text descriptions: dev rows first, eval rows after them.
X_descriptions = pd.concat([X_dev['description'], X_eval['description']])
# Sentiment labels exist only for the (encoded) dev rows.
y_sentiment_train = X_prep_v['sentiment']

# NLTK's English stop words plus a few extra tokens
# (presumably the fragments word_tokenize yields for contractions/modals -
# TODO confirm).
stop_words_list = [
    *sw.words('english'),
    "'d", "'ll", "'re", "'s", "'ve",
    'could', 'might', 'must', "n't",
    'need', 'sha', 'wo', 'would',
]

In [21]:
# TF-IDF over unigrams and bigrams, tokenised with NLTK's word_tokenize.
# token_pattern=None because a custom tokenizer is supplied.
tfidf_naive = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None, 
                              stop_words=stop_words_list, ngram_range=(1,2))

# Fit on the dev + eval descriptions (`X_descriptions`). The name used here
# before, `X_sentiment`, is not defined in any visible cell and raised a
# NameError on a fresh kernel.
X_tfidf = tfidf_naive.fit_transform(X_descriptions)

In [23]:
# The first len(X_dev) rows of X_tfidf are the dev descriptions (dev was
# concatenated before eval); hold out 25% of them for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf[:len(X_dev)],
    y_sentiment_train,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Linear classifier trained with SGD on the TF-IDF features.
# SGD shuffles the training data, so the seed is fixed (same value as the
# train/test split) to make the reported accuracy reproducible on re-run.
model = SGDClassifier(random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))

In [26]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with scikit-learn's default settings.
# fit() returns the estimator itself, so `knn` is the fitted model.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Accuracy on the held-out split.
print(accuracy_score(y_test, y_pred_knn))

0.7520346238886014


In [None]:
# Same experiment with distance-weighted votes instead of uniform ones.
# Note: this rebinds `knn`, replacing the model from the previous cell.
knn = KNeighborsClassifier(weights='distance').fit(X_train, y_train)
y_pred_knn2 = knn.predict(X_test)

# Accuracy on the held-out split.
print(accuracy_score(y_test, y_pred_knn2))

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so `svc` is the fitted model.
svc = SVC().fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

# Accuracy on the held-out split.
print(accuracy_score(y_test, y_pred_svc))

In [None]:
### 
def getSentimentLabelV2(X): 
    """Add a binary ``sentiment`` column: 1 where ``quality`` >= its mean, else 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a numeric ``quality`` column. It is modified in place
        (the ``sentiment`` column is added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``sentiment``.
    """
    mean_quality = X['quality'].mean()

    def at_or_above_mean(score):
        # A NaN score compares False and therefore maps to 0.
        if score >= mean_quality:
            return 1
        return 0

    X['sentiment'] = X['quality'].apply(at_or_above_mean)

    return X

# Binary (0/1) labels from getSentimentLabelV2. The earlier code called
# getSentimentLabel here, which produced the three-level labels again.
# V2 writes 'sentiment' in place, so it is given a copy to keep the
# three-level column stored in X_prep_v intact.
y_sentiment_01 = getSentimentLabelV2(X_prep_v.copy())['sentiment']

# Same rows, split ratio and seed as the first split; only the labels differ.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_tfidf[:X_dev.shape[0]], y_sentiment_01, test_size=0.25, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN on the second split (X_train2 / y_train2 / X_test2 / y_test2).
# The earlier code fitted and scored on the first split's variables, which
# only repeated the first KNN experiment.
knn_01 = KNeighborsClassifier()
knn_01.fit(X_train2,y_train2)
y_pred_knn01 = knn_01.predict(X_test2)

print(accuracy_score(y_test2,y_pred_knn01))