In [1]:
# In this notebook, we use the polarity, subjectivity and length (measured as the number
# of words) of the reviews as predictor variables for the quality of a wine. We try out
# several classifiers and calculate metrics that give us a taste of their performance.
from __future__ import unicode_literals
from textblob import TextBlob
import pandas as pd
import nltk
import numpy as np


In [2]:
# Load the raw reviews; "," is already read_csv's default separator.
wine = pd.read_csv("../input/winemag-data_first150k.csv")

In [3]:
# Drop exact duplicate rows, then every row with a missing value in any column.
wine = wine.drop_duplicates().dropna()

In [4]:
def polarity_function(review):
    """Return the TextBlob sentiment polarity of a wine description."""
    return TextBlob(review).sentiment.polarity

def subjectivity_function(review):
    """Return the TextBlob sentiment subjectivity of a wine description."""
    return TextBlob(review).sentiment.subjectivity

def words_function(review):
    """Return the length of a wine description as its number of TextBlob words."""
    return len(TextBlob(review).words)

In [5]:
# Sentiment polarity of every description, stored as a new feature column.
wine['polarity'] = wine['description'].apply(polarity_function)

In [6]:
# Sentiment subjectivity of every description, stored as a new feature column.
wine['subjectivity'] = wine['description'].apply(subjectivity_function)

In [7]:
# Word count of every description, stored as a new feature column.
wine['num_words'] = wine['description'].apply(words_function)

In [8]:
def rating_type(score, threshold=88):
    """Map a numeric wine rating to a binary quality label.

    Parameters
    ----------
    score : number
        Rating to classify.
    threshold : number, optional
        Cut-off between the two classes. Defaults to 88, the value that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    int or None
        1 when ``score > threshold`` and 0 when ``score <= threshold``.
        A score for which neither comparison holds (e.g. NaN) yields None,
        exactly as the original implicit fall-through did.
    """
    if score > threshold:
        return 1
    if score <= threshold:
        return 0
    # Neither comparison held (NaN): make the original fall-through explicit.
    return None
# Create the binary target column "quality" from the wine rating in "points".
wine['quality'] = wine['points'].apply(rating_type)

In [9]:
# Keep only the columns inspected below: word count, polarity and the target.
wine_f = wine.loc[:, ['num_words', 'polarity', 'quality']]

In [10]:
# Preview the first five rows of the reduced frame.
wine_f.head(5)

In [11]:
# Pairwise Pearson correlation between word count, polarity and quality.
wine_f.corr(method='pearson')

In [13]:
# Plot the reviews in the (number of words, polarity) plane.
# The colour of each point represents its quality label.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(wine["num_words"], wine["polarity"], c=wine["quality"], s=6)
ax.set_xlabel('number of words in description', fontsize=14)
ax.set_ylabel('description polarity', fontsize=14)
plt.show()

In [14]:
# The number of words in the description is highly correlated with the wine's quality.
# Feature matrix as a NumPy array with columns: polarity, subjectivity, word count.
X = wine[['polarity', 'subjectivity', 'num_words']].values

In [15]:
# Binary target vector.
y = wine.loc[:, 'quality']

In [16]:
# logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic-regression classifier, constructed with no arguments (library defaults).
clf_lr = LogisticRegression()

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [21]:
# Fit the logistic regression on the training split.
clf_lr.fit(X_train,y_train)

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [23]:
# Mean of the 50-fold cross-validation scores on the full data set, in percent.
cross_val_score(clf_lr, X, y, cv=50).mean() * 100

In [24]:
# Standard deviation of the 50-fold cross-validation scores, in percent.
cross_val_score(clf_lr, X, y, cv=50).std() * 100

In [25]:
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
# Distribution of the 50 per-fold scores obtained on the held-out subset.
lr_fold_scores = cross_val_score(clf_lr, X_test, y_test, cv=50)
plt.hist(lr_fold_scores)

In [27]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting recall instead of precision.
precision_score(y_test, clf_lr.predict(X_test))

In [28]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting precision instead of recall.
recall_score(y_test, clf_lr.predict(X_test))

In [29]:
# Pass (y_true, y_pred) so that rows are true labels and columns are predictions;
# the swapped order produced the transposed matrix.
confusion_matrix(y_test, clf_lr.predict(X_test))

In [30]:
from sklearn.metrics import roc_curve, auc
# sklearn.cross_validation has been removed from scikit-learn;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Shuffle and split into training and test sets (seeded so the curve is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf_lr.fit(X_train, y_train)

# Determine the false positive and true positive rates from the
# predicted probability of the positive class (column 1)
fpr, tpr, _ = roc_curve(y_test, clf_lr.predict_proba(X_test)[:,1])

# Calculate the AUC
roc_auc = auc(fpr, tpr)
print(roc_auc)

# Plot the ROC curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')
plt.legend(loc="lower right")
plt.show()

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [32]:
# K-nearest-neighbours classifier voting over 19 neighbours.
clf_knn = KNeighborsClassifier(n_neighbors=19)

In [33]:
# Re-split with a 10% test set; random_state=42 makes the split repeatable.
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1,random_state = 42)

In [34]:
# Fit the KNN classifier on the training split.
clf_knn.fit(X_train,y_train)

In [35]:
# Mean of the 50-fold cross-validation scores computed on (X_test, y_test), in percent.
cross_val_score(clf_knn, X_test, y_test, cv=50).mean() * 100

In [36]:
# Standard deviation of the 50-fold cross-validation scores on (X_test, y_test), in percent.
cross_val_score(clf_knn, X_test, y_test, cv=50).std() * 100

In [37]:
# Finding the most appropriate number of neighbors.
# Each candidate k is scored by 50-fold cross-validation on the training split only, so
# the held-out test rows play no part in choosing the hyper-parameter.
# cross_val_score fits its own clones of the estimator, so no explicit fit is needed here.
# A separate name is used for the candidate so that the clf_knn model fitted earlier is
# not silently replaced by the last candidate (k=49).
acc_list = []
for n in range(1, 50):
    knn_candidate = KNeighborsClassifier(n_neighbors=n)
    mean_acc = np.mean(cross_val_score(knn_candidate, X_train, y_train, cv=50)) * 100
    acc_list.append(mean_acc)

In [38]:
# Mean cross-validation accuracy (percent) against the number of neighbours (1..49).
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(range(1, 50), acc_list)
plt.show()

In [39]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting recall instead of precision.
precision_score(y_test, clf_knn.predict(X_test))

In [40]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting precision instead of recall.
recall_score(y_test, clf_knn.predict(X_test))

In [41]:
# Confusion matrix of the KNN model (this cell previously evaluated clf_lr, a copy-paste
# slip in the KNN section). Arguments are (y_true, y_pred): rows are true labels and
# columns are predictions.
confusion_matrix(y_test, clf_knn.predict(X_test))

In [42]:
# Shuffle and split into training and test sets (seeded so the curve is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf_knn.fit(X_train, y_train)

# Determine the false positive and true positive rates from the
# predicted probability of the positive class (column 1)
fpr, tpr, _ = roc_curve(y_test, clf_knn.predict_proba(X_test)[:,1])

# Calculate the AUC
roc_auc = auc(fpr, tpr)
print(roc_auc)

# Plot the ROC curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for K-nearest neighbors')
plt.legend(loc="lower right")
plt.show()

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
# Random forest with two trees, out-of-bag scoring enabled and a fixed seed.
# NOTE(review): with only 2 estimators some training rows may never be out-of-bag, so the
# OOB estimate is likely unreliable - consider more trees or dropping oob_score; confirm.
clf_rf = RandomForestClassifier(n_estimators=2,oob_score=True,random_state=42)

In [45]:
# Fit the random forest on the current training split.
clf_rf.fit(X_train,y_train)

In [46]:
# Shuffle and split into training and test sets (seeded so the curve is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf_rf.fit(X_train, y_train)

# Determine the false positive and true positive rates.
# The scores must come from the random forest itself; this cell previously used
# clf_knn.predict_proba, so the plotted "Random Forest" curve was really the KNN one.
fpr, tpr, _ = roc_curve(y_test, clf_rf.predict_proba(X_test)[:,1])

# Calculate the AUC
roc_auc = auc(fpr, tpr)
print(roc_auc)

# Plot the ROC curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Random Forest')
plt.legend(loc="lower right")
plt.show()

In [47]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting recall instead of precision.
precision_score(y_test, clf_rf.predict(X_test))

In [48]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting precision instead of recall.
recall_score(y_test, clf_rf.predict(X_test))

In [49]:
# Pass (y_true, y_pred) so that rows are true labels and columns are predictions;
# the swapped order produced the transposed matrix.
confusion_matrix(y_test, clf_rf.predict(X_test))

In [50]:
# Pass (y_true, y_pred) so that rows are true labels and columns are predictions.
# NOTE(review): clf_lr was last fitted on an earlier train/test split; unless every split
# uses the same test_size and random_state, this X_test can contain rows it was trained on.
confusion_matrix(y_test, clf_lr.predict(X_test))

In [51]:
# Pass (y_true, y_pred) so that rows are true labels and columns are predictions.
# NOTE(review): clf_knn was last fitted on an earlier train/test split; unless every split
# uses the same test_size and random_state, this X_test can contain rows it was trained on.
confusion_matrix(y_test, clf_knn.predict(X_test))

In [52]:
# Linear support-vector classifier with C=0.1, followed by a seeded split that
# holds out 20% of the rows as a test set.
clf_svm = LinearSVC(C=.1)
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [53]:
# Fit the linear SVM on the training split.
clf_svm.fit(X_train,y_train)

In [54]:
# clf_svm is already fitted on this training split by the preceding cell, so the
# duplicate fit is dropped here.

# Determine the false positive and true positive rates.
# The scores must come from the SVM itself (this cell previously used
# clf_knn.predict_proba). LinearSVC has no predict_proba, so the signed distance to the
# decision boundary is used as the ranking score; roc_curve accepts such
# non-thresholded decision values.
fpr, tpr, _ = roc_curve(y_test, clf_svm.decision_function(X_test))

# Calculate the AUC
roc_auc = auc(fpr, tpr)
print(roc_auc)

# Plot the ROC curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Support Vector Machine')
plt.legend(loc="lower right")
plt.show()

In [55]:
# Pass (y_true, y_pred) so that rows are true labels and columns are predictions;
# the swapped order produced the transposed matrix.
confusion_matrix(y_test, clf_svm.predict(X_test))

In [56]:
# Fraction of test rows classified correctly; written in the conventional
# (y_true, y_pred) order, which gives the same value because accuracy is symmetric.
accuracy_score(y_test, clf_svm.predict(X_test))

In [57]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting recall instead of precision.
precision_score(y_test, clf_svm.predict(X_test))

In [58]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# this call was actually reporting precision instead of recall.
recall_score(y_test, clf_svm.predict(X_test))

In [59]:
# Distribution of the 50 per-fold scores obtained on the held-out subset.
svm_fold_scores = cross_val_score(clf_svm, X_test, y_test, cv=50)
plt.hist(svm_fold_scores)