# LogisticRegression to classify wine color (Red, White, Rose, unk)

In [1]:
# Load the tab-separated training data into a DataFrame.
import pandas

train_set = pandas.read_csv('./train.csv', sep='\t', encoding='utf-8')

# Keep only the two columns used downstream, as parallel Python lists:
# the review text (model input) and the wine colour (target label).
train_reviews = train_set['Review'].to_list()
train_colors = train_set['Color'].to_list()

In [2]:
# Build an all-zero feature matrix: one row per training review, one column per term.
import numpy

# Vocabulary of indicator terms; a term's position in this list is its column index.
terms = [
    "tannin", "cherry", "oak", "fresh", "vanilla",
    "rich", "blackberry", "very", "dry", "spice",
]

# numpy.zeros takes the shape as a single (rows, columns) tuple.
train_features = numpy.zeros(shape=(len(train_reviews), len(terms)))
print(train_features.shape)

(10000, 10)


In [3]:
# Turn each training review into a binary term-indicator vector: column j is
# set to 1 when terms[j] appears among the lemmas of the review's tokens.
import spacy

# Small English pipeline; only the token lemmas are used here.
nlp = spacy.load('en_core_web_sm')

# nlp.pipe processes the texts as a stream of documents. The stream is
# consumed by the loop below, so re-run this whole cell to rebuild the features.
train_doc_reviews = nlp.pipe(train_reviews)

# Walk the reviews and their feature rows in lockstep. Each row is a view into
# train_features, so writing to the row updates the matrix itself.
for review, feature_row in zip(train_doc_reviews, train_features):
    # A set makes each of the term membership tests below constant-time.
    review_lemmas = {token.lemma_ for token in review}
    for term_id, term in enumerate(terms):
        if term in review_lemmas:
            feature_row[term_id] = 1  # mark the term as present in this review

In [4]:
# Fit a logistic-regression classifier on the term-indicator features.
from sklearn.linear_model import LogisticRegression

# All hyper-parameters are left at their defaults. fit() returns the fitted
# estimator, so construction and training are chained. The model learns the
# relationship between the feature vectors (train_features) and the wine
# colour labels (train_colors).
lr = LogisticRegression().fit(train_features, train_colors)

# The labels the model can predict, in the order in which they are stored.
print(lr.classes_)

# The full set of parameters the estimator was configured with.
print(lr.get_params())

['Red' 'Rose' 'White' 'unk']
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [7]:
# Load the tab-separated test data with pandas.read_csv.
import pandas

test_set = pandas.read_csv('./test.csv', sep='\t', encoding='utf-8')

# Extract only the relevant columns and put them in parallel lists.
test_reviews = test_set['Review'].to_list()
test_colors = test_set['Color'].to_list()

# One zero vector per test review, with one entry per term in `terms`.
test_features = numpy.zeros((len(test_reviews), len(terms)))



In [8]:
# Build the test feature vectors the same way as the training ones: column j
# is set to 1 when terms[j] appears among the lemmas of the review's tokens.
import spacy

# Same small English pipeline that the training cell loads.
nlp = spacy.load('en_core_web_sm')

# nlp.pipe processes the texts as a stream of documents. The stream is
# consumed by the loop below, so re-run this whole cell to rebuild the features.
test_doc_reviews = nlp.pipe(test_reviews)

# Update each feature row by checking which terms occur in its review. Each
# row is a view into test_features, so the matrix is updated in place.
for review, feature_row in zip(test_doc_reviews, test_features):
    # A set makes each of the term membership tests below constant-time.
    review_lemmas = {token.lemma_ for token in review}
    for term_id, term in enumerate(terms):
        if term in review_lemmas:
            feature_row[term_id] = 1

In [11]:
def predict(i):
    """Print the test review at index ``i`` alongside the model's prediction.

    Writes, in order: the review text, its feature vector, the list of terms
    the vector is indexed by, the correct colour label, the predicted label,
    and the predicted probability of each label.

    Parameters
    ----------
    i : index into ``test_reviews``, ``test_features`` and ``test_colors``.

    Returns
    -------
    None. Everything is written to standard output.
    """
    print(test_reviews[i])
    print(test_features[i])
    print(terms)
    print(test_colors[i])

    print("\nPrediction:")
    # The model is given a batch of samples, so the single vector is wrapped in a list.
    sample = [test_features[i]]
    print(lr.predict(sample))
    print(lr.predict_proba(sample))
    print()

In [12]:
# Show the correct and predicted labels for the test reviews at indexes 0 and 10.
for review_index in (0, 10):
    predict(review_index)

Leather, spice, tobacco and tea emerge from the nose of this Sicilian blend of Nero d’Avola, Syrah, Merlot, Cabernet and Petit Verdot. You’ll get aromas of clove, allspice and vanilla behind vibrant blueberry and raspberry.
[0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]
['tannin', 'cherry', 'oak', 'fresh', 'vanilla', 'rich', 'blackberry', 'very', 'dry', 'spice']
Red

Prediction:
['Red']
[[0.44869982 0.02516404 0.44414481 0.08199132]]

I haven’t been a fan of Santa Ynez Cabs for the simple reason that they’re so seldom ripe. You get this green, herb and mint streak that’s not flattering to Cab’s tannins. This wine is in that vein. 
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
['tannin', 'cherry', 'oak', 'fresh', 'vanilla', 'rich', 'blackberry', 'very', 'dry', 'spice']
Red

Prediction:
['Red']
[[0.94900612 0.00379494 0.01881589 0.02838305]]



In [13]:
# Round-trip the fitted model through a binary pickle file on disk.
import pickle

# Relative path, so the file is written to the current working directory.
pkl_filename = "lr_model.pkl"

# Serialize the classifier.
with open(pkl_filename, mode='wb') as model_out:
    pickle.dump(lr, model_out)

# Load it back. NOTE: unpickling can execute arbitrary code, so only load
# pickle files that come from a trusted source.
with open(pkl_filename, mode='rb') as model_in:
    pickle_model = pickle.load(model_in)

# The labels the restored model can predict, in the order in which they are stored.
print(pickle_model.classes_)

# The parameters the restored model was configured with.
print(pickle_model.get_params())

['Red' 'Rose' 'White' 'unk']
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
