In [1]:
# CELL 1
import pandas

# Load the training data; the file is tab-separated and UTF-8 encoded
train_set = pandas.read_csv('./train.csv', sep='\t', encoding='utf-8')

# Keep only the two columns used below, as plain Python lists:
# the review texts and their Color labels
train_reviews = train_set['Review'].to_list()
train_colors = train_set['Color'].to_list()

In [2]:
# CELL 2
import numpy

# The terms whose presence in a review serves as a feature
terms2 = ["fall", "sweet", "plum"]

# Start from an all-zero matrix: one row per training review and
# one column per term in terms2.
# numpy.zeros takes the shape as a single tuple: (rows, columns)
train_features2 = numpy.zeros(shape=(len(train_reviews), len(terms2)))
print(train_features2.shape)

(10000, 3)


In [3]:
# CELL 3
import spacy

nlp = spacy.load('en_core_web_sm')

# Parse the reviews lazily: nlp.pipe yields one processed Doc per review
train_doc_reviews = nlp.pipe(train_reviews)

# Walk the parsed reviews and their feature rows in lockstep (zip)
for review, f in zip(train_doc_reviews, train_features2):
    # lemmas of this review; a set makes each membership check constant-time
    lemmas = {token.lemma_ for token in review}
    # enumerate gives the column index directly, no terms2.index() lookup
    for term_id, term in enumerate(terms2):
        if term in lemmas:
            # f is a row view, so this writes into train_features2
            f[term_id] = 1

In [4]:
#In sklearn, all machine learning models are implemented as Python classes
from sklearn.linear_model import LogisticRegression

# Instantiate LogisticRegression with every parameter left at its default
# and fit it in one go: fit() returns the fitted estimator itself.
# The model learns the relationship between the term-presence features
# (train_features2) and the labels (train_colors).
lr = LogisticRegression().fit(train_features2, train_colors)

# The labels the model can predict, in the order it stores them
print(lr.classes_)

# Every parameter the LogisticRegression model was configured with
print(lr.get_params())

['Red' 'Rose' 'White' 'unk']
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


CELL 5: 
- Open test.csv with pandas and assign reviews and colors to two lists
- Create zero feature vectors for the test set
- Use a new variable name for these features
    

In [5]:
import pandas
# Load the test data; same tab-separated, UTF-8 encoded format as the training file
test_set = pandas.read_csv('./test.csv', sep='\t', encoding='utf-8')

# Keep only the relevant columns, as plain Python lists
test_reviews = test_set['Review'].to_list()
test_colors = test_set['Color'].to_list()

# One all-zero feature row per test review, one column per term in terms2
# (the same column layout as train_features2)
test_features2 = numpy.zeros((len(test_reviews), len(terms2)))



CELL 6:
- Process the test set and update feature vectors



In [6]:
# CELL 6:
import spacy

# Reuse the `nlp` pipeline already loaded in CELL 3; loading the same
# model a second time would only repeat that work.
# nlp.pipe is lazy: it yields one processed Doc per review as the loop consumes it
test_doc_reviews = nlp.pipe(test_reviews)

# Update the feature vectors by checking which terms occur in each review
for review, f_vector in zip(test_doc_reviews, test_features2):
    # lemmas of this review; a set makes each membership check constant-time
    lemmas = {token.lemma_ for token in review}
    # enumerate gives the column index directly, no terms2.index() lookup
    for term_id, term in enumerate(terms2):
        if term in lemmas:
            # f_vector is a row view, so this writes into test_features2
            f_vector[term_id] = 1

CELL 7:
- Define a function predict2, which takes a review index as input and prints:
    - the review
    - the features
    - the terms
    - the correct label
    - the prediction (using the trained model)
    - the probability of each class being predicted (using the trained model)

In [7]:
# CELL 7:
def predict(i):
    """Print test review number ``i`` next to what the trained model says about it.

    Printed, in order: the review text, its feature vector, the list of
    terms, and the correct label; then, under a "Prediction:" heading, the
    label predicted by ``lr`` and the probability it gives for each class.

    Reads the notebook globals ``test_reviews``, ``test_features2``,
    ``terms2``, ``test_colors`` and ``lr``; returns nothing.
    """
    # What the model gets to see, and the answer it should give
    print(test_reviews[i])
    print(test_features2[i])
    print(terms2)
    print(test_colors[i])

    print()
    print("Prediction:")
    # The model expects a batch of samples, so wrap the single row in a list
    sample = [test_features2[i]]
    # First the predicted label, then the per-class probabilities
    for estimate in (lr.predict, lr.predict_proba):
        print(estimate(sample))
    print()

CELL 8
- Using the function predict2, predict the labels for review 0 and review 10


In [8]:
# CELL 8

In [9]:
# Show the model's verdict for test reviews 0 and 10
for review_index in (0, 10):
    predict(review_index)

Leather, spice, tobacco and tea emerge from the nose of this Sicilian blend of Nero d’Avola, Syrah, Merlot, Cabernet and Petit Verdot. You’ll get aromas of clove, allspice and vanilla behind vibrant blueberry and raspberry.
[0. 0. 0.]
['fall', 'sweet', 'plum']
Red

Prediction:
['Red']
[[0.58717328 0.02166135 0.32938762 0.06177775]]

I haven’t been a fan of Santa Ynez Cabs for the simple reason that they’re so seldom ripe. You get this green, herb and mint streak that’s not flattering to Cab’s tannins. This wine is in that vein. 
[0. 0. 0.]
['fall', 'sweet', 'plum']
Red

Prediction:
['Red']
[[0.58717328 0.02166135 0.32938762 0.06177775]]

