# Model training

Author: Felicia Liu

Date: August 14, 2019

In [127]:
from copy import deepcopy
import json
import pprint
import random

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from bs4 import BeautifulSoup
import numpy as np
import textacy
from textacy import preprocess
from textacy.vsm import vectorizers
import textacy.tm

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import label_propagation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [12]:
%matplotlib inline

## Data set

In [77]:
with open('product_data_handcorrect_2_100.json', 'r') as file:
    data = json.load(file)

In [112]:
len(product_descriptions)

1000

Retrieve all data (100 items hand-labeled, 900 items pre-labeled with topic modeling).

In [78]:
product_descriptions = []
raw_labels = []
for index, product in enumerate(data):
    if 'description' in product:
        product_descriptions.append(product['description'])
    else:
        print("Missing description for product {}".format(index))
    if 'label' in product:
        raw_labels.append(product['label'])
    else:
        print("Missing label")
        raw_labels.append('Other')

In [None]:
# raw_labels

In [94]:
# Mapping from category to integer
label_mapping = {"Dress": 0, "Dresses": 0, "Tops": 1, "Jeans": 2, "Skirts": 3, "Rompers": 4, "Shoes": 5, "Bags": 6, "Jewelry": 7, " Jewelry": 7, "Swimwear": 8, "Intimates": 9, "Other": 10, "Accessories": 10, " Accessories": 10, '': 10}

In [95]:
num_labels = [label_mapping[label] for label in raw_labels]

In [None]:
# num_labels

In [113]:
labeled_product_descriptions = product_descriptions[:100]
hand_labels = num_labels[:100]

In [114]:
rest_product_descriptions = product_descriptions[100:]
tm_labels = num_labels[100:]

In [115]:
rng = np.random.RandomState(0)
indices = np.arange(len(rest_product_descriptions))
rng.shuffle(indices)

In [116]:
n_total_samples = len(num_labels)
n_labeled_points = 100
max_iterations = 5

In [117]:
unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
f = plt.figure()

<Figure size 432x288 with 0 Axes>

## Convert text data into vector representation

In [108]:
def preprocess_descriptions(descriptions, filter_words):
    preprocessed_descriptions = []
    for description in descriptions:
        if bool(BeautifulSoup(description, "html.parser").find()):
            soup = BeautifulSoup(description)
            description = soup.text
        preprocessed = description.replace('\n', ' ')
        preprocessed = preprocessed.lower()
        for word in filter_words:
            preprocessed = preprocessed.replace(word.lower(), ' ')
        preprocessed = preprocess.normalize_whitespace(preprocessed)
        preprocessed_descriptions.append(preprocessed)
    return preprocessed_descriptions

In [109]:
filter_words = ['J.Crew', 'shipping', 'free', 'available', 'entire', 'selection']

In [110]:
preprocessed_descriptions = preprocess_descriptions(product_descriptions, filter_words)

In [111]:
def vectorize_text(input_texts):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(input_texts)
    print(vectors.shape)
    return vectors

In [106]:
vectors = vectorize_text(preprocessed_descriptions)

(1000, 5281)


## Trying label spreading

Spent a lot of time to try to make this work, but unfortunately ran out of time.

In [122]:
for i in range(max_iterations):
    if len(unlabeled_indices) == 0:
        print("No unlabeled items left to labe.")
        break
    y_train = np.copy(num_labels)
    y_train[unlabeled_indices] = -1
    y_train = y_train
    print(y_train.shape)
    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=20)
    print(vectors.shape)
    vectors = vectors.toarray()
    lp_model.fit(vectors, y_train)
    
    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    
    cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
    
    print(classification_report(true_labels, predicted_labels))
    print(cm)
    
    predicted_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
    uncertainty_ind = np.argsort(predicted_entropies)[::-1]
    uncertainty_ind = uncertainty_ind[np.in1d(uncertainty_index, unlabeled_indices)][:5]
    # ...

(1000,)
(1000, 5281)


IndexError: index 330 is out of bounds for axis 0 with size 330

## Baseline logistic regression model

Train with 100 hand-labeled data points.

In [126]:
preprocessed_labeled = preprocess_descriptions(labeled_product_descriptions, filter_words)

In [128]:
vectorizer = TfidfVectorizer()
vectors_labeled = vectorizer.fit_transform(preprocessed_labeled)

Vectorize the remaining 900 data points.

In [130]:
preprocessed_rest = preprocess_descriptions(rest_product_descriptions, filter_words)

In [132]:
vectors_rest = vectorizer.transform(preprocessed_rest)

In [129]:
model = LogisticRegression()
model.fit(vectors_labeled, hand_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Produce predictions and write to json

In [133]:
preds = model.predict(vectors_rest)

In [135]:
all_labels_final = list(hand_labels) + list(preds)

In [None]:
label_mapping = {"Dress": 0, "Dresses": 0, "Tops": 1, "Jeans": 2, "Skirts": 3, "Rompers": 4, "Shoes": 5, "Bags": 6, "Jewelry": 7, " Jewelry": 7, "Swimwear": 8, "Intimates": 9, "Other": 10, "Accessories": 10, " Accessories": 10, '': 10}

In [136]:
reverse_mapping = dict([[v, k] for k,v in label_mapping.items()])

In [138]:
reverse_mapping[7] = 'Jewelry'
reverse_mapping[10] = 'Other'

In [139]:
data_copy = deepcopy(data)

for product_data, label in zip(data_copy, all_labels_final):
    product_data['label'] = reverse_mapping[label]

In [140]:
with open('product_data_result.json', 'w') as outfile:
    json.dump(data_copy, outfile, indent=4)