Implementation on new data 

In [20]:
import re
import sys
import csv
import time
import json

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
from unidecode import unidecode

from collections import defaultdict
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
STOPWORDS = '''
    het de deze
    en of om te hier nog ook al
    in van voor mee per als tot uit bij
    waar waardoor waarvan wanneer
    je uw ze zelf jezelf
    ca bijv bijvoorbeeld
    is bevat hebben kunnen mogen zullen willen
    gemaakt aanbevolen
    belangrijk belangrijke heerlijk heerlijke handig handige dagelijks dagelijkse
    gebruik allergieinformatie bijdrage smaak hoeveelheid
'''.split()

In [3]:
selected_product_nuts_with_usage = '/Users/elise/Documents/?/? data/new/product_nuts_with_usage_and_product_id.csv'

In [4]:
data_pn_usage = []

with open(selected_product_nuts_with_usage) as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in tqdm_notebook(readCSV):
        data_pn_usage.append(eval(row[0]))




In [5]:
def clean(s):
    if s is None: return None
    # @todo keep '0.50%' and the like (or extract separately) - relevant for alcohol-free
    s = unidecode(s).strip()
    s = re.sub(r'[^A-Za-z0-9\'\s]', '', s, flags=re.MULTILINE)
    s = re.sub(r'\s+', ' ', s, flags=re.MULTILINE)
    return s

In [6]:
def get_brand_name(j):
    '''Return brand name from brand_name or brand_url'''
    s = j.get('brand_name', '').strip()
    if s == '':
        s = j.get('brand_url', '').strip()
        s = re.sub(r'(\Ahttps?://(www\.)?|\Awww\.|\.\w{2,3}\/?\Z)', '', s, flags=re.MULTILINE|re.IGNORECASE)
    return s

In [7]:
def f_name(j):
    f = clean(j.get('name', '').lower())
    # strip brand from front of name, would be twice featurized
    brand_name_clean = clean(get_brand_name(j).lower())
    if brand_name_clean != '' and f.startswith(brand_name_clean):
        f = f[len(brand_name_clean):].strip()

    if f == '': return []
    return f.split()

In [8]:
def f_brand(j):
    f = clean(get_brand_name(j))

    if f == '': return []
    return ['BRN:' + f]

In [9]:
def f_first_ingredient(j):
    if 'ingredients' not in j or len(j['ingredients']) == 0: return []

    f = j['ingredients'][0].strip().lower()

    # we're more interested in whether the ingredient is composed, than its exact content
    if re.search(r'[({:;,\n]', f, flags=re.MULTILINE):
        f = '(COMPOSED)'

    f = clean(f)

    if f == '': return []
    return ['ING:' + f]

In [10]:
def tokenize(j):
    '''Returns array of tokens for product nut dict'''
    return f_name(j) + f_brand(j) + f_first_ingredient(j)

In [11]:
id_tokens = []

for j in tqdm_notebook(data_pn_usage):
    
    tokens = tokenize(j)
    tokens = [s for s in tokens if s not in set(STOPWORDS) and len(s) > 1]

    id_tokens.append({'id': j['id'], 'tokens': tokens, 'usage':j['usage'], 'product_id':j['product_id']})

  _warn_if_not_unicode(string)





What needs to be done

- remove doubles from nuts products
- check that we only include nuts product with wich the usage is linked to at least three products

In [126]:
# removes doubles
tokens = [' '.join(x['tokens']) for x in id_tokens]
ct = Counter(tokens)

no_doubles = []
for x in tqdm_notebook(id_tokens):
    if ct[' '.join(x['tokens'])] == 1:
        no_doubles.append(x)




In [13]:
# makes a dict where each tuple has one usage and all the product_ids linked to it
k = [{x['usage']: x['product_id']} for x in no_doubles]
dd = defaultdict(list)

for d in tqdm_notebook(k): 
    for key, value in d.iteritems():
        dd[key].append(value)




In [14]:
# checks if a usage is linked to at least 3 products
allowed_usage = []
for x in tqdm_notebook(dd.items()):
    if len(set(x[1])) > 2:
        allowed_usage.append(x[0])
        
set_allowed_usage = set(allowed_usage)

complete = []
for x in tqdm_notebook(no_doubles):
    if x['usage'] in set_allowed_usage:
        complete.append(x)





X

In [99]:
text_per_item_pn = [' '.join(tokens['tokens']) for tokens in complete]

In [114]:
vectorizer = CountVectorizer(min_df=1, binary=True)
X = vectorizer.fit_transform(text_per_item_pn)

Y

In [115]:
Y = [x['usage'] for x in complete]

Cross-Validation

In [116]:
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2, random_state=7)

In [117]:
print 'X: ', X.shape, 'Y: ', len(Y)
print 'X: ', X_train.shape, 'Y: ', len(Y_train)
print 'X: ', X_validation.shape, ' Y: ', len(Y_validation)

X:  (109457, 26555) Y:  109457
X:  (87565, 26555) Y:  87565
X:  (21892, 26555)  Y:  21892


In [118]:
clf = LinearSVC(random_state = 2, verbose = 1)
scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='recall_macro') 

print
print "recall macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Accuracy: 0.78 (+/- 0.02)


Validate

In [119]:
clf.fit(X_train, Y_train)
predictions = clf.predict(X_validation)

[LibLinear]

In [122]:
def classifaction_report_csv(report):
    report_data = []
    lines = report.encode('ascii', 'ignore').split('\n')
    #for line in (lines[2:-3] + [lines[-2]]):
    for line in (lines[2:-3] + [lines[-2]]):
        row = {}
        row_data = line.strip().split('  ')
        row_data = [x for x in row_data if x != '']
        row['class'] = row_data[0]
        row['precision'] = row_data[1]
        row['recall'] = row_data[2]
        row['f1_score'] = row_data[3]
        row['support'] = row_data[4]
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    #dataframe.to_csv('classification_report.csv', index = False)
    return dataframe

report = classification_report(Y_validation, predictions)
df = classifaction_report_csv(report)

In [120]:
df

Unnamed: 0,class,f1_score,precision,recall,support
0,Aardappel - Friet/patat,0.91,0.85,0.98,48
1,Aardappel - gratin met kaas,0.80,1.00,0.67,3
2,Aardappel - hele,0.92,1.00,0.86,21
3,Aardappel - zoete,1.00,1.00,1.00,4
4,Aardappelkroketten,0.86,0.75,1.00,3
5,Aardappelpuree,0.92,1.00,0.86,7
6,"Aardappelpuree, mix voor",1.00,1.00,1.00,21
7,Aardappelsalade,0.93,0.93,0.93,15
8,Aardappelschijfjes/krieltjes/partjes gekruid,0.86,0.91,0.81,59
9,Aardappelschijfjes/krieltjes/partjes ongekruid,0.40,0.33,0.50,4


In [121]:
print 'macro recall score: ', recall_score(Y_validation, predictions, average = 'macro')
print 'micro recall score: ', recall_score(Y_validation, predictions, average = 'micro')

macro recall score:  0.786306233157
micro recall score:  0.855837748949


- Reliable Recall Macro Score

In [125]:
report_data = []
lines = report.encode('ascii', 'ignore').split('\n')
for line in (lines[2:-3] + [lines[-2]]):
    row = {}
    row_data = line.strip().split('  ')
    row_data = [x for x in row_data if x != '']
    row['class'] = row_data[0]
    row['precision'] = row_data[1]
    row['recall'] = row_data[2]
    row['f1_score'] = row_data[3]
    row['support'] = row_data[4]
    report_data.append(row)
    
    
recall = []
for d in report_data:
    if d['support'].strip() != '0':
        recall.append(d['recall'])
        
n_recall = [float(x) for x in recall]
print 'macro recall score: ', sum(n_recall) / len(recall)

macro recall score:  0.80061963775
