In [1]:
import csv
import re
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from BeautifulSoup import BeautifulSoup
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize

def write_submission(filename, predicted_results):
    if not os.path.exists('submission'):
        os.makedirs('submission')
    np.savetxt('submission/' + filename, predicted_results, fmt='%.5f')
    print(filename + ' updated!')

stop = set(stopwords.words('english'))
word_parser = RegexpTokenizer('[A-Za-z]+', flags=re.UNICODE)
digit_checker = re.compile("\d")

In [2]:
df_train = pd.read_csv(filepath_or_buffer='data/training/data_train.csv', 
                 names=['country','sku_id','title','category_lvl_1','category_lvl_2','category_lvl_3','short_description','price','product_type'])

In [3]:
df_train.head()

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type
0,my,AD674FAASTLXANMY,Adana Gallery Suri Square Hijab – Light Pink,Fashion,Women,Muslim Wear,<ul><li>Material : Non sheer shimmer chiffon</...,49.0,local
1,my,AE068HBAA3RPRDANMY,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Health & Beauty,Bath & Body,Hand & Foot Care,Formulated with oil-free hydrating botanicals/...,128.0,international
2,my,AN680ELAA9VN57ANMY,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"TV, Audio / Video, Gaming & Wearables",Audio,Live Sound & Stage,<ul> <li>150cm mini microphone compatible for ...,25.07,international
3,my,AN957HBAAAHDF4ANMY,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,Health & Beauty,Hair Care,Shampoos & Conditioners,<ul> <li>ANMYNA Complaint Silky Set (Shampoo 5...,118.0,local
4,my,AR511HBAXNWAANMY,Argital Argiltubo Green Clay For Face and Body...,Health & Beauty,Men's Care,Body and Skin Care,<ul> <li>100% Authentic</li> <li>Rrefresh and ...,114.8,international


# Predict Clarity

## Exploration

In [4]:
def tokenize_description(description):
    description = BeautifulSoup(description)
    description = description.getText(' ')
    
    tokens = word_parser.tokenize(description)
    
    tokens = [token.lower() for token in tokens]
    tokens = list(filter(lambda t: t not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u'x'], tokens))
    
    return tokens

def tokenize_title(title):
    try:
        title = ''.join(i for i in title if ord(i)<128)
        tokens_ = [word_tokenize(sent) for sent in sent_tokenize(title)]
        
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent

        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        tokens = list(filter(lambda t: t not in punctuation, tokens))
        tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
        
        filtered_tokens = []
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)

        filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

        return filtered_tokens
    except Exception as e:
        print(e)
        
def extract_title_features(titles):
    title_features = np.zeros((len(titles),3))
    
    for index in xrange(len(titles)):
        title = titles[index]
        
        # Check if title contains non-ascii characters
        try:
            title.decode('ascii')
        except:
            title_features[index,0] = 1.
            
        # Compute the number of tokens title contains
        tokens = tokenize_title(title)
        title_features[index,1] = len(title)
        
        # Check if title contains a number
        if digit_checker.search(title):
            title_features[index,2] = 1.
    
    return title_features

In [5]:
# PERFOM tf-idf ON DESCRIPTIONS
tf_idf_desription = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenize_description, ngram_range=(1, 2))

descriptions = df_train['short_description'].replace(np.nan, '')
descriptions = tf_idf_desription.fit_transform(list(descriptions))

In [6]:
# PERFOM tf-idf ON TITLE
tf_idf_title = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenize_title, ngram_range=(1, 2))

titles = tf_idf_title.fit_transform(list(df_train['title']))
df_train['tokenized_title'] = df_train['title'].map(tokenize_title)

In [7]:
# Check tokenization for first 5 listings
for title, tokens in zip(df_train['title'].head(5), df_train['tokenized_title'].head(5)):
    print 'title:', title
    print'tokens:', tokens
    print

title: Adana Gallery Suri Square Hijab – Light Pink
tokens: ['adana', 'gallery', 'suri', 'square', 'hijab', 'light', 'pink']

title: Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz
tokens: ['cuba', 'heartbreaker', 'eau', 'de', 'parfum', 'spray', '100ml/3.3oz']

title: Andoer 150cm Cellphone Smartphone Mini Dual-Headed Omni-Directional Mic Microphone with Collar Clip for iPad iPhone5 6s 6 Plus Smartphones
tokens: ['andoer', '150cm', 'cellphone', 'smartphone', 'mini', 'dual-headed', 'omni-directional', 'mic', 'microphone', 'collar', 'clip', 'ipad', 'iphone5', '6s', 'plus', 'smartphones']

title: ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520ml + Conditioner 250ml)
tokens: ['anmyna', 'complaint', 'silky', 'set', 'shampoo', '520ml', 'conditioner', '250ml']

title: Argital Argiltubo Green Clay For Face and Body 250ml
tokens: ['argital', 'argiltubo', 'green', 'clay', 'face', 'body', '250ml']



In [7]:
# Extract other features of titles
title_features = extract_title_features(df_train['title'])

In [8]:
# CONSTRUCT INPUTS AND OUTPUTS
X = np.concatenate([titles.toarray(),
                    title_features,
                    descriptions.toarray(),
                    pd.get_dummies(df_train['category_lvl_1']).as_matrix(), 
                    pd.get_dummies(df_train['category_lvl_2']).as_matrix(),
                    pd.get_dummies(df_train['category_lvl_3']).as_matrix(),
                    pd.get_dummies(df_train['product_type']).as_matrix(),
                    df_train['price'].as_matrix().reshape(-1,1),
                    (df_train.product_type == 'local').as_matrix().astype(float).reshape(-1,1)
                   ], 
                   axis=1)

y = pd.read_csv("data/training/clarity_train.labels", header=None).as_matrix().ravel()

In [9]:
# SPLIT INTO TRAINING SET AND VALIDATION SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# TRAIN AND EVALUATE THE MODEL
model = LogisticRegression()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

Model RMSE: 0.211910


## Submission

In [11]:
df_valid = pd.read_csv(filepath_or_buffer='data/validation/data_valid.csv', 
                       names=['country','sku_id','title','category_lvl_1','category_lvl_2','category_lvl_3','short_description','price','product_type'])

In [12]:
# PERFOM tf-idf ON DESCRIPTIONS
descriptions = df_valid['short_description'].replace(np.nan, '')
descriptions = tf_idf_desription.transform(list(descriptions))

In [13]:
# PERFOM tf-idf ON TITLE
titles = tf_idf_title.transform(list(df_valid['title']))
df_valid['tokenized_title'] = df_valid['title'].map(tokenize_title)

In [14]:
# Extract other features of titles
title_features = extract_title_features(df_valid['title'])

In [15]:
# CONSTRUCT INPUTS AND OUTPUTS
X_valid = np.concatenate([titles.toarray(),
                          title_features,
                          descriptions.toarray(),
                          pd.get_dummies(df_valid['category_lvl_1']).as_matrix(), 
                          pd.get_dummies(df_valid['category_lvl_2']).as_matrix(),
                          pd.get_dummies(df_valid['category_lvl_3']).as_matrix(),
                          pd.get_dummies(df_valid['product_type']).as_matrix(),
                          df_valid['price'].as_matrix().reshape(-1,1),
                          (df_valid.product_type == 'local').as_matrix().astype(float).reshape(-1,1)
                         ], 
                         axis=1)

In [16]:
# RETRAIN THE MODEL ON THE WHOLE DATASET
model = LogisticRegression()
model.fit(X, y)

predicted_results = model.predict_proba(X_valid)[:, 1]
write_submission('clarity_valid.predict', predicted_results)

clarity_valid.predict updated!


# Predict Conciseness

## Exploration

In [17]:
# CONSTRUCT INPUTS AND OUTPUTS
y = pd.read_csv("data/training/conciseness_train.labels", header=None).as_matrix().ravel()

In [18]:
# SPLIT INTO TRAINING SET AND VALIDATION SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
# TRAIN AND EVALUATE THE MODEL
model = LogisticRegression()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

Model RMSE: 0.354889


## Submission

In [20]:
# RETRAIN THE MODEL ON THE WHOLE DATASET
model = LogisticRegression()
model.fit(X, y)

predicted_results = model.predict_proba(X_valid)[:, 1]
write_submission('conciseness_valid.predict', predicted_results)

conciseness_valid.predict updated!


# Test Code

In [21]:
os.system('zip -j submission submission/*')

0