In [44]:
import csv
import re
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from BeautifulSoup import BeautifulSoup
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import svm

stop = set(stopwords.words('english'))
word_parser = RegexpTokenizer('[A-Za-z]+', flags=re.UNICODE)
digit_checker = re.compile("\d")

In [2]:
def write_submission(filename, predicted_results):
    if not os.path.exists('submission'):
        os.makedirs('submission')
    np.savetxt('submission/' + filename, predicted_results, fmt='%.5f')
    print(filename + ' updated!')

def tokenize_description(description):
    description = BeautifulSoup(description)
    description = description.getText(' ')
    
    tokens = word_parser.tokenize(description)
    
    tokens = [token.lower() for token in tokens]
    tokens = list(filter(lambda t: t not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u'x'], tokens))
    
    return tokens

def tokenize_title(title):
    try:
        title = ''.join(i for i in title if ord(i)<128)
        tokens_ = [word_tokenize(sent) for sent in sent_tokenize(title)]
        
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent

        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        tokens = list(filter(lambda t: t not in punctuation, tokens))
        tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
        
        filtered_tokens = []
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)

        filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

        return filtered_tokens
    
    except Exception as e:
        print(e)
        
def has_non_ascii(title):        
    try:
        title.decode('ascii')
        return 0
    except:
        return 1
    
def has_number(title): 
    title = title.decode('utf-8').encode('ascii', errors='ignore')
    
    if digit_checker.search(title):
        return 1
    else:
        return 0      
    
def has_slash(title): 
    if '\\' in title:
        return 1
    else:
        return 0    
        
def has_duplicates(values):
    # For each element, check all following elements for a duplicate.
    for i in range(0, len(values)):
        for x in range(i + 1, len(values)):
            if values[i] == values[x]:
                return 1
    return 0

def calculate_tfidf_sum(tokens):
    score = 0
    for token in tokens:
        if tfidf.tfidf.get(token) is not None:
            score = score + tfidf.tfidf.get(token)
    return score

def calculate_tfidf_avg(tokens):
    score = 0
    for token in tokens:
        if tfidf.tfidf.get(token) is not None:
            score = score + tfidf.tfidf.get(token)
    if len(tokens)>1:
        mean = score/len(tokens)
    else:
        mean = 0
    return mean

def keyword_density(title):
    count = 0
    length = len(title)
    for word in title:
        if word in tfidf.index:
            count += 1
    density = count/length if count>1 else 0
    return density

# Feature Engineering

In [3]:
df_train = pd.read_csv(filepath_or_buffer='data/training/data_train.csv', 
                 names=['country','sku_id','title','category_lvl_1','category_lvl_2','category_lvl_3','short_description','price','product_type'])

In [4]:
df_train.head()

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type
0,my,AD674FAASTLXANMY,Adana Gallery Suri Square Hijab – Light Pink,Fashion,Women,Muslim Wear,<ul><li>Material : Non sheer shimmer chiffon</...,49.0,local
1,my,AE068HBAA3RPRDANMY,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Health & Beauty,Bath & Body,Hand & Foot Care,Formulated with oil-free hydrating botanicals/...,128.0,international
2,my,AN680ELAA9VN57ANMY,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"TV, Audio / Video, Gaming & Wearables",Audio,Live Sound & Stage,<ul> <li>150cm mini microphone compatible for ...,25.07,international
3,my,AN957HBAAAHDF4ANMY,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,Health & Beauty,Hair Care,Shampoos & Conditioners,<ul> <li>ANMYNA Complaint Silky Set (Shampoo 5...,118.0,local
4,my,AR511HBAXNWAANMY,Argital Argiltubo Green Clay For Face and Body...,Health & Beauty,Men's Care,Body and Skin Care,<ul> <li>100% Authentic</li> <li>Rrefresh and ...,114.8,international


In [6]:
# Compute tf-idf on descriptions
tf_idf_desription = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenize_description, ngram_range=(1, 2))
tf_idf_desription.fit(list(df_train['short_description'].replace(np.nan, '')))

# Compute tf-idf on titles
tf_idf_title = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenize_title, ngram_range=(1, 2))
tf_idf_title.fit(list(df_train['title']))

tfidf = dict(zip(tf_idf_title.get_feature_names(), tf_idf_title.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [36]:
def construct_features(data_frame):
    descriptions = tf_idf_desription.transform(list(data_frame['short_description'].replace(np.nan, '')))
    titles = tf_idf_title.transform(list(data_frame['title']))
    data_frame['tokenized_title'] = df_train['title'].map(tokenize_title)
    
    # Compute title length
    if 'token_num' not in data_frame:
        data_frame['token_num'] = data_frame['tokenized_title'].map(len)

    # Compute new feature - 1 if title has duplicates, 0 if not.
    if 'has_duplicate' not in data_frame:
        data_frame['has_duplicate'] = data_frame['tokenized_title'].map(has_duplicates)

    # Compute tf-idf sum
    if 'tfidf_sum' not in data_frame:
        data_frame['tfidf_sum'] = data_frame['tokenized_title'].map(calculate_tfidf_sum)

    # Compute tf-idf average
    if 'tfidf_avg' not in data_frame:
        data_frame['tfidf_avg'] = data_frame['tokenized_title'].map(calculate_tfidf_avg)

    # Compute title length
    if 'title_length' not in data_frame:
        data_frame['title_length'] = data_frame['title'].map(len)

    # Compute new feature - 1 if title has non-ascii character, 0 otherwise
    if 'has_non_ascii' not in data_frame:
        data_frame['has_non_ascii'] = data_frame['title'].map(has_non_ascii)

    # Compute new feature - 1 if title has a number, 0 otherwise
    if 'has_number' not in data_frame:
        data_frame['has_number'] = data_frame['title'].map(has_number)

    # Compute new feature - 1 if title has a backslash, 0 otherwise
    if 'has_backslash' not in data_frame:
        data_frame['has_backslash'] = data_frame['title'].map(has_number)
    
    X = data_frame[['token_num', 
                  'has_duplicate', 
                  'tfidf_sum', 
                  'tfidf_avg', 
                  'title_length', 
                  'has_non_ascii', 
                  'has_number',
                  'has_backslash']]

    X = np.concatenate([X.as_matrix(),
                        titles.toarray(),
                        descriptions.toarray(),
                        pd.get_dummies(data_frame['category_lvl_1']).as_matrix(), 
                        pd.get_dummies(data_frame['category_lvl_2']).as_matrix(),
                        pd.get_dummies(data_frame['category_lvl_3']).as_matrix(),
                        pd.get_dummies(data_frame['product_type']).as_matrix(),
                        data_frame['price'].as_matrix().reshape(-1,1),
                        (data_frame.product_type == 'local').as_matrix().astype(float).reshape(-1,1)
                       ], 
                       axis=1)
    
    return X

# Predict Conciseness

## Cross-validation

In [32]:
X = construct_features(df_train)
y = pd.read_csv("data/training/conciseness_train.labels", header=None).as_matrix().ravel()

In [33]:
# SPLIT INTO TRAINING SET AND VALIDATION SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# TRAIN AND EVALUATE THE MODEL
model = LogisticRegression()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

Model RMSE: 0.349795


In [None]:
# SPLIT INTO TRAINING SET AND VALIDATION SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# TRAIN AND EVALUATE THE MODEL
model = svm.SVR()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

## Submission

In [37]:
df_valid = pd.read_csv(filepath_or_buffer='data/validation/data_valid.csv', 
                       names=['country','sku_id','title','category_lvl_1','category_lvl_2','category_lvl_3','short_description','price','product_type'])

X_valid = construct_features(df_valid)

In [38]:
# RETRAIN THE MODEL ON THE WHOLE DATASET
model = LogisticRegression()
model.fit(X, y)

predicted_results = model.predict_proba(X_valid)[:, 1]
write_submission('conciseness_valid.predict', predicted_results)

conciseness_valid.predict updated!


# Predict Clarity

## Cross-validation

In [39]:
# CONSTRUCT INPUTS AND OUTPUTS
y = pd.read_csv("data/training/clarity_train.labels", header=None).as_matrix().ravel()

In [40]:
# SPLIT INTO TRAINING SET AND VALIDATION SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [41]:
# TRAIN AND EVALUATE THE MODEL
model = LogisticRegression()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

Model RMSE: 0.218032


## Submission

In [42]:
# RETRAIN THE MODEL ON THE WHOLE DATASET
model = LogisticRegression()
model.fit(X, y)

predicted_results = model.predict_proba(X_valid)[:, 1]
write_submission('clarity_valid.predict', predicted_results)

clarity_valid.predict updated!


# Test Code

In [43]:
os.system('zip -j submission submission/*')

0