In [1]:
import csv
import re
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize

def write_submission(filename, predicted_results):
    if not os.path.exists('submission'):
        os.makedirs('submission')
    np.savetxt('submission/' + filename, predicted_results, fmt='%.5f')
    print(filename + ' updated!')

stop = set(stopwords.words('english'))
word_parser = RegexpTokenizer('[A-Za-z]+', flags=re.UNICODE)
digit_checker = re.compile("\d")

Features:
country
price
level 1 category
level 2 category
title keyword density
title length
has duplicate tokens
sum of tf-idf values per title
average tf-idf value


In [2]:
def tokenize_description(description):
    description = BeautifulSoup(description, "html5lib")
    description = description.getText(' ')
    
    tokens = word_parser.tokenize(description)
    
    tokens = [token.lower() for token in tokens]
    tokens = list(filter(lambda t: t not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u'x'], tokens))
    
    return tokens

def tokenize_title(title):
    try:
        title = ''.join(i for i in title if ord(i)<128)
        tokens_ = [word_tokenize(sent) for sent in sent_tokenize(title)]
        
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent

        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        tokens = list(filter(lambda t: t not in punctuation, tokens))
        tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
        
        filtered_tokens = []
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)

        filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

        return filtered_tokens
    
    except Exception as e:
        print(e)
        
def extract_title_features(titles):
    title_features = np.zeros((len(titles),3))
    
    for index in range(len(titles)):
        title = titles[index]
        
        # Check if title contains non-ascii characters
        try:
            title.decode('ascii')
        except:
            title_features[index,0] = 1.
            
        # Compute the number of tokens title contains
        tokens = tokenize_title(title)
        title_features[index,1] = len(title)
        
        # Check if title contains a number
        if digit_checker.search(title):
            title_features[index,2] = 1.
    
    return title_features

        
def has_duplicates(values):
    # For each element, check all following elements for a duplicate.
    for i in range(0, len(values)):
        for x in range(i + 1, len(values)):
            if values[i] == values[x]:
                return 1
    return 0

def calculate_tfidf_sum(tokens):
    score = 0
    for token in tokens:
        if tfidf.tfidf.get(token) is not None:
            score = score + tfidf.tfidf.get(token)
    return score

def calculate_tfidf_mean(tokens):
    score = 0
    for token in tokens:
        if tfidf.tfidf.get(token) is not None:
            score = score + tfidf.tfidf.get(token)
    if len(tokens)>1:
        mean = score/len(tokens)
    else:
        mean = 0
    return mean

def keyword_density(title):
    count = 0
    length = len(title)
    for word in title:
        if word in tfidf.index:
            count += 1
    density = count/length if count>1 else 0
    return density

In [3]:
df_train = pd.read_csv(filepath_or_buffer='data/training/data_train.csv', 
                 names=['country','sku_id','title','category_lvl_1','category_lvl_2','category_lvl_3','short_description','price','product_type'])

In [4]:
# Compute tf-idf on descriptions
tf_idf_desription = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenize_description, ngram_range=(1, 2))

descriptions = df_train['short_description'].replace(np.nan, '')
descriptions = tf_idf_desription.fit_transform(list(descriptions))

  'Beautiful Soup.' % markup)


In [5]:
# Compute tf-idf on titles
tf_idf_title = TfidfVectorizer(min_df=10, max_features=10000, tokenizer=tokenize_title, ngram_range=(1, 2))

titles = tf_idf_title.fit_transform(list(df_train['title']))
df_train['tokenized_title'] = df_train['title'].map(tokenize_title)

tfidf = dict(zip(tf_idf_title.get_feature_names(), tf_idf_title.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [6]:
# Compute title keyword density
titleKeywordDensity = df_train['tokenized_title'].map(keyword_density)

# Compute title length
titleLength = df_train['tokenized_title'].map(len)

# Compute new feature - 1 if title has duplicates, 0 if not.
hasDuplicateTokens = df_train['tokenized_title'].map(has_duplicates)

# Compute tf-idf sum
tfidfSum = df_train['tokenized_title'].map(calculate_tfidf_sum)

# Compute tf-idf average
tfidfAvg = df_train['tokenized_title'].map(calculate_tfidf_mean)

In [7]:
# Extract other features of titles
title_features = extract_title_features(df_train['title'])

In [9]:
# CONSTRUCT INPUTS AND OUTPUTS
X = np.concatenate([titles.toarray(),
                    title_features,
                    descriptions.toarray(),
                    titleKeywordDensity.as_matrix().reshape(-1,1),
                    titleLength.as_matrix().reshape(-1,1),
                    hasDuplicateTokens.as_matrix().reshape(-1,1),
                    tfidfSum.as_matrix().reshape(-1,1),
                    tfidfAvg.as_matrix().reshape(-1,1),
                    pd.get_dummies(df_train['category_lvl_1']).as_matrix(), 
                    pd.get_dummies(df_train['category_lvl_2']).as_matrix(),
                    pd.get_dummies(df_train['category_lvl_3']).as_matrix(),
                    pd.get_dummies(df_train['product_type']).as_matrix(),
                    df_train['price'].as_matrix().reshape(-1,1),
                    (df_train.product_type == 'local').as_matrix().astype(float).reshape(-1,1)
                   ], 
                   axis=1)

y = pd.read_csv("data/training/clarity_train.labels", header=None).as_matrix().ravel()

Predict Clarity

In [10]:
# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train and evaluate the model
model = LogisticRegression()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

Model RMSE: 0.210334


In [11]:
# Prepare submission

df_valid = pd.read_csv(filepath_or_buffer='data/validation/data_valid.csv', 
                       names=['country','sku_id','title','category_lvl_1','category_lvl_2','category_lvl_3','short_description','price','product_type'])

# Perform tf-idf of descriptions
descriptions = df_valid['short_description'].replace(np.nan, '')
descriptions = tf_idf_desription.transform(list(descriptions))


# Compute tf-idf of title
titles = tf_idf_title.transform(list(df_valid['title']))
df_valid['tokenized_title'] = df_valid['title'].map(tokenize_title)

tfidf = dict(zip(tf_idf_title.get_feature_names(), tf_idf_title.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']


# Compute title keyword density
titleKeywordDensity = df_valid['tokenized_title'].map(keyword_density)

# Compute title length
titleLength = df_valid['tokenized_title'].map(len)

# Compute new feature - 1 if title has duplicates, 0 if not.
hasDuplicateTokens = df_valid['tokenized_title'].map(has_duplicates)

# Compute tf-idf sum
tfidfSum = df_valid['tokenized_title'].map(calculate_tfidf_sum)

# Compute tf-idf average
tfidfAvg = df_valid['tokenized_title'].map(calculate_tfidf_mean)

# Extract other features of titles
title_features = extract_title_features(df_valid['title'])

# Construct inputs and outputs
X_valid = np.concatenate([titles.toarray(),
                          title_features,
                          descriptions.toarray(),
                          titleKeywordDensity.as_matrix().reshape(-1,1),
                          titleLength.as_matrix().reshape(-1,1),
                          hasDuplicateTokens.as_matrix().reshape(-1,1),
                          tfidfSum.as_matrix().reshape(-1,1),
                          tfidfAvg.as_matrix().reshape(-1,1),
                          pd.get_dummies(df_valid['category_lvl_1']).as_matrix(), 
                          pd.get_dummies(df_valid['category_lvl_2']).as_matrix(),
                          pd.get_dummies(df_valid['category_lvl_3']).as_matrix(),
                          pd.get_dummies(df_valid['product_type']).as_matrix(),
                          df_valid['price'].as_matrix().reshape(-1,1),
                          (df_valid.product_type == 'local').as_matrix().astype(float).reshape(-1,1)
                         ], 
                         axis=1)


# Retrain the model on the whole dataset
model = LogisticRegression()
model.fit(X, y)

predicted_results = model.predict_proba(X_valid)[:, 1]
write_submission('clarity_valid.predict', predicted_results)

  'Beautiful Soup.' % markup)


clarity_valid.predict updated!


Predict Conciseness

In [12]:
# CONSTRUCT INPUTS AND OUTPUTS
y = pd.read_csv("data/training/conciseness_train.labels", header=None).as_matrix().ravel()

# SPLIT INTO TRAINING SET AND VALIDATION SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# TRAIN AND EVALUATE THE MODEL
model = LogisticRegression()
model.fit(X_train, y_train)

print("Model RMSE: %f" % mean_squared_error(model.predict_proba(X_test)[:,1], y_test)**0.5)

Model RMSE: 0.343862


In [13]:
# RETRAIN THE MODEL ON THE WHOLE DATASET
model = LogisticRegression()
model.fit(X, y)

predicted_results = model.predict_proba(X_valid)[:, 1]
write_submission('conciseness_valid.predict', predicted_results)

conciseness_valid.predict updated!


In [14]:
os.system('zip -j submission submission/*')

0