In [1]:
import pandas as pd
import gensim
import re
import numpy as np

In [51]:
headers = "country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description, price,product_type".split(",")

# Training data
data_train = pd.read_csv("./data/training/data_train.csv", names=headers)
clarity_train = pd.read_csv("./data/training/clarity_train.labels", names=['clarity_label'])
conciseness_train = pd.read_csv("./data/training/conciseness_train.labels", names=['conciseness_label'])

data_train['clarity_label'] = clarity_train['clarity_label']
data_train['conciseness_label'] = conciseness_train['conciseness_label']

# Validation
data_validate = pd.read_csv("./data/validation/data_valid.csv", names=headers)

In [56]:
def repair_feature_dataframe(data_train, w2v_model, contain_label=False):
    # Title cleaning
    def clean_title(title):
        title = title.lower()
        title = title.replace("/", " / ")
        title = re.sub(r"[^A-z0-9\s-]", "", title)

        return title

    data_train['title_clean'] = data_train.title.apply(clean_title)
    
    selected_label = ['title_clean']
    if contain_label:
        selected_label += ['clarity_label', 'conciseness_label']
    data_train = data_train[selected_label]
    
    # Word2vec title
    def vector_title(title, w2v_model):
        title = title.split(" ")
        vec = np.zeros(300)
        for w in title:
            try:
                vec += w2v_model[w]
            except:
                pass

        return vec

    data_train['vector_title'] = data_train.title_clean.apply(lambda t: vector_title(t, w2v_model))
    return data_train

In [None]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [60]:
data_train = repair_feature_dataframe(data_train, w2v_model, contain_label=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
data_validate = repair_feature_dataframe(data_validate, w2v_model, contain_label=False)

In [59]:
data_train

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type,clarity_label,conciseness_label
0,my,AD674FAASTLXANMY,Adana Gallery Suri Square Hijab – Light Pink,Fashion,Women,Muslim Wear,<ul><li>Material : Non sheer shimmer chiffon</...,49.00,local,1,1
1,my,AE068HBAA3RPRDANMY,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Health & Beauty,Bath & Body,Hand & Foot Care,Formulated with oil-free hydrating botanicals/...,128.00,international,1,1
2,my,AN680ELAA9VN57ANMY,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"TV, Audio / Video, Gaming & Wearables",Audio,Live Sound & Stage,<ul> <li>150cm mini microphone compatible for ...,25.07,international,1,0
3,my,AN957HBAAAHDF4ANMY,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,Health & Beauty,Hair Care,Shampoos & Conditioners,<ul> <li>ANMYNA Complaint Silky Set (Shampoo 5...,118.00,local,1,1
4,my,AR511HBAXNWAANMY,Argital Argiltubo Green Clay For Face and Body...,Health & Beauty,Men's Care,Body and Skin Care,<ul> <li>100% Authentic</li> <li>Rrefresh and ...,114.80,international,1,1
5,my,AS575ELCMZ4WANMY,Asus TP300LJ-DW004H Transformer Book Flip 4GB ...,Computers & Laptops,Laptops,Traditional Laptops,"<div class=""prod_content""> <div class=""prod_de...",2599.00,local,1,1
6,my,AS727ELAA9LLV1ANMY,NG-40C Ring-Shaped 40W 3166lm 5400K Macro Phot...,Cameras,Camera Accessories,Lighting & Studio Equipment,<ul> <li>1. Color Temperature: 5400K</li> <li>...,388.99,international,1,1
7,my,BU512HBAA4WUVTANMY,Buytra Exfoliating Peel Foot Mask 1Pair,Health & Beauty,Bath & Body,Hand & Foot Care,<ul> <li>Reviving like a new born baby.</li> <...,10.40,international,1,1
8,my,CL787ELAW29LANMY,CLiPtec OCC121 Slim Flat USB 3.0 Extension Cab...,Computers & Laptops,Laptops,Traditional Laptops,"<ul style= ""padding: 0px; margin: 20px 0px 0px...",29.00,local,1,1
9,my,CO633HLAABREKOANMY,McDonald's Coke Can Glass Limited Edition 12oz...,Home & Living,Kitchen & Dining,Tableware,<ul> <li>Genuine issued McDonald's Coca Cola m...,25.00,local,1,1


# Checkpoint training data

In [26]:
# data_train.to_csv("./checkpoint/data_train.csv", index=False)

# SVM Model

In [36]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

## 1. SVC

In [None]:
clf_SVC_clarity = svm.SVC()
clf_SVC_conciseness = svm.SVC()


X = data_train.vector_title.tolist()
y_clarity_label = data_train.clarity_label.tolist()
y_conciseness_label = data_train.conciseness_label.tolist()
data_validate_input = data_validate.vector_title.tolist()

# clarity_label ===================
X_train, X_test, y_train, y_test = train_test_split(X, y_clarity_label, test_size=0.1, random_state=42)
clf_SVC_clarity.fit(X_train, y_train)
y_pred = clf_SVC_clarity.predict(X_test)
print 'Accuracy clf_SVC_clarity = ', accuracy_score(y_test, y_pred)
# Validation
validate_result_clarity = clf_SVC_clarity.predict(data_validate_input)


In [63]:
# conciseness_label ===================
X_train, X_test, y_train, y_test = train_test_split(X, y_conciseness_label, test_size=0.1, random_state=43)
clf_SVC.fit(X_train, y_train)
clf_SVC_conciseness.fit(X_train, y_train)
y_pred = clf_SVC_conciseness.predict(X_test)
print 'Accuracy clf_SVC_conciseness = ', accuracy_score(y_test, y_pred)

# Validation
validate_result_conciseness = clf_SVC_conciseness.predict(data_validate_input)

Accuracy clf_SVC_conciseness =  0.838523009093


In [67]:
np.savetxt("submit/conciseness_valid.predict", validate_result_conciseness, "%.4f")
np.savetxt("submit/clarity_valid.predict", validate_result_clarity, "%.4f")

## 2. Support Vector Regression

In [39]:
clf_SVR = svm.SVR()
clf_SVR.fit(X_train, y_train) 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [43]:
y_pred = clf_SVR.predict(X_test)
accuracy_score(y_test, y_pred.round(), normalize=False)

0.94245866043093374

# Validation