In [17]:
import pandas as pd
import gensim
from gensim import models

import re
import numpy as np

In [65]:
headers = "country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description, price,product_type".split(",")

# Training data
data_train = pd.read_csv("./data/training/data_train.csv", names=headers)
clarity_train = pd.read_csv("./data/training/clarity_train.labels", names=['clarity_label'])
conciseness_train = pd.read_csv("./data/training/conciseness_train.labels", names=['conciseness_label'])

data_train['clarity_label'] = clarity_train['clarity_label']
data_train['conciseness_label'] = conciseness_train['conciseness_label']

# Validation
data_validate = pd.read_csv("./data/validation/data_valid.csv", names=headers)

In [73]:
data_validate

Unnamed: 0,country,sku_id,title,category_lvl_1,category_lvl_2,category_lvl_3,short_description,price,product_type
0,my,AP564ELASSTWANMY,Apple MacBook Pro MGXC2ZP/A 16GB i7 15.4-inch ...,Computers & Laptops,Laptops,Macbooks,OS X Lion<br> Intel Core i7<br> 15-inch Retina...,12550.00,local
1,my,BR924HBAA5B3TLANMY,BRAND'S® American Ginseng Triple Pack (3x 6's)...,Health & Beauty,Food Supplements,Well Being,<ul> <li>Traditionally used to calm the mind a...,105.00,local
2,my,CA673ELAA5UG3XANMY,Canon EOS M10 Mirrorless Digital Camera 18MP w...,Cameras,Mirrorless,,<div> <ul> <li>18.0MP APS-C CMOS Sensor</li> <...,1588.00,local
3,my,DE759ELAA7QM1XANMY,"Dell LED Monitor 23"" (E2316H)",Computers & Laptops,Computer Accessories,Monitors,"<div class=""prod_content""> <div class=""prod_de...",565.00,local
4,my,ES802OTAABHAY8ANMY,Esprit Tallac Brave Nubuck Sand ES107601001 Be...,Watches Sunglasses Jewellery,Watches,Men,<ul> <li>stainless steel case</li> <li>mineral...,279.00,local
5,my,HP961ELAABF7N7ANMY,"(Refurbished) HP Compaq 3330 Pro MT + 19"" LCD",Computers & Laptops,Desktops Computers,All-purpose,"<ul> <li>Model : HP Compaq 3330 Pro MT + 19"" L...",1259.00,local
6,my,KI402HBAA5QI06ANMY,Kitsui Royal Lipo Trim,Health & Beauty,Food Supplements,Weight Management,<ul> <li>Lose Weight easily and painlessly</li...,38.80,local
7,my,MO554ELCCEXMANMY,Case for Apple iPhone 6 4.7 inch Dynamic Liqui...,Mobiles & Tablets,Accessories,Phone Cases,<ul> <li>Perfect and attractive decorate your ...,32.90,international
8,my,NO037ELAA35QHLANMY,New Car Shape USB 3D Optical Mouse Mice For PC...,Computers & Laptops,Computer Accessories,Mice,<ul> <li>High resolution: 800dpi Frequency for...,19.00,international
9,my,NO037ELAA5MMBPANMY,TPU + PC Combo Hybrid Rugged Dual Layer Grip C...,Mobiles & Tablets,Accessories,Phone Cases,<div> <ul> <li>Hybrid</li> <li>Kickstand</li> ...,36.00,international


In [72]:
def clean_title(title):
    title = title.lower()
    title = title.replace("/", " / ")
    title = re.sub(r"[^A-z0-9\s-]", "", title)

    return title

data_train['title_clean'] = data_train.title.apply(clean_title)
selected_label = ['title_clean', 'clarity_label', 'conciseness_label']
data_train = data_train[selected_label]

AttributeError: 'DataFrame' object has no attribute 'title'

In [68]:
data_train.head()

Unnamed: 0,title_clean,clarity_label,conciseness_label
0,adana gallery suri square hijab light pink,1,1
1,cuba heartbreaker eau de parfum spray 100ml 33oz,1,1
2,andoer 150cm cellphone smartphone mini dual-he...,1,0
3,anmyna complaint silky set shampoo 520ml con...,1,1
4,argital argiltubo green clay for face and body...,1,1


In [22]:
titles = data_train.title_clean.tolist()
sentences = [ models.doc2vec.LabeledSentence(
    words=s.split(" "), tags=[s.replace(" ", "_")]) for s in titles ]

In [30]:
model = gensim.models.Doc2Vec(size=300, window=10, min_count=5, 
                              workers=4,alpha=0.025, min_alpha=0.025)

In [None]:
voca = model.build_vocab(sentences)

In [40]:
model.iter

5

In [41]:
for epoch in range(10):
    model.train(sentences, total_words=1000,epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate`
    model.min_alpha = model.alpha  # fix the learning rate, no decay

In [42]:
model.save("model.doc2vec")


In [52]:
def vector_title(title, w2v_model):
    title = title.split(" ")
    vec = np.zeros(300)
    for w in title:
        try:
            vec += w2v_model.infer_vector(w)
        except:
            pass

    return vec

In [69]:
vector_title("xiaomi phone", model)
data_train['vector_title'] = data_train.title_clean.apply(lambda t: vector_title(t, model))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [74]:
data_validate['title_clean'] = data_validate.title.apply(clean_title)
data_validate['vector_title'] = data_validate.title_clean.apply(lambda t: vector_title(t, model))

In [70]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [75]:
X = data_train.vector_title.tolist()
y_clarity_label = data_train.clarity_label.tolist()
y_conciseness_label = data_train.conciseness_label.tolist()

data_validate_input = data_validate.vector_title.tolist()

In [76]:
clf_SVC_clarity = svm.SVC()
clf_SVC_conciseness = svm.SVC()

# clarity_label ===================
# X_train, X_test, y_train, y_test = train_test_split(X, y_clarity_label, test_size=0.1, random_state=42)
# clf_SVC_clarity.fit(X_train, y_train)
print "Fitting ..."
clf_SVC_clarity.fit(X, y_clarity_label)
# y_pred = clf_SVC_clarity.predict(X_test)
# print 'Accuracy clf_SVC_clarity = ', accuracy_score(y_test, y_pred)
# Validation
print "Predict ..."
validate_result_clarity = clf_SVC_clarity.predict(data_validate_input)
np.savetxt("submit/clarity_valid.predict", validate_result_clarity, "%.4f")

Fitting ...
Predict ...


In [77]:
# conciseness_label ===================
# X_train, X_test, y_train, y_test = train_test_split(X, y_conciseness_label, test_size=0.1, random_state=43)
# clf_SVC.fit(X_train, y_train)
# clf_SVC_clarity.fit(X_train, y_train)

print "Fitting ..."
clf_SVC_conciseness.fit(X, y_conciseness_label)
# y_pred = clf_SVC_conciseness.predict(X_test)
# print 'Accuracy clf_SVC_conciseness = ', accuracy_score(y_test, y_pred)

# Validation
print "Predict ..."
validate_result_conciseness = clf_SVC_conciseness.predict(data_validate_input)

np.savetxt("submit/conciseness_valid.predict", validate_result_conciseness, "%.4f")

Fitting ...
Predict ...


In [None]:
model.build_vocab(it)
