In [2]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier

import pandas as pd
import numpy as np
import json

In [3]:
def create_best_prediction(row, valid_names=None):
    """Pick the final ``category_subcategory`` label for one row.

    The row carries two candidate labels:

    * ``cat_sub_p1`` - the label obtained by joining the separately predicted
      category and subcategory with an underscore;
    * ``cat_sub_p2`` - the label predicted directly by the joint model.

    ``cat_sub_p1`` is preferred, but because its two halves are predicted
    independently it can be a combination that is not a known label. In that
    case the function falls back to ``cat_sub_p2``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'cat_sub_p1'`` and ``'cat_sub_p2'``.
    valid_names : container, optional
        Collection of accepted labels (anything supporting ``in``). When
        omitted, the notebook-level ``cat_sub_names`` is used, so existing
        calls such as ``df.apply(create_best_prediction, axis=1)`` keep
        working unchanged.

    Returns
    -------
    The value of ``row['cat_sub_p1']`` if it is an accepted label, otherwise
    the value of ``row['cat_sub_p2']``.
    """
    if valid_names is None:
        # Resolved at call time: `cat_sub_names` is defined in a later cell,
        # so it only has to exist once the function is actually applied.
        valid_names = cat_sub_names

    combined = row['cat_sub_p1']
    return combined if combined in valid_names else row['cat_sub_p2']

In [4]:
# Load the pre-generated train ("alpha") and test ("beta") splits.
# index_col=False keeps every CSV column as data, which is why the index
# saved in the file shows up as the 'Unnamed: 0' column.
alpha_df, beta_df = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        '../Datos/Generados/05train.csv',
        '../Datos/Generados/05test.csv',
    )
)

In [5]:
# Preview the first rows of the training frame to check the loaded columns.
alpha_df.head()

Unnamed: 0,description,cat_sub,category,subcategory,desc
0,sandalia havaianas top disney amar banana 35/36,21_22,21,22,sandaliahavaianas havaianastop sandaliatop san...
1,sandalia havaianas top max cinza aco 45/46,21_22,21,22,sandaliahavaianas havaianastop sandaliatop san...
2,sand havaianas sl org 33/34 areia,21_22,21,22,sandhavaianas havaianassl sandsl sandareia san...
3,sand (f) havaianas t max st 41/42 bc/mar,21_22,21,22,sandfemale femalehavaianas sandhavaianas sandm...
4,sandalia havaianas surf bg/pt 39x40,21_22,21,22,sandaliahavaianas havaianassurf sandaliasurf s...


In [6]:
# Dataset roles: the "alpha" frame is used for fitting, the "beta" frame for
# evaluation. Both names are plain aliases - no copy of the data is made.
train_df, test_df = alpha_df, beta_df

In [7]:
# Bag-of-words counts over the preprocessed 'desc' text ...
cv = CountVectorizer()
cv_fit = cv.fit_transform(train_df.loc[:, 'desc'])

# ... followed by TF-IDF re-weighting of those counts. The fitted transformer
# is kept so that the same IDF weights can be applied to the test set.
tfidf_transformer = TfidfTransformer().fit(cv_fit)
X_train_tfidf = tfidf_transformer.transform(cv_fit)

In [8]:
# Dimensions of the TF-IDF matrix: (training descriptions, vocabulary terms).
X_train_tfidf.shape

(10358, 39890)

In [9]:
# Dimensions of the raw count matrix the TF-IDF matrix was computed from.
cv_fit.shape

(10358, 39890)

In [10]:
# Training: three independent linear SVMs over the same TF-IDF features, one
# per target column, all sharing a single set of hyper-parameters.
svc_params = dict(loss='squared_hinge', C=10, random_state=42, max_iter=100000)

model_cat = LinearSVC(**svc_params).fit(X_train_tfidf, train_df['category'])
model_sub = LinearSVC(**svc_params).fit(X_train_tfidf, train_df['subcategory'])
model_cat_sub = LinearSVC(**svc_params).fit(X_train_tfidf, train_df['cat_sub'])

# Keep `clf` bound to the most recently fitted estimator for any later cell
# that refers to it.
clf = model_cat_sub

In [11]:
# Vectorise the evaluation descriptions with the vocabulary and IDF weights
# learned on the training set (transform only - nothing is re-fitted here).
docs_new = test_df.loc[:, 'desc']
X_new_counts = cv.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [12]:
# Predict category, subcategory and the joint label with their respective models
predicted_cat, predicted_sub, predicted_cat_sub = (
    fitted_model.predict(X_new_tfidf)
    for fitted_model in (model_cat, model_sub, model_cat_sub)
)

In [13]:
# Attach the predictions to the evaluation frame.
test_df['cat_p'], test_df['sub_p'] = predicted_cat, predicted_sub

# p1: joint label assembled from the two separate predictions ("<cat>_<sub>").
test_df['cat_sub_p1'] = (
    test_df['cat_p'].astype(str).str.cat(test_df['sub_p'].astype(str), sep='_')
)
# p2: joint label predicted directly by the combined model.
test_df['cat_sub_p2'] = predicted_cat_sub

In [14]:
# Accuracy report: fraction of evaluation rows where prediction equals truth.
accuracy_report = [
    ('CATEGORIA Y SUBCATEGORIA:', test_df['cat_sub_p1'], test_df['cat_sub']),      # both models combined
    ('CATEGORIA Y SUBCATEGORIA [*]:', test_df['cat_sub_p2'], test_df['cat_sub']),  # single joint model
    ('CATEGORIA:', predicted_cat, test_df['category']),
    ('SUBCATEGORIA:', predicted_sub, test_df['subcategory']),
]
for label, predicted, actual in accuracy_report:
    print(label, np.mean(predicted == actual))

CATEGORIA Y SUBCATEGORIA: 0.7243155093983349
CATEGORIA Y SUBCATEGORIA [*]: 0.7193373959316797
CATEGORIA: 0.9100506394300918
SUBCATEGORIA: 0.7558149515063085


In [15]:
# Read the label map from disk; its keys are the accepted
# "category_subcategory" labels used to validate combined predictions.
# NOTE(review): the values are presumably per-label counts - confirm.
with open('../Datos/Base/cat_sub_counter.json') as counter_file:
    cat_sub_map = json.load(counter_file)

cat_sub_names = list(cat_sub_map)

In [16]:
# Resolve one final label per row: keep the combined prediction when it is a
# known label, otherwise fall back to the joint model's prediction.
test_df['cat_sub_p'] = test_df.apply(create_best_prediction, axis='columns')

In [17]:
# Accuracy of each joint-label column, printed in this order:
#   cat_sub_p  - improved prediction (p1 with fallback to p2)
#   cat_sub_p1 - main prediction (separate models combined)
#   cat_sub_p2 - second prediction (single joint model)
for prediction_column in ('cat_sub_p', 'cat_sub_p1', 'cat_sub_p2'):
    print(np.mean(test_df[prediction_column] == test_df['cat_sub']))

0.7341000772465883
0.7243155093983349
0.7193373959316797
