In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import regexp_tokenize
from nltk.stem.porter import PorterStemmer
import time
from scipy.sparse import csr_matrix

In [2]:
start_time = time.time()
train = pd.read_csv("input/train.tsv", sep='\t')
train = train[:10000]
test = pd.read_csv("input/test.tsv", sep='\t')
test = test[:10000]
print('[{}] Finished to load train/test'.format(time.time() - start_time))

[19.159508228302002] Finished to load train/test


In [3]:
def clean_missing(df):
    df["category_name"] = df["category_name"].replace(np.NaN, "")
    df["brand_name"] = df["brand_name"].replace(np.NaN, "")
    df["item_description"] = df["item_description"].replace(np.NaN, "")
    df["item_description"] = df["item_description"].replace(np.NaN, "")
    return df

start_time = time.time()
train = clean_missing(train)
test = clean_missing(test)

print('[{}] Finished to replace NaN'.format(time.time() - start_time))


[0.1779642105102539] Finished to replace NaN


In [4]:
def split_cat(text):
    if text.count('/') > 1:
        return text.split("/")
    else:
        return (["No Label", "No Label", "No Label"])

def transform_category_name(df):
    df['general_cat'], df['subcat_1'], df['subcat_2'] = \
    zip(*df['category_name'].apply(lambda x: split_cat(x)))
    return df

start_time = time.time()
train = transform_category_name(train)
test = transform_category_name(test)

print('[{}] Finished to transform category_name'.format(time.time() - start_time))


[0.11397743225097656] Finished to transform category_name


In [5]:
stop_words = []
def stem_tokenize(text, stop_words=stop_words):
    stemmer = PorterStemmer()
    tokens = regexp_tokenize(text, pattern=r"[A-Za-z]\w+")
    tokens_wo_sw = [x for x in tokens if x not in stop_words and len(x) > 3]
    tokens_stemmed = [stemmer.stem(x) for x in tokens_wo_sw]
    return tokens_stemmed

# category_name :
tfidf_vectorizer = TfidfVectorizer(tokenizer=stem_tokenize, decode_error='ignore', strip_accents='unicode', max_df=0.95, min_df=0.01)
tfidf_vectorizer.fit(train["category_name"])


TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function stem_tokenize at 0x000000004F4DB950>,
        use_idf=True, vocabulary=None)

In [8]:
X1_sparse = tfidf_vectorizer.transform(train["category_name"])
np.savetxt("x1_sparse.csv", X1_sparse, delimiter=",")

IndexError: tuple index out of range

In [9]:
X1_sparse

<10000x76 sparse matrix of type '<class 'numpy.float64'>'
	with 30178 stored elements in Compressed Sparse Row format>

In [13]:
X1_dense = X1_sparse.todense()

In [10]:
def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

In [11]:
save_sparse_csr("X1_sparse.csv",X1_sparse)

In [14]:
np.savetxt("X1_dense.csv", X1_dense, delimiter=",")

In [13]:
X1_array = X1_sparse.toarray()

In [16]:
type(X1_array)

numpy.ndarray

In [17]:
type(X1_dense)

numpy.matrixlib.defmatrix.matrix

In [None]:


X11 = tfidf_vectorizer.transform(test["category_name"]).todense()
np.savetxt("X1.csv", X1, delimiter=",")
np.savetxt("X11.csv", X11, delimiter=",")

In [21]:
 X4 = pd.get_dummies(train[['item_condition_id', 'shipping']]).as_matrix()

In [23]:
type(X4)

numpy.ndarray

In [24]:
 X_train = np.concatenate((X1_array, X4), axis=1)

In [25]:
X_train.shape

(1000, 81)

In [None]:





    X44 =pd.get_dummies(test[['item_condition_id', 'shipping']]).toarray()
    np.savetxt("X4.csv", X4, delimiter=",")
    np.savetxt("X44.csv", X44, delimiter=",")
    print('[{}] Finished to create X4'.format(time.time() - start_time))

