In [3]:
# algebra and dataframes
import numpy as np
import pandas as pd

# text processing
import re
import spacy
from gensim.models.phrases import Phrases, Phraser
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer


# hypothesis testing
from scipy.stats import gamma, kstest, lognorm, mannwhitneyu, ks_2samp
from scipy.sparse import hstack, vstack

# charts
import matplotlib.pyplot as plt
import seaborn as sns

# data processing
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer, LabelEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline, make_union, FeatureUnion
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import train_test_split
import category_encoders as ce
from xgboost import XGBRegressor

# measurement metrics
from sklearn.metrics import classification_report, make_scorer, mean_squared_error, mean_squared_log_error

# charting parameters
from pylab import rcParams
rcParams.update({'font.size' : 14, 'legend.fontsize' : "small", 
                 "xtick.labelsize" : 14, "ytick.labelsize" : 14, 
                 "figure.figsize":(9, 6), "axes.titlesize" : 20,
                 "axes.labelsize" : 14, "lines.linewidth" : 3, 
                 "lines.markersize" : 10
                })

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [4]:
seed = 20

In [187]:
data = pd.read_table('../data/train.tsv')
test = pd.read_table('../data/test.tsv')

In [188]:
quick = True

if quick:
    data = data.sample(frac=.1, random_state = seed)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148254 entries, 1065093 to 786483
Data columns (total 8 columns):
train_id             148254 non-null int64
name                 148254 non-null object
item_condition_id    148254 non-null int64
category_name        147588 non-null object
brand_name           84961 non-null object
price                148254 non-null float64
shipping             148254 non-null int64
item_description     148254 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 10.2+ MB


In [8]:
data.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1065093,1065093,Kylie Metal Matte Kymajesty,1,Beauty/Makeup/Lips,,18.0,1,Brand new in box - Never Swatched! Kylie's exo...
407370,407370,Joggers camouflage,2,Kids/Boys (4+)/Bottoms,Arizona,9.0,0,Camouflage jogger pants never worn at all boy'...
688213,688213,LuLaRoe Cassie & Irma Bundle - XL/Medium,1,"Women/Skirts/Straight, Pencil",LuLaRoe,54.0,1,This gorgeous Cassie has a solid black backgro...
155188,155188,Cute Sequins Top,3,Women/Tops & Blouses/Blouse,Maurices,6.0,1,Excellent condition except a lil highlighter m...
1309673,1309673,NEW NYX eyeshadow palette avant pop,1,Beauty/Makeup/Makeup Palettes,NYX,10.0,1,Brand new and sealed NYX eyeshadow palette in ...
565424,565424,Deadrising 3 and a 3 day Xbox live card,3,Vintage & Collectibles/Electronics/Video Game,Xbox,14.0,0,It's in a pretty good condition and used but w...
1043925,1043925,FULL ZIP HOODIE,1,Women/Sweaters/Full Zip,PINK,49.0,0,BRAND NEW VS PINK FULL ZIP HOODIE PRICE IS FIRM
1309082,1309082,9months boy,3,Kids/Boys 0-24 Mos/One-Pieces,Carter's,21.0,0,Baby boy 9m cloths . No stains
966563,966563,Black Nike pro spandex shorts size XL,1,Women/Athletic Apparel/Shorts,Nike,24.0,0,Brand new with tags Black Nike pro spandex sho...
1428151,1428151,Rae Dunn Faith Mug And Blessed Bowl,1,Home/Kitchen & Dining/Coffee & Tea Accessories,Rae Dunn,26.0,0,Rae Dunn Faith mug Blessed bowl bundle


In [9]:
tbl_price = data.groupby(['item_condition_id', 'shipping'])['price'].agg([np.mean, np.median, len])
tbl_price

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,len
item_condition_id,shipping,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,32.744244,23.0,28101.0
1,1,21.445607,14.0,35924.0
2,0,29.063278,19.0,23294.0
2,1,23.734701,15.0,14282.0
3,0,27.993743,18.0,28287.0
3,1,24.061085,15.0,14881.0
4,0,27.108725,16.0,2235.0
4,1,20.142433,12.0,1011.0
5,0,33.615894,21.0,151.0
5,1,31.113636,15.0,88.0


In [15]:
data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].groupby('item_condition_id')['price'].count()

item_condition_id
1    3095
2    1449
3    1312
4      61
5       1
Name: price, dtype: int64

In [16]:
data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].groupby('item_condition_id')['price'].agg([np.mean, np.median, len])

Unnamed: 0_level_0,mean,median,len
item_condition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,38.654766,35.0,3095.0
2,30.806073,25.0,1449.0
3,26.676067,20.0,1312.0
4,19.721311,14.0,61.0
5,4.0,4.0,1.0


In [11]:
def count_len(cell):
    """Return the length of ``cell``, or 0 when the value has no length.

    Parameters
    ----------
    cell : object
        Typically a description string; missing values arrive as NaN/None.

    Returns
    -------
    int
        ``len(cell)`` for sized values, 0 for NaN, None and bare numbers.
        (The previous version fell through and returned ``None`` for any
        non-NaN number, which put mixed None/int values in the column.)
    """
    try:
        return len(cell)
    except TypeError:
        # NaN, None and plain numbers are unsized -> treat as empty
        return 0

In [12]:
data['desc_lengths'] = [count_len(item) for item in data.item_description]

In [17]:
# translate should be fastest punctuation replacement
import string

s = data.item_description.iloc[55]+'!'
translator = str.maketrans('', '', string.punctuation)
s.translate(translator)

'Very durable and comfortable New'

In [19]:
translator = str.maketrans('', '', string.punctuation)

## Text prep

In [109]:
from textblob import Word

nlp = spacy.load('en', disable = ['ner'])

def regex_replace(texts, substitute = '', regex_pattern = r"[^a-zA-Z' ]|'s" ):
    """Strip unwanted characters from each text and normalise case/whitespace.

    Parameters
    ----------
    texts : iterable of str
        Raw strings to clean.
    substitute : str
        Replacement for every match of ``regex_pattern``.
    regex_pattern : str
        Pattern of what to remove. The default keeps ASCII letters,
        apostrophes and spaces, and drops possessive ``'s``.
        (The earlier default used the range ``A-z``, which also kept
        ``[``, ``]``, ``^``, ``_``, the backtick and the backslash.)

    Returns
    -------
    list of str
        Lower-cased texts with every whitespace run collapsed to a single
        space and no leading/trailing whitespace.
    """
    pattern = re.compile(regex_pattern)

    result = []
    for text in texts:
        # literal backslash-n sequences are dropped before substitution,
        # since the corrected default pattern would otherwise leave a stray "n"
        replaced = pattern.sub(substitute, text.replace(r"\n", ''))
        # split/join collapses *all* whitespace runs, not just one pass of
        # double spaces, and also strips both ends
        result.append(' '.join(replaced.split()).lower())

    return result

def join_not(text):
    """Fuse the word ``not`` with the word that follows it (``not good`` -> ``not_good``).

    Only the standalone word ``not`` is matched, so words that merely end in
    "not" (e.g. "knot") are left alone, and any amount of whitespace between
    the two words becomes a single underscore.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with every ``not <word>`` pair joined by ``_``.
    """
    return re.sub(r'\bnot\s+(\w+)', r'not_\1', text)

def reg_replace(text):
    """Clean a single description: drop URLs, non-letters and possessive ``'s``.

    Steps, in order:
      1. each http(s) URL is replaced by a space (only the URL token itself;
         the earlier pattern discarded everything up to the end of the line);
      2. every character other than ASCII letters, apostrophes and spaces
         becomes a space, and the text is lower-cased;
      3. ``'s`` is removed;
      4. whitespace runs are collapsed and the ends are stripped.

    HTML is not stripped here.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    # 1. URLs
    text = re.sub(r'https?://\S+', ' ', text)

    # 2. numbers, symbols, newlines -> space
    text = re.sub(r"[^a-zA-Z' ]", " ", text).lower()

    # 3. removes some possessives, needs improvement
    text = re.sub(r"'s", "", text)

    # 4. collapse every whitespace run (a single replace('  ', ' ') leaves
    #    leftovers when three or more spaces are adjacent)
    return ' '.join(text.split())

                
def clean_text(texts):
    """Tokenise texts with the notebook's spaCy pipeline and return lemma lists.

    A token is dropped when it is punctuation, a pronoun, a determiner that
    is also a stop word, or when its lower-cased text is one of the leftover
    fragments in ``removals``.

    Parameters
    ----------
    texts : iterable of str
        Passed to ``nlp.pipe``; pass a collection of strings, not one string.

    Returns
    -------
    list of list of str
        One list of ``lemma_`` strings per input text.
    """
    removals = {"x", "s", "v", "'s"}

    word_lists = []
    for doc in nlp.pipe(texts, batch_size = 1000):
        words = []
        for token in doc:
            if token.is_punct or token.pos_ == 'PRON':
                continue
            if token.pos_ == 'DET' and token.is_stop:
                continue
            # compare the token's *text*: a Token object is never a member of
            # a set of strings, so testing the Token itself filtered nothing
            if token.text.lower() in removals:
                continue
            words.append(token.lemma_)
        word_lists.append(words)

    return word_lists

def clean_text2(texts):
    """Variant of the spaCy cleaner that lemmatises with TextBlob's ``Word``.

    Filtering uses spaCy: punctuation, pronouns, determiners that are stop
    words, and the fragments in ``removals`` are dropped. Each surviving
    token's text is lemmatised as a verb via ``Word(...).lemmatize('v')``.

    Parameters
    ----------
    texts : iterable of str
        Passed to ``nlp.pipe``.

    Returns
    -------
    list of list of str
        One list of lemmatised words per input text.
    """
    removals = {"x", "s", "v", "'s"}

    word_lists = []
    for doc in nlp.pipe(texts, batch_size = 1000):
        words = []
        for token in doc:
            if token.is_punct or token.pos_ == 'PRON':
                continue
            if token.pos_ == 'DET' and token.is_stop:
                continue
            # membership must be tested on the string, not the Token object,
            # otherwise the removals set is never applied
            if token.text.lower() in removals:
                continue
            words.append(Word(token.text).lemmatize('v'))
        word_lists.append(words)

    return word_lists

def blob_lemmatize(text):
    """Lemmatise every word of ``text`` as a verb using TextBlob.

    Parameters
    ----------
    text : str

    Returns
    -------
    list
        ``word.lemmatize('v')`` for each entry of ``TextBlob(text).words``.
    """
    # the notebook only imports ``Word`` from textblob, so TextBlob itself
    # has to be brought into scope here to avoid a NameError
    from textblob import TextBlob

    return [word.lemmatize('v') for word in TextBlob(text).words]

def remove_stops(unigrams, removals=('not',), join = False):
    """Filter spaCy English stop words out of a token list.

    Parameters
    ----------
    unigrams : iterable of str
        Tokens to filter.
    removals : iterable of str or None
        Words to treat as *non*-stop words for this call (kept in the output
        even though spaCy lists them as stop words). Defaults to ``('not',)``.
    join : bool
        When true, return the surviving tokens joined by single spaces.

    Returns
    -------
    list of str or str

    Notes
    -----
    The exceptions are applied to a local copy of the stop-word set. The
    earlier version removed them from ``spacy.lang.en.STOP_WORDS`` itself,
    permanently changing the shared set for the rest of the session.
    """
    stop_words = spacy.lang.en.STOP_WORDS.difference(removals or ())

    kept = [word for word in unigrams if word not in stop_words]

    return ' '.join(kept) if join else kept

In [32]:
'would' in spacy.lang.en.STOP_WORDS

True

In [54]:
import spacy
from spacy import displacy

# parse the example explicitly: `test` is the test-set DataFrame loaded at the
# top of the notebook, not a spaCy Doc, so it cannot be rendered directly
displacy.render(nlp(data.item_description.iloc[5]), style='dep', jupyter = True)

In [106]:
# use a dedicated name: rebinding `test` would overwrite the test-set DataFrame
sample_doc = nlp(data.item_description.iloc[5])
for token in sample_doc:
    print(token.orth_, token.pos_, token.lemma_, token.tag_, token.is_stop, token.is_punct, token.text)

It PRON -PRON- PRP False False It
's VERB be VBZ False False 's
in ADP in IN True False in
a DET a DT True False a
pretty ADV pretty RB False False pretty
good ADJ good JJ False False good
condition NOUN condition NN False False condition
and CCONJ and CC True False and
used VERB use VBN True False used
but CCONJ but CC True False but
works VERB work VBZ False False works
really ADV really RB True False really
well ADV well RB True False well
! PUNCT ! . False True !
For ADP for IN False False For
Xbox PROPN xbox NNP False False Xbox
one NUM one CD True False one


In [153]:
data.isnull().sum()

train_id                   0
name                       0
item_condition_id          0
category_name            346
brand_name             31692
price                      0
shipping                   0
item_description           0
desc_lengths               0
parsed_descriptions        0
dtype: int64

In [162]:

# join the text fields with a space so the last word of one field is not fused
# with the first word of the next; the description is null-filled like the others
combined_text = (
    data.item_description.fillna('unknown') + ' '
    + data.category_name.fillna('unknown') + ' '
    + data.brand_name.fillna('unknown') + ' '
    + data.name.fillna('unknown')
)

In [169]:
%%time
desc_list = regex_replace(combined_text)
parsed_text = clean_text(desc_list)

CPU times: user 24min 26s, sys: 38min 4s, total: 1h 2min 30s
Wall time: 6min


In [83]:

# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')

  return f(*args, **kwds)


In [112]:
%%time
descs = clean_text(desc_list)

CPU times: user 20min 36s, sys: 31min 51s, total: 52min 28s
Wall time: 4min 42s


In [113]:
%%time
#desc_list = regex_replace(data.item_description)
item_descriptions = clean_text2(desc_list)

CPU times: user 20min 54s, sys: 32min 14s, total: 53min 8s
Wall time: 5min 1s


In [177]:
%%time
# learn frequent bigrams, re-tokenise every description with them and render
# each token list as one space-joined string for the TF-IDF vectorizer
phrase_model = Phraser(Phrases(item_descriptions))
parsed_bigrams = [' '.join(phrase_model[tokens]) for tokens in item_descriptions]
description_matrix = vectorizer.fit_transform(parsed_bigrams)

CPU times: user 22.7 s, sys: 64 ms, total: 22.8 s
Wall time: 22.8 s


In [184]:
%%time
seed = 20
nmf = NMF(n_components = 40, random_state = seed)
#description_W = nmf.fit_transform(description_matrix)

svd = TruncatedSVD(n_components = 40, random_state = seed)
#description_svd = svd.fit_transform(description_matrix)

vectorizer = TfidfVectorizer( max_df = 0.99, min_df = 10, stop_words = 'english')
vectorizer2 = TfidfVectorizer(max_df = 0.2, min_df = 10, stop_words = 'english')


CPU times: user 381 µs, sys: 0 ns, total: 381 µs
Wall time: 304 µs


# Pipes

In [None]:
func = make_pipeline(FunctionTransformer(func = column_select, validate = False), vectorizer)

In [161]:
Pipeline([('vec', vectorizer), ('nmf', nmf)])

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=10,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...iter=200,
  n_components=20, random_state=20, shuffle=False, solver='cd', tol=0.0001,
  verbose=0))])

In [168]:
# method1: TF-IDF followed by NMF and by SVD, outputs concatenated side by side
# (Pipeline is imported here because the top-of-notebook import cell only
#  brings in make_pipeline / make_union / FeatureUnion)
from sklearn.pipeline import Pipeline

pipe1 = FeatureUnion(transformer_list =
    [
    ('nmf_pipe', Pipeline([('vec', vectorizer), ('nmf', nmf)])),
    ('svd_pipe', Pipeline([('vec', vectorizer), ('svd', svd)]))
    ])

In [181]:
data['parsed_descriptions'] = parsed_bigrams
# NOTE(review): the frame has a `name` column, not `names`; the old entry
# excluded nothing. Presumably `name` was meant -- confirm.
cols = [item for item in data.columns.values if item not in ['price', 'name', 'train_id']]

## Custom Classes

In [175]:
from sklearn.base import BaseEstimator, TransformerMixin

# from sklearn's tutorial
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``data[key]`` out of a keyed container.

    Adapted from the scikit-learn tutorial. It lets a pipeline branch start
    from one field (or a list of fields) of a larger container: anything that
    supports ``container[key]`` works, such as a dict of lists, a pandas
    DataFrame or a numpy record array.

    Note the orientation: the container is indexed by *feature* first, so
    ``len(container[key]) == n_samples``. Data organised per sample (for
    example a list of dicts) is not supported; something along the lines of
    ``sklearn.feature_extraction.DictVectorizer`` fits that case better.

    Example::

        >> data = {'a': [1, 5, 2, 5, 2, 8],
                   'b': [9, 4, 1, 4, 1, 3]}
        >> ItemSelector(key='a').transform(data) == data['a']

    Parameters
    ----------
    key : hashable, required
        Key (or list of column names, for a DataFrame) to look up.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self`` so the selector can sit in a pipeline."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` unchanged."""
        return data_dict[self.key]
    
class categorical_means(BaseEstimator, TransformerMixin):
    """Replace each category label by the mean response of its group.

    Groups with ``min_count`` rows or fewer, and labels that are missing, are
    mapped to ``fill_value`` instead of a mean.

    Parameters
    ----------
    min_count : int, default 30
        A group needs strictly more than this many rows to get its own mean.
    fill_value : scalar, default -1
        Value used for labels without a usable group mean.

    Notes
    -----
    NOTE(review): ``fit`` learns nothing; the means are computed from the
    frame handed to ``transform``, which therefore must contain the response
    column. Confirm this is acceptable wherever held-out data is transformed.
    """

    def __init__(self, min_count = 30, fill_value = -1):
        self.min_count = min_count
        self.fill_value = fill_value

    def fit(self, X, y = None):
        """No-op; present for the scikit-learn transformer interface."""
        return self

    def transform(self, X, y = None, col_name = 'category_name', response_var = 'price'):
        """Return the per-row group means of ``response_var`` grouped by ``col_name``."""
        return self.cat_means(X, col_name, response_var)

    def tmp(self, data):
        """Identity helper; returns ``data`` unchanged."""
        return data

    def cat_means(self, data, col_name, response_var):
        """Compute the encoded column.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain ``col_name`` and ``response_var``.
        col_name : str
            Categorical column to encode.
        response_var : str
            Numeric column whose group mean is used.

        Returns
        -------
        list
            One value per row of ``data``: the group mean, or ``fill_value``.
        """
        stats = data.groupby(col_name)[response_var].agg(['size', 'mean'])
        # filter once, then turn the surviving means into a lookup table
        conversion_dict = stats.loc[stats['size'] > self.min_count, 'mean'].to_dict()

        return [conversion_dict.get(name, self.fill_value) for name in data[col_name]]

In [227]:
from sklearn.model_selection import cross_val_score
import time

def rmsle(y, pred_y):
    """Root mean squared logarithmic error.

    Parameters
    ----------
    y : array-like
        True, non-negative target values.
    pred_y : array-like
        Predictions; negative values are clipped to 0 before taking logs.

    Returns
    -------
    float
        ``sqrt(mean((log1p(pred_y) - log1p(y)) ** 2))``.
    """
    pred_y = np.fmax(pred_y, 0)
    # log1p accepts plain lists (``1 + y`` does not) and is more accurate
    # near zero
    log_diff = np.log1p(pred_y) - np.log1p(y)
    return np.sqrt(np.mean(np.square(log_diff)))

def _build_price_pipeline(params, min_samples_leaf):
    """Assemble the feature-union + XGBoost pipeline shared by both objectives.

    Features, concatenated by a FeatureUnion:
      * ``item_description`` -> ``vectorizer`` -> union of ``nmf`` and ``svd``;
      * the raw ``shipping`` and ``item_condition_id`` columns;
      * target-encoded ``category_name``, ``brand_name``, ``cat1``-``cat3``.
    The union is standardised and passed to ``XGBRegressor(**params)``.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBRegressor``.
    min_samples_leaf : int
        Forwarded to the target encoder.
    """
    latent_pipe = make_pipeline(
        ItemSelector('item_description'),
        vectorizer,
        FeatureUnion(transformer_list = [('nmf_pipe', nmf), ('svd_pipe', svd)]),
    )

    feature_pipe = FeatureUnion(transformer_list = [
        ('nmf_svd', latent_pipe),
        ('shipping + condition',
         make_pipeline(ItemSelector(['shipping', 'item_condition_id']))),
        ('target_encode',
         make_pipeline(ItemSelector(['category_name', 'brand_name', 'cat1', 'cat2', 'cat3']),
                       ce.target_encoder.TargetEncoder(return_df = False,
                                                       min_samples_leaf = min_samples_leaf))),
    ])

    return make_pipeline(feature_pipe, StandardScaler(), XGBRegressor(**params))


def function(params = None):
    """Cross-validate the price model on the raw price and return mean RMSLE.

    Parameters
    ----------
    params : dict, optional
        ``XGBRegressor`` keyword arguments; defaults to
        ``{'objective': 'reg:gamma'}`` (built per call rather than shared as a
        mutable default).

    Returns
    -------
    float
        Mean of the 5-fold ``rmsle`` scores on the notebook-level ``data`` frame.
    """
    if params is None:
        params = {'objective': 'reg:gamma'}

    print(params)
    print('start time %s ' % time.ctime())

    final_pipe = _build_price_pipeline(params, min_samples_leaf = 5)

    cv_score = cross_val_score(final_pipe, data, data['price'], cv = 5, scoring = make_scorer(rmsle))

    # displays time for optimization cycle purposes
    print('end time %s: ' % time.ctime())

    mean_score = np.mean(cv_score)
    print(mean_score)

    return mean_score


def function_log(params = None):
    """Cross-validate the price model on ``log1p(price)`` and return mean RMSE.

    RMSE on the log1p-transformed target is the same quantity as RMSLE on the
    raw price, so the result is comparable with ``function``.

    Parameters
    ----------
    params : dict, optional
        ``XGBRegressor`` keyword arguments; defaults to
        ``{'objective': 'reg:gamma'}``.

    Returns
    -------
    float
        Mean over 4 folds of ``sqrt(-neg_mean_squared_error)``.
    """
    if params is None:
        params = {'objective': 'reg:gamma'}

    print(params)
    print('start time %s ' % time.ctime())

    final_pipe = _build_price_pipeline(params, min_samples_leaf = 26)

    cv_score = cross_val_score(final_pipe, data, np.log1p(data['price']), cv = 4, scoring = 'neg_mean_squared_error')

    # displays time for optimization cycle purposes
    print('end time %s: ' % time.ctime())

    mean_rmse = np.mean(np.sqrt(-cv_score))
    print (mean_rmse)

    return mean_rmse

In [222]:

# split "level1/level2/level3" category paths into three parallel lists;
# missing categories and missing levels become 'none'
cat_splits = data.category_name.str.split('/')
cat1 = []
cat2 = []
cat3 = []
for item in cat_splits:
    # a missing category comes back as a non-list (NaN); short paths are padded
    # so indexing never raises IndexError
    parts = list(item[:3]) if isinstance(item, list) else []
    parts += ['none'] * (3 - len(parts))
    cat1.append(parts[0])
    cat2.append(parts[1])
    cat3.append(parts[2])

In [217]:
data['cat1'] = cat1
data['cat2'] = cat2
data['cat3'] = cat3

In [115]:
data.item_description = data.item_description.fillna('unknown')

In [228]:
%%time 
result = function_log()

{'objective': 'reg:gamma'}
start time Mon Aug 20 15:12:17 2018 
end time Mon Aug 20 15:23:59 2018: 
0.5721938206009908
CPU times: user 12min 13s, sys: 1min 46s, total: 14min
Wall time: 11min 41s


In [225]:
result

0.5677420188311832

In [144]:
old = {
        'max_depth': hp.choice('max_depth', np.arange(10, 30, dtype=int)),
        'min_child_weight': hp.quniform ('min_child', 1, 20, 1),
        'subsample': hp.uniform ('subsample', 0.8, 1),
        'n_estimators' : hp.choice('n_estimators', np.arange(1000, 10000, 100, dtype=int)),
        'learning_rate' : hp.quniform('learning_rate', 0.025, 0.5, 0.025),
        'gamma' : hp.quniform('gamma', 0.1, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05)
}

In [197]:
from hyperopt import fmin, tpe, hp

best = fmin(
    
    fn = function_log,
    
space = {
        'objective': 'reg:gamma',
        'n_estimators': hp.choice('n_estimators', np.arange(100, 300, 50)),
        'learning_rate': hp.quniform('learning_rate', 0.1, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(2, 10, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'gamma': hp.quniform('gamma', 0, 1, 0.05),
        'n_jobs': 12,
        'random_state': 42
        },
    
    algo = tpe.suggest,
    max_evals = 10
           )



{'gamma': 0.05, 'learning_rate': 0.30000000000000004, 'max_depth': 9, 'min_child_weight': 5.0, 'n_estimators': 150, 'n_jobs': 12, 'objective': 'reg:gamma', 'random_state': 42}
start time Mon Aug 20 14:08:53 2018 
end time Mon Aug 20 14:21:54 2018: 
0.5722013447893856
{'gamma': 0.9, 'learning_rate': 0.35000000000000003, 'max_depth': 7, 'min_child_weight': 5.0, 'n_estimators': 100, 'n_jobs': 12, 'objective': 'reg:gamma', 'random_state': 42}
start time Mon Aug 20 14:21:54 2018 


KeyboardInterrupt: 

In [360]:
best

{'gamma': 0.30000000000000004,
 'learning_rate': 0.125,
 'max_depth': 6,
 'min_child_weight': 5.0,
 'n_estimators': 0}

In [143]:
space={
        'n_estimators': hp.choice('n_estimators', np.arange(100, 300, 50)),
        'learning_rate': hp.quniform('learning_rate', 0.1, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(2, 10, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'gamma': hp.quniform('gamma', 0, 1, 0.05),
        'n_jobs': 12,
        'random_state': 42
    }


from hyperopt import space_eval
best_params = space_eval(space, best)

NameError: name 'best' is not defined

In [268]:
np.sqrt(np.mean(-cv_score))

35.875792741605146

In [241]:
np.sqrt(np.mean(-cv_score))

36.33422351942155

In [204]:
data.shape

(148254, 9)

In [125]:
pipe1.fit(parsed_bigrams)

FeatureUnion(n_jobs=12,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=10,
    ...runcatedSVD(algorithm='randomized', n_components=20, n_iter=5,
       random_state=20, tol=0.0))]))],
       transformer_weights=None)

In [176]:
ItemSelector('item_description').transform(data)

1065093    Brand new in box - Never Swatched! Kylie's exo...
407370     Camouflage jogger pants never worn at all boy'...
688213     This gorgeous Cassie has a solid black backgro...
155188     Excellent condition except a lil highlighter m...
1309673    Brand new and sealed NYX eyeshadow palette in ...
565424     It's in a pretty good condition and used but w...
1043925      BRAND NEW VS PINK FULL ZIP HOODIE PRICE IS FIRM
1309082                       Baby boy 9m cloths . No stains
966563     Brand new with tags Black Nike pro spandex sho...
1428151               Rae Dunn Faith mug Blessed bowl bundle
68447      Features: 100% Brand New. Material: Neoprene a...
91265      Fenty flip flop! Super cozy, awesome color! Co...
77999                                   Green Day band shirt
975978     From ipsy bag new never used never swatched it...
1066167    NWOT 2PC VS PINK V-NECK LS TEE AND PINK NATION...
524248     Funko Pop! Jimi Hendrix exclusive figure with ...
1151062    ☆PERFECT MOTH

In [126]:
result = pipe1.transform(parsed_bigrams)

In [129]:
result.shape

(148254, 40)

In [140]:
# XGBRegressor is imported directly at the top of the notebook; there is no `xgb` alias
final_pipe = make_pipeline(pipe1, XGBRegressor())

In [7]:
from sklearn.pipeline import FeatureUnion, Pipeline

In [146]:
#make_union(make_pipeline(vectorizer, nmf), validate=False)
FeatureUnion([pipe1, xgb.XGBRegressor])

TypeError: zip argument #1 must support iteration

In [None]:
description_svd.shape

In [None]:
type(description_matrix)

In [None]:
description_W.shape

In [None]:
data.shape

In [None]:
trn_idx, test_idx = train_test_split(range(len(data)), test_size = 0.3, random_state = seed)

## Custom classes

In [None]:
cat = categorical_means()

In [None]:
cat.transform(data, col_name = 'brand_name', response_var = 'price')

In [None]:
make_pipeline(cat)

In [None]:
data['category_means'] = cat_group_means(data)

In [None]:

grouped_data = data.groupby('brand_name')['price'].agg([len, np.mean, np.median])

In [None]:
cat_group_means(data, col_name = 'brand_name')

In [None]:
data.shape

In [None]:
data.head()

In [None]:
pd.concat([pd.DataFrame(description_W), pd.DataFrame(description_svd), data.reset_index(drop=True)], axis = 1)

In [None]:
pd.concat([pd.DataFrame(description_W), pd.DataFrame(description_svd), data.reset_index(drop=True)], axis = 1).to_csv('../processed/converted_array.csv')

In [None]:
data.info()

In [None]:
np.isnan('arg')

In [None]:
clean_text(np.nan)

In [None]:

# clean_text expects a collection of texts and returns one token list per text;
# calling it once per brand string made it iterate over single characters
data['cleaned_brands'] = clean_text(data.brand_name.fillna('unknown'))

In [None]:
bigram_dict = {}
for item, score in phrases:
    bigram_dict[item] = score

In [None]:
bigram_dict

## Entity recognition example

In [None]:
doc = """The White House climb-down from President Donald Trump’s disastrous news conference with his Russian counterpart Vladimir Putin began Monday night, just hours after Trump said he saw “no reason” why Russia would have meddled in the 2016 election.

With even The Wall Street Journal’s editorial board — normally intensely loyal to Trump — joining in widespread criticism of the president’s implicit public rejection of U.S. intelligence claims, the White House circulated talking points to supporters saying that Trump still had great confidence in his intelligence agencies and that he believed their assessment that the Kremlin actively influenced the vote.

But the president himself emerged on Tuesday to personally walk back his statements in Helsinki, using a scheduled meeting with members of Congress to discuss tax reform as a platform for revising the statements that set off a 24-hour firestorm.

“In a key sentence in my remarks, I said the word ‘would’ instead of ‘wouldn’t,’” Trump said. “The sentence should have been — and I thought it would be maybe a little bit unclear on the transcript or unclear on the actual video — the sentence should have been: I don’t see any reason why it wouldn't be Russia. Sort of a double negative.”

"""

In [None]:
# the notebook-wide `nlp` is loaded with disable=['ner'], so it would yield no
# entities; load a pipeline with NER enabled for this example
ner_nlp = spacy.load('en')
doc = ner_nlp(doc)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
sns.distplot(data.desc_lengths)

In [None]:
sns.distplot(sample)

In [None]:
?lognorm

In [None]:
# fit a gamma dist
fit_alpha, fit_loc, fit_beta=gamma.fit(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price)
print(fit_alpha, fit_loc, fit_beta)

In [None]:
gamma.fit(list(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price))

In [None]:
lognorm._fitstart(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price)

In [None]:
lognorm.fit(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price)

In [None]:
?lognorm.fit

In [None]:
?kstest

In [None]:
ks_2samp(np.array(data.price[data.brand_name.isnull()]), np.array(data.price[data.brand_name.notnull()]))

In [None]:
# non-parametric test for two independent samples
mannwhitneyu(data.price[data.brand_name.isnull()], 
                       data.price[data.brand_name.notnull()])

In [None]:
sns.distplot(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price, bins = 100)
plt.xlim(0, 200)

In [None]:
pd.pivot_table(data, values = 'price', index = 'brand_name', columns = 'shipping', aggfunc=(np.median, len)).columns

In [None]:
# tuple to select multi-level index
pd.pivot_table(data, values = 'price', index = 'brand_name', columns = 'shipping', aggfunc=(np.median, len))[('len', 0)]

In [None]:
brand_tbl = pd.pivot_table(data, values = 'price', index = 'brand_name', aggfunc=('mean', np.median, len), dropna=False).sort_values('len', ascending=False)
brand_tbl[brand_tbl.len > 30].sort_values('median', ascending=False)

In [None]:
data.brand_name.isnull().sum()

In [None]:
data.brand_name.value_counts(dropna=False, normalize=True)

In [None]:
# share of rows without a brand, computed directly instead of reading the first
# entry of value_counts (which only works while NaN is the most frequent value)
null_proportion = data.brand_name.isnull().mean()
1 - null_proportion

In [None]:
data['brand_name_null'] = data.brand_name.isnull()*1
sns.kdeplot(data.price[data.brand_name.notnull()])

In [None]:


fig, ax = plt.subplots(sharex=True, sharey=True)

# price distributions for rows without ("0") and with ("1") a brand name.
# The loop variable must not be called `data`: that would replace the
# DataFrame with a price Series for every later cell.
price_groups = [data.price[data.brand_name.isnull()],
                data.price[data.brand_name.notnull()]]

for prices, label in zip(price_groups, ["0", "1"]):
    sns.kdeplot(prices, ax=ax, label = label, shade = True)
    #sns.distplot(prices,  ax=ax, label = label)

#ax.set_xlim([-5, 250])

plt.show()

In [None]:
data.price.describe()

In [None]:
np.median(data.price)

In [None]:
pd.pivot_table(data.fillna(-1), values = 'price', index = 'brand_name', aggfunc=(np.median, len), dropna=False).sort_values('len', ascending=False)

In [None]:
# picks categorical columns, check for diff datasets
def get_categoricals(data):
    """Return the names of all ``object``-dtype columns of ``data``, in column order."""
    return [column for column, dtype in data.dtypes.items() if dtype == 'object']

def encode_column(train_column, test_column, fillna = False):
    """Integer-encode a categorical column consistently for train and test.

    Missing values are replaced by the string ``'unknown'``. The encoder is
    fitted on the training labels plus ``'unknown'``; test labels that never
    occur in training are mapped to ``'unknown'`` as well, because
    ``LabelEncoder.transform`` raises on labels it has not seen.

    Parameters
    ----------
    train_column, test_column : pandas.Series
        The same categorical column from the train and test frames.
    fillna : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    train_column_le : numpy.ndarray
        Integer codes for the training column.
    test_column_le : numpy.ndarray
        Integer codes for the test column.
    translation_dict : dict
        Maps each integer code back to its label.
    """
    # le does not work with nan
    train_column = train_column.fillna('unknown')
    test_column = test_column.fillna('unknown')

    le = LabelEncoder()
    # always register 'unknown' so unseen test labels have a code to fall back on
    le.fit(list(train_column) + ['unknown'])

    test_column = test_column.where(test_column.isin(le.classes_), 'unknown')

    train_column_le = le.transform(train_column)
    test_column_le = le.transform(test_column)
    translation_dict = dict(enumerate(le.classes_))
    return train_column_le, test_column_le, translation_dict

# categorical to column arrays wide
def label_convert(train_column, test_column):
    """Binarise one categorical column for train and test with a shared encoder.

    Missing values are replaced by ``'unknown'``; the ``LabelBinarizer`` is
    fitted on the training column only.

    Returns
    -------
    tuple
        ``(converted_train, converted_test, translation_dict)`` where the
        dict maps the position of each class in ``classes_`` to its label.
    """
    train_filled = train_column.fillna('unknown')
    test_filled = test_column.fillna('unknown')

    binarizer = LabelBinarizer().fit(train_filled)
    class_lookup = dict(enumerate(binarizer.classes_))

    return binarizer.transform(train_filled), binarizer.transform(test_filled), class_lookup

# keep cats in one column
def label_convert_cols(train, test):
    """Integer-encode every object column, one output column per input column.

    Returns
    -------
    tuple
        ``(translation_dicts, train_arrays, test_arrays)``: a dict keyed by
        column name holding each column's code-to-label mapping, and the
        encoded train/test columns stacked into ``(n_rows, n_columns)`` arrays.
    """
    translation_dicts = {}
    train_codes, test_codes = [], []

    for column in get_categoricals(train):
        encoded_train, encoded_test, lookup = encode_column(train[column], test[column])
        train_codes.append(encoded_train)
        test_codes.append(encoded_test)
        translation_dicts[column] = lookup

    # each encoded column is a row of the stack, so transpose to rows x columns
    return translation_dicts, np.vstack(train_codes).T, np.vstack(test_codes).T

# binarises the given categorical columns and stacks them side by side
# (np.hstack is used, so the result is a dense array)
def convert_columns(train, test, cols):
    """Apply ``label_convert`` to each column in ``cols`` and concatenate the results.

    Returns
    -------
    tuple
        ``(translation_dicts, stacked_train_arrays, stacked_test_arrays)``;
        the dict is keyed by column name, the arrays are the horizontally
        stacked indicator columns for train and test.
    """
    translation_dicts = {}
    train_blocks, test_blocks = [], []

    for column in cols:
        train_block, test_block, lookup = label_convert(train[column], test[column])
        translation_dicts[column] = lookup
        train_blocks.append(train_block)
        test_blocks.append(test_block)

    return translation_dicts, np.hstack(train_blocks), np.hstack(test_blocks)

def process_categoricals(train, test):
    """Binarise every object-dtype column of ``train`` (and the same columns of ``test``).

    Returns the ``(category_dicts, train_arrays, test_arrays)`` triple
    produced by ``convert_columns``.
    """
    category_dicts, train_arrays, test_arrays = convert_columns(
        train, test, get_categoricals(train))
    return category_dicts, train_arrays, test_arrays

def combine_num_cats(train, test, wide=True):
    """Build model matrices from the numerical and encoded categorical columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        ``test`` must contain every non-object column of ``train``.
    wide : bool
        Truthy: categoricals are binarised (``process_categoricals``).
        Falsy: each categorical becomes one integer column
        (``label_convert_cols``).

    Returns
    -------
    tuple
        ``(combined_dict, train_matrix, test_matrix)``. The dict maps each
        numerical column to the string ``'numerical_column'`` and each
        categorical column to its code-to-label mapping; the matrices hold
        the numerical columns followed by the encoded categoricals.
    """
    categorical_cols = get_categoricals(train)
    num_cols = [column for column in train.columns if column not in categorical_cols]

    encode = process_categoricals if wide else label_convert_cols
    category_dicts, train_arrays, test_arrays = encode(train, test)

    # numerical entries first, then the per-category lookups
    combined_dict = dict.fromkeys(num_cols, 'numerical_column')
    combined_dict.update(category_dicts)

    train_matrix = np.hstack([train[num_cols].values, train_arrays])
    test_matrix = np.hstack([test[num_cols].values, test_arrays])

    return combined_dict, train_matrix, test_matrix


# maps matrix column positions to readable names
def flatten_cols(column_dict):
    """Expand a column dictionary into ``{position: column_name}``.

    Parameters
    ----------
    column_dict : dict
        Maps a column name either to a non-dict marker (one output column
        named after the key) or to a ``{code: label}`` dict.

    Returns
    -------
    dict
        Consecutive integer positions starting at 0, in iteration order:
          * a non-dict entry yields its key;
          * a dict with more than two labels yields ``<key>_<label>`` per label;
          * a dict with exactly two labels yields one name, for the second label;
          * a dict with one label yields one name for that label (this case
            used to raise IndexError);
          * an empty dict yields nothing.
        Labels are formatted with ``%s``, so non-string labels are accepted.
    """
    flattened_columns = {}

    for column, classes in column_dict.items():
        if isinstance(classes, dict):
            labels = list(classes.values())
            if len(labels) == 2:
                # binary column: a single indicator named after the second label
                labels = labels[1:]
            names = ['%s_%s' % (column, label) for label in labels]
        else:
            names = [column]

        for name in names:
            # the next free position equals the number of names stored so far
            flattened_columns[len(flattened_columns)] = name

    return flattened_columns

# def encode_column(column, fillna = False):
    
#     le = LabelEncoder()
    
#     # le does not work with nan, replace with string unknown
#     column = column.fillna('unknown')
#     le.fit(column)
#     column_le = le.transform(column)
    
#     if fillna == False:
#         # replace unknown with nan, nan must replace float value
#         column_le = column_le.astype('float')
#         idx = [i for i, value in enumerate(column_le) if column_le[i]=='unknown']
#         column_le[idx] = np.nan

#     zip_obj = zip(list(column), column_le)
#     return column_le, dict(zip_obj)

In [59]:
data.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          1607
brand_name           158259
price                     0
shipping                  0
item_description          1
dtype: int64

In [None]:
# brands present in train but absent from test. `name in series` tests the
# Series *index*, not its values, so compare sets of values instead.
brand_name_miss = set(data.brand_name.dropna()) - set(test.brand_name.dropna())

In [None]:
len(brand_name_miss)

In [None]:
# encode_column returns (train codes, test codes, code->label dict);
# only the train codes and the dict are needed here
brands_le, _, brand_dict = encode_column(data.brand_name, test.brand_name, fillna = False)
category_le, _, category_dict = encode_column(data.category_name, test.category_name, fillna = False)

In [None]:
data['brands_le'] = brands_le
data['category_le'] = category_le

In [None]:
# .values instead of .as_matrix(), which no longer exists in current pandas
data_x = data[['item_condition_id', 'category_le', 'brands_le', 'shipping']].values
data_y = data.price.values

In [None]:
np.array(data.price)

In [None]:
?xgb.DMatrix

In [None]:
np.isnan(data_x).sum()

In [None]:
np.any(1)

In [None]:
def log_fix(array):
    """Natural log of a 1-D numeric sequence, with zeros mapped to 0 instead of -inf.

    Returns a float ``numpy.ndarray`` of the same length as ``array``.
    """
    values = np.asarray(array, dtype=float)
    logged = np.zeros(len(values))

    # only take the log where the entry is non-zero; zeros keep the 0 fill
    nonzero = values != 0
    logged[nonzero] = np.log(values[nonzero])
    return logged
        

In [None]:
data_y_ln = log_fix(data_y)

In [None]:
data_y_ln[0:10]

In [None]:
# training data splitting function
# def data_split(data, proportion = .2):
#     cutoff = round(len(data) * (1-proportion))
#     return data[0:cutoff], data[cutoff:]

# one call splits features, raw target and log target with the same row
# assignment; train_test_split returns a (train, validation) pair per input array
train_x, val_x, train_y, val_y, train_y_ln, val_y_ln = train_test_split(
    data_x, data_y, data_y_ln, test_size = .3, random_state = 42)


In [None]:
import xgboost as xgb

# keep train_x as the raw feature matrix: rebinding it to a DMatrix made the
# second DMatrix wrap a DMatrix. The names match the ddata / ddata_ln used by
# the training cells below.
ddata = xgb.DMatrix(train_x, train_y)
ddata_ln = xgb.DMatrix(train_x, train_y_ln)

In [None]:
np.count_nonzero(~np.isnan(data_x))


In [None]:
import xgboost as xgb

params = {'min_child_weight': 20, 'max_depth': 7,
            'subsample': 0.91, 'lambda': 2.01, 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear','tree_method': 'auto'}

# xgboost has no `data` function; the booster is fitted with xgb.train (100 rounds)
model = xgb.train(params, ddata, 100)



In [None]:
model.get_score(importance_type='gain').values()

In [None]:
data

In [None]:
sns.barplot(x=list(model.get_score(importance_type='gain').keys()), y=list(model.get_score(importance_type='gain').values()))

In [None]:
pd.DataFrame(data={'a':[2,3]})

In [None]:
import xgboost as xgb

# same settings, fitted on the log-transformed target (xgb.train, not xgb.data)
model_ln = xgb.train(params, ddata_ln, 100)

In [None]:
??xgb.data

In [None]:
dtest = xgb.DMatrix(data_x[0:10000])
true = data_y[0:10000].astype(float)

In [None]:
help(model)

In [None]:
# for linear non-transformed
print(mean_squared_error(val_data_y, model.predict(val_data_x)))

print(mean_squared_error(true, np.exp(model_ln.predict(dtest))))

In [None]:
model.predict(dtest)