In [303]:
# algebra and dataframes
import numpy as np
import pandas as pd

# text processing
import re
import spacy
from gensim.models.phrases import Phrases, Phraser
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer


# hypothesis testing
from scipy.stats import gamma, kstest, lognorm, mannwhitneyu, ks_2samp
from scipy.sparse import hstack, vstack

# charts
import matplotlib.pyplot as plt
import seaborn as sns

# data processing
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer, LabelEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline, make_union
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import train_test_split
import xgboost as xgb

# measurement metrics
from sklearn.metrics import classification_report, make_scorer, mean_squared_error, mean_squared_log_error

# charting paramemters
from pylab import rcParams
rcParams.update({'font.size' : 14, 'legend.fontsize' : "small", 
                 "xtick.labelsize" : 14, "ytick.labelsize" : 14, 
                 "figure.figsize":(9, 6), "axes.titlesize" : 20,
                 "axes.labelsize" : 14, "lines.linewidth" : 3, 
                 "lines.markersize" : 10
                })

In [7]:
seed = 20

In [8]:
data = pd.read_table('../data/train.tsv')
test = pd.read_table('../data/test.tsv')

In [9]:
quick = True

if quick:
    data = data.sample(frac=.1, random_state = seed)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148254 entries, 1065093 to 786483
Data columns (total 8 columns):
train_id             148254 non-null int64
name                 148254 non-null object
item_condition_id    148254 non-null int64
category_name        147588 non-null object
brand_name           84961 non-null object
price                148254 non-null float64
shipping             148254 non-null int64
item_description     148254 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 10.2+ MB


In [11]:
data.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1065093,1065093,Kylie Metal Matte Kymajesty,1,Beauty/Makeup/Lips,,18.0,1,Brand new in box - Never Swatched! Kylie's exo...
407370,407370,Joggers camouflage,2,Kids/Boys (4+)/Bottoms,Arizona,9.0,0,Camouflage jogger pants never worn at all boy'...
688213,688213,LuLaRoe Cassie & Irma Bundle - XL/Medium,1,"Women/Skirts/Straight, Pencil",LuLaRoe,54.0,1,This gorgeous Cassie has a solid black backgro...
155188,155188,Cute Sequins Top,3,Women/Tops & Blouses/Blouse,Maurices,6.0,1,Excellent condition except a lil highlighter m...
1309673,1309673,NEW NYX eyeshadow palette avant pop,1,Beauty/Makeup/Makeup Palettes,NYX,10.0,1,Brand new and sealed NYX eyeshadow palette in ...
565424,565424,Deadrising 3 and a 3 day Xbox live card,3,Vintage & Collectibles/Electronics/Video Game,Xbox,14.0,0,It's in a pretty good condition and used but w...
1043925,1043925,FULL ZIP HOODIE,1,Women/Sweaters/Full Zip,PINK,49.0,0,BRAND NEW VS PINK FULL ZIP HOODIE PRICE IS FIRM
1309082,1309082,9months boy,3,Kids/Boys 0-24 Mos/One-Pieces,Carter's,21.0,0,Baby boy 9m cloths . No stains
966563,966563,Black Nike pro spandex shorts size XL,1,Women/Athletic Apparel/Shorts,Nike,24.0,0,Brand new with tags Black Nike pro spandex sho...
1428151,1428151,Rae Dunn Faith Mug And Blessed Bowl,1,Home/Kitchen & Dining/Coffee & Tea Accessories,Rae Dunn,26.0,0,Rae Dunn Faith mug Blessed bowl bundle


In [12]:
tbl_price = data.groupby(['item_condition_id', 'shipping'])['price'].agg([np.mean, np.median, len])
tbl_price

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,len
item_condition_id,shipping,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,32.744244,23.0,28101.0
1,1,21.445607,14.0,35924.0
2,0,29.063278,19.0,23294.0
2,1,23.734701,15.0,14282.0
3,0,27.993743,18.0,28287.0
3,1,24.061085,15.0,14881.0
4,0,27.108725,16.0,2235.0
4,1,20.142433,12.0,1011.0
5,0,33.615894,21.0,151.0
5,1,31.113636,15.0,88.0


In [13]:
(data.category_name.value_counts() > 1).sum()

904

In [14]:
tbl_price.columns

Index(['mean', 'median', 'len'], dtype='object')

In [15]:
tbl_price.index

MultiIndex(levels=[[1, 2, 3, 4, 5], [0, 1]],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
           names=['item_condition_id', 'shipping'])

In [16]:
tbl_price.loc[(3,0), :]

mean         27.993743
median       18.000000
len       28287.000000
Name: (3, 0), dtype: float64

In [17]:
data.fillna(-1).groupby('category_name')['price', 'item_condition_id'].agg([np.mean, np.median, len]).sort_values(by=('price', 'len'), ascending=False)

Unnamed: 0_level_0,price,price,price,item_condition_id,item_condition_id,item_condition_id
Unnamed: 0_level_1,mean,median,len,mean,median,len
category_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"Women/Athletic Apparel/Pants, Tights, Leggings",33.876394,29.0,5918.0,1.719838,1.0,5918
Women/Tops & Blouses/T-Shirts,19.362634,16.0,4579.0,2.070103,2.0,4579
Beauty/Makeup/Face,19.018873,15.0,3444.0,1.577526,1.0,3444
Beauty/Makeup/Lips,18.425453,14.0,2978.0,1.294829,1.0,2978
Electronics/Video Games & Consoles/Games,23.821233,16.0,2562.0,2.520297,3.0,2562
Beauty/Makeup/Eyes,15.046643,12.0,2487.0,1.293526,1.0,2487
"Electronics/Cell Phones & Accessories/Cases, Covers & Skins",13.213506,10.0,2473.0,1.489689,1.0,2473
Women/Underwear/Bras,18.637494,15.0,2091.0,2.003348,2.0,2091
Women/Tops & Blouses/Blouse,15.467804,12.0,2081.0,2.190293,2.0,2081
"Women/Tops & Blouses/Tank, Cami",14.004822,12.0,2074.0,2.101254,2.0,2074


In [18]:
data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].groupby('item_condition_id')['price'].count()

item_condition_id
1    3095
2    1449
3    1312
4      61
5       1
Name: price, dtype: int64

In [19]:
data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].groupby('item_condition_id')['price'].agg([np.mean, np.median, len])

Unnamed: 0_level_0,mean,median,len
item_condition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,38.654766,35.0,3095.0
2,30.806073,25.0,1449.0
3,26.676067,20.0,1312.0
4,19.721311,14.0,61.0
5,4.0,4.0,1.0


In [20]:
vectorizer = TfidfVectorizer( max_df = 0.99, min_df = 10, stop_words = 'english')
vectorizer2 = TfidfVectorizer(max_df = 0.2, min_df = 10, stop_words = 'english')

In [21]:
vectorizer.fit(data.item_description.fillna('unknown')[0:10000])
vectorizer2.fit(data.item_description.fillna('unknown')[0:10000])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=10,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [22]:
print(len(vectorizer.get_feature_names()))
len(vectorizer2.get_feature_names())

2208


2206

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148254 entries, 1065093 to 786483
Data columns (total 8 columns):
train_id             148254 non-null int64
name                 148254 non-null object
item_condition_id    148254 non-null int64
category_name        147588 non-null object
brand_name           84961 non-null object
price                148254 non-null float64
shipping             148254 non-null int64
item_description     148254 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 15.2+ MB


In [None]:
# description_matrix = vectorizer.fit_transform(data.item_description.fillna('unknown'))

# %%time
# name_matrix = vectorizer.fit_transform(data.name.fillna('unknown'))

In [25]:
# name_matrix.shape

In [None]:
# %%time
# nmf = NMF(n_components = 20)
# nmf.fit(name_matrix)

In [None]:
def count_len(cell):
    try: 
        if np.isnan(cell):
            return 0
    except:
        return len(cell)

In [26]:
%%time
[clean_text2(name) for name in data.name]

NameError: name 'clean_text2' is not defined

In [None]:
data['desc_lengths'] = [count_len(item) for item in data.item_description]

In [None]:
data.item_description.iloc[19]

In [None]:
import string

s = data.item_description.iloc[55]+'!'
s.translate(string.punctuation)

In [66]:
nlp = spacy.load('en', disable = ['ner'])

def regex_replace(texts, substitute = '', regex_pattern = r"[^a-zA-z' ]|'s" ):
    
    pattern = re.compile(regex_pattern)
    
    result = []
    for text in texts:
        replaced = pattern.sub(substitute, text)
        replaced = replaced.replace(r"\n", '').replace('  ', ' ').lower().strip()
        result.append(replaced)
    
    return result

def join_not(text):
    replace_these = re.findall(r'not\s+\w+', text)
    for item in replace_these:
        tmp = item.replace(' ', '_')
        text = text.replace(item, tmp)

    return text

def reg_replace(text):
        # remove HTML
    # text = BeautifulSoup(text, "html.parser").get_text()
    
    #removes remaining urls
    text = re.sub(r'https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
       
    # removes numbers, symbols, nextline symbols 
    text = re.sub(r"[^a-zA-Z' ]"," ", text)   
    text = text.replace(r"\n", '').replace('  ', ' ').lower().strip('.')
    
    # removes some possessive, needs improvement
    text = re.sub(r"'s", "", text)

    return text


def clean_text(text, join = False):
    
    #text = text.replace(r"\n", '').replace('  ', ' ').lower()
    
    if join:
        text = join_not(text)
        
    text = nlp(texts)
    
    # removes punctuation and pronouns, random words, normalizes words by lemmatization
    words=[]
    for word in text:
        if word.pos_ != 'PUNCT' and word.lemma_ != '-PRON-' and not word.is_space:
            if word not in {"x", "s", "v", "'s"}:
                words.append(word.lemma_)

                
def clean_text2(texts, join = False):
    
    if join:
        text = join_not(text)
        
    texts = nlp.pipe(texts, batch_size = 1000)
    
    # removes punctuation and pronouns, random words, normalizes words by lemmatization
    word_lists=[]
    for sent in texts:
        words = []
        for word in sent:
            if (word.pos_ != 'PUNCT' and word.lemma_ != '-PRON-') and \
            (not word.is_space and not word.is_stop):
                if word.lemma_ not in {"x", "s", "v", "'s"}:
                    words.append(word.lemma_)
        word_lists.append(words)
                
    return word_lists

def remove_stops(unigrams, removals=['not'], join = False):
    
    # removes item from set
    if len(removals) > 0:
        for item in removals:
            try:
                spacy.lang.en.STOP_WORDS.remove(item)
            except Exception as error:
                #print(error)
                pass
    
    unigrams = [word for word in unigrams if word not in spacy.lang.en.STOP_WORDS]
    
    if join:
        return ' '.join(unigrams)
    else:
        return unigrams

In [61]:
parsed = nlp('would should stop words is not a fat')

In [62]:
for word in parsed:
    print(word.is_stop)

True
True
False
False
True
True
True
False


In [54]:
'would' in spacy.lang.en.STOP_WORDS

True

In [64]:
clean_text2([r"this would not possibly be kylie's favorite thing sometimes happens if should", 'great times at fast should'])

[['possibly', 'kylie', 'favorite', 'thing', 'happen'],
 ['great', 'time', 'fast']]

In [32]:
desc_list = regex_replace(data.item_description)


In [65]:
%%time
desc_list = regex_replace(data.item_description)
item_descriptions = clean_text2(desc_list)

CPU times: user 42min 10s, sys: 1h 8min 22s, total: 1h 50min 33s
Wall time: 9min 26s


In [88]:
end = regex_replace(data.item_description.iloc[0:2])

In [99]:
data.name[0:10]

1065093                 Kylie Metal Matte Kymajesty
407370                           Joggers camouflage
688213     LuLaRoe Cassie & Irma Bundle - XL/Medium
155188                             Cute Sequins Top
1309673         NEW NYX eyeshadow palette avant pop
565424      Deadrising 3 and a 3 day Xbox live card
1043925                             FULL ZIP HOODIE
1309082                                 9months boy
966563        Black Nike pro spandex shorts size XL
1428151         Rae Dunn Faith Mug And Blessed Bowl
Name: name, dtype: object

In [109]:
data.item_description.iloc[0:10]

1065093    Brand new in box - Never Swatched! Kylie's exo...
407370     Camouflage jogger pants never worn at all boy'...
688213     This gorgeous Cassie has a solid black backgro...
155188     Excellent condition except a lil highlighter m...
1309673    Brand new and sealed NYX eyeshadow palette in ...
565424     It's in a pretty good condition and used but w...
1043925      BRAND NEW VS PINK FULL ZIP HOODIE PRICE IS FIRM
1309082                       Baby boy 9m cloths . No stains
966563     Brand new with tags Black Nike pro spandex sho...
1428151               Rae Dunn Faith mug Blessed bowl bundle
Name: item_description, dtype: object

In [62]:
clean_text2(regex_replace(desc_list[0:5]))

[['brand',
  'new',
  'in',
  'box',
  'never',
  'swatch',
  'kylie',
  "'s",
  'exotic',
  'metal',
  'matte',
  'liquid',
  'lipstick',
  'in',
  'the',
  'shade',
  'kymajesty',
  'come',
  'with',
  'the',
  'kylie',
  'shipping',
  'box',
  'but',
  'no',
  'note',
  'can',
  'provide',
  'proof',
  'of',
  'purchase',
  'in',
  'last',
  'photo',
  'price',
  'firm'],
 ['brand',
  'new',
  'in',
  'box',
  'never',
  'swatch',
  'kylie',
  "'s",
  'exotic',
  'metal',
  'matte',
  'liquid',
  'lipstick',
  'in',
  'the',
  'shade',
  'kymajesty',
  'come',
  'with',
  'the',
  'kylie',
  'shipping',
  'box',
  'but',
  'no',
  'note',
  'can',
  'provide',
  'proof',
  'of',
  'purchase',
  'in',
  'last',
  'photo',
  'price',
  'firm'],
 ['brand',
  'new',
  'in',
  'box',
  'never',
  'swatch',
  'kylie',
  "'s",
  'exotic',
  'metal',
  'matte',
  'liquid',
  'lipstick',
  'in',
  'the',
  'shade',
  'kymajesty',
  'come',
  'with',
  'the',
  'kylie',
  'shipping',
  'box',

In [31]:
%%time
item_descriptions = remove_stops(clean_text2(data.item_description))

CPU times: user 48min 19s, sys: 1h 19min 40s, total: 2h 7min 59s
Wall time: 12min 8s


In [None]:
remove_stops(clean_text2())

In [67]:
%%time
item_descriptions = [remove_stops(clean_text(item)) for item in data.item_description]

NameError: name 'texts' is not defined

In [68]:
%%time
phrase_model = Phraser(Phrases(item_descriptions))
parsed_bigrams = [' '.join(phrase_model[item_descriptions[i]]) for i in range(0,len(item_descriptions))]
description_matrix = vectorizer.fit_transform(parsed_bigrams)

CPU times: user 29.7 s, sys: 22.8 ms, total: 29.7 s
Wall time: 29.7 s


In [318]:
def column_select():
    return data['item_description']

In [321]:
func = make_pipeline(FunctionTransformer(column_select))

In [322]:
func.fit_transform(data)

ValueError: could not convert string to float: 'NWOT size 8, blue/ green stone. Stainless steel.'

In [306]:
make_union(make_pipeline(nmf), make_pipeline(svd))


TypeError: __init__() got an unexpected keyword argument 'kwargs'

In [69]:
%%time
seed = 20
nmf = NMF(n_components = 20, random_state = seed)
description_W = nmf.fit_transform(description_matrix)

svd = TruncatedSVD(n_components = 20, random_state = seed)
description_svd = svd.fit_transform(description_matrix)

CPU times: user 34.3 s, sys: 700 ms, total: 35 s
Wall time: 24.4 s


In [46]:
description_svd.shape

(148254, 20)

In [47]:
type(description_matrix)

scipy.sparse.csr.csr_matrix

In [48]:
description_W.shape

(148254, 20)

In [49]:
data.shape

(148254, 8)

In [209]:
trn_idx, test_idx = train_test_split(range(len(data)), test_size = 0.3, random_state = seed)

## Custom classes

In [301]:
from sklearn.base import BaseEstimator, TransformerMixin

# from sklearn's tutorial
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
    
class categorical_means(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        return None
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None, col_name = 'category_name', response_var = 'price'):
#         stuff = cat_means(X)
        return self.cat_means(X, col_name, response_var)
#         return stuff
    
    def tmp(self, data):
        return data
    
    def cat_means(self, data, col_name, response_var):
    
        grouped_data = data.groupby(col_name)[response_var].agg([len, np.mean, np.median])
        conversion_dict = {name:mean for name, mean in zip(grouped_data[grouped_data.len > 30].index, 
                                                       grouped_data[grouped_data.len > 30]['mean'])}
        converted = [conversion_dict[name] if name in conversion_dict else -1 for name in data[col_name]]

        return converted

In [285]:
cat = categorical_means()

In [291]:
cat.transform(data, col_name = 'brand_name', response_var = 'price')

[-1,
 -1,
 33.92621912602913,
 14.538461538461538,
 12.186708860759493,
 27.672597864768683,
 26.14925235370131,
 13.58641975308642,
 31.509121370067014,
 39.26878130217028,
 -1,
 46.747126436781606,
 -1,
 -1,
 23.19957627118644,
 27.807349665924278,
 -1,
 31.509121370067014,
 26.14925235370131,
 26.14925235370131,
 -1,
 -1,
 -1,
 23.19957627118644,
 -1,
 -1,
 -1,
 32.37905236907731,
 26.14925235370131,
 13.905882352941177,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 12.967824967824967,
 12.534883720930232,
 -1,
 69.18833044482957,
 36.886792452830186,
 -1,
 -1,
 -1,
 16.165754495699765,
 12.536332179930795,
 31.509121370067014,
 -1,
 -1,
 -1,
 -1,
 49.09090909090909,
 -1,
 35.18950064020486,
 -1,
 13.58641975308642,
 52.923857868020306,
 32.34353268428373,
 -1,
 46.29882677708765,
 -1,
 33.92621912602913,
 -1,
 -1,
 86.75352112676056,
 69.18833044482957,
 50.65217391304348,
 -1,
 -1,
 -1,
 -1,
 33.92621912602913,
 16.447552447552447,
 49.54838709677419,
 -1,
 69.18833044482957,
 87.

In [295]:
make_pipeline(cat)

Pipeline(memory=None, steps=[('categorical_means', categorical_means())])

In [180]:
data['category_means'] = cat_group_means(data)

In [190]:

grouped_data = data.groupby('brand_name')['price'].agg([len, np.mean, np.median])

In [198]:
cat_group_means(data, col_name = 'brand_name')

[-1,
 -1,
 33.92621912602913,
 14.538461538461538,
 12.186708860759493,
 27.672597864768683,
 26.14925235370131,
 13.58641975308642,
 31.509121370067014,
 39.26878130217028,
 -1,
 46.747126436781606,
 -1,
 -1,
 23.19957627118644,
 27.807349665924278,
 -1,
 31.509121370067014,
 26.14925235370131,
 26.14925235370131,
 -1,
 -1,
 -1,
 23.19957627118644,
 -1,
 -1,
 -1,
 32.37905236907731,
 26.14925235370131,
 13.905882352941177,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 12.967824967824967,
 12.534883720930232,
 -1,
 69.18833044482957,
 36.886792452830186,
 -1,
 -1,
 -1,
 16.165754495699765,
 12.536332179930795,
 31.509121370067014,
 -1,
 -1,
 -1,
 -1,
 49.09090909090909,
 -1,
 35.18950064020486,
 -1,
 13.58641975308642,
 52.923857868020306,
 32.34353268428373,
 -1,
 46.29882677708765,
 -1,
 33.92621912602913,
 -1,
 -1,
 86.75352112676056,
 69.18833044482957,
 50.65217391304348,
 -1,
 -1,
 -1,
 -1,
 33.92621912602913,
 16.447552447552447,
 49.54838709677419,
 -1,
 69.18833044482957,
 87.

295

In [184]:
data.shape

(148254, 9)

In [186]:
data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_means
1065093,1065093,Kylie Metal Matte Kymajesty,1,Beauty/Makeup/Lips,,18.0,1,Brand new in box - Never Swatched! Kylie's exo...,18.425453
407370,407370,Joggers camouflage,2,Kids/Boys (4+)/Bottoms,Arizona,9.0,0,Camouflage jogger pants never worn at all boy'...,16.304029
688213,688213,LuLaRoe Cassie & Irma Bundle - XL/Medium,1,"Women/Skirts/Straight, Pencil",LuLaRoe,54.0,1,This gorgeous Cassie has a solid black backgro...,23.91746
155188,155188,Cute Sequins Top,3,Women/Tops & Blouses/Blouse,Maurices,6.0,1,Excellent condition except a lil highlighter m...,15.467804
1309673,1309673,NEW NYX eyeshadow palette avant pop,1,Beauty/Makeup/Makeup Palettes,NYX,10.0,1,Brand new and sealed NYX eyeshadow palette in ...,24.892575


In [185]:
pd.concat([pd.DataFrame(description_W), pd.DataFrame(description_svd), data.reset_index(drop=True)], axis = 1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_means
0,5.948241e-06,0.000000e+00,0.016730,0.000000,0.000000,0.000000,0.002280,0.000000,0.000089,0.000000,...,-0.011204,1065093,Kylie Metal Matte Kymajesty,1,Beauty/Makeup/Lips,,18.0,1,Brand new in box - Never Swatched! Kylie's exo...,18.425453
1,4.031833e-06,1.057901e-02,0.000000,0.000000,0.019806,0.000171,0.003199,0.000060,0.001058,0.000000,...,-0.000736,407370,Joggers camouflage,2,Kids/Boys (4+)/Bottoms,Arizona,9.0,0,Camouflage jogger pants never worn at all boy'...,16.304029
2,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.027875,0.000000,0.000530,0.000223,...,0.083694,688213,LuLaRoe Cassie & Irma Bundle - XL/Medium,1,"Women/Skirts/Straight, Pencil",LuLaRoe,54.0,1,This gorgeous Cassie has a solid black backgro...,23.917460
3,0.000000e+00,0.000000e+00,0.000000,0.000000,0.002486,0.000783,0.002675,0.000290,0.000000,0.001962,...,0.045482,155188,Cute Sequins Top,3,Women/Tops & Blouses/Blouse,Maurices,6.0,1,Excellent condition except a lil highlighter m...,15.467804
4,0.000000e+00,0.000000e+00,0.029606,0.000902,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.004855,1309673,NEW NYX eyeshadow palette avant pop,1,Beauty/Makeup/Makeup Palettes,NYX,10.0,1,Brand new and sealed NYX eyeshadow palette in ...,24.892575
5,0.000000e+00,0.000000e+00,0.000048,0.000000,0.000000,0.067948,0.002866,0.000452,0.000000,0.000225,...,0.037116,565424,Deadrising 3 and a 3 day Xbox live card,3,Vintage & Collectibles/Electronics/Video Game,Xbox,14.0,0,It's in a pretty good condition and used but w...,25.758755
6,1.269847e-06,1.827034e-04,0.043298,0.000455,0.000000,0.000000,0.001443,0.000001,0.001055,0.000000,...,0.000709,1043925,FULL ZIP HOODIE,1,Women/Sweaters/Full Zip,PINK,49.0,0,BRAND NEW VS PINK FULL ZIP HOODIE PRICE IS FIRM,26.167203
7,1.359386e-06,7.490856e-06,0.000000,0.000018,0.000325,0.001744,0.001246,0.001030,0.000000,0.000000,...,0.008084,1309082,9months boy,3,Kids/Boys 0-24 Mos/One-Pieces,Carter's,21.0,0,Baby boy 9m cloths . No stains,14.804067
8,2.116197e-06,9.709937e-03,0.020757,0.000000,0.000000,0.000134,0.000000,0.000569,0.023021,0.000000,...,-0.089748,966563,Black Nike pro spandex shorts size XL,1,Women/Athletic Apparel/Shorts,Nike,24.0,0,Brand new with tags Black Nike pro spandex sho...,18.583968
9,0.000000e+00,0.000000e+00,0.000250,0.000014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.047166,1428151,Rae Dunn Faith Mug And Blessed Bowl,1,Home/Kitchen & Dining/Coffee & Tea Accessories,Rae Dunn,26.0,0,Rae Dunn Faith mug Blessed bowl bundle,30.449449


In [70]:
pd.concat([pd.DataFrame(description_W), pd.DataFrame(description_svd), data.reset_index(drop=True)], axis = 1).to_csv('../processed/converted_array.csv')

In [None]:
data.info()

In [None]:
np.isnan('arg')

In [None]:
clean_text(np.nan)

In [None]:

data['cleaned_brands'] = [clean_text(item) for item in data.brand_name.fillna('unknown')]

In [None]:
bigram_dict = {}
for item, score in phrases:
    bigram_dict[item] = score

In [None]:
bigram_dict

## Entity recognition example

In [None]:
doc = """The White House climb-down from President Donald Trump’s disastrous news conference with his Russian counterpart Vladimir Putin began Monday night, just hours after Trump said he saw “no reason” why Russia would have meddled in the 2016 election.

With even The Wall Street Journal’s editorial board — normally intensely loyal to Trump — joining in widespread criticism of the president’s implicit public rejection of U.S. intelligence claims, the White House circulated talking points to supporters saying that Trump still had great confidence in his intelligence agencies and that he believed their assessment that the Kremlin actively influenced the vote.

But the president himself emerged on Tuesday to personally walk back his statements in Helsinki, using a scheduled meeting with members of Congress to discuss tax reform as a platform for revising the statements that set off a 24-hour firestorm.

“In a key sentence in my remarks, I said the word ‘would’ instead of ‘wouldn’t,’” Trump said. “The sentence should have been — and I thought it would be maybe a little bit unclear on the transcript or unclear on the actual video — the sentence should have been: I don’t see any reason why it wouldn't be Russia. Sort of a double negative.”

"""

In [None]:
doc = nlp(doc)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
sns.distplot(data.desc_lengths)

In [None]:
sns.distplot(sample)

In [None]:
?lognorm

In [None]:
# fit a gamma dist
fit_alpha, fit_loc, fit_beta=gamma.fit(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price)
print(fit_alpha, fit_loc, fit_beta)

In [None]:
gamma.fit(list(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price))

In [None]:
lognorm._fitstart(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price)

In [None]:
lognorm.fit(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price)

In [None]:
?lognorm.fit

In [None]:
?kstest

In [None]:
ks_2samp(np.array(data.price[data.brand_name.isnull()]), np.array(data.price[data.brand_name.notnull()]))

In [None]:
# non-parametric indepedent samples
mannwhitneyu(data.price[data.brand_name.isnull()], 
                       data.price[data.brand_name.notnull()])

In [None]:
sns.distplot(data[data.category_name=='Women/Athletic Apparel/Pants, Tights, Leggings'].price, bins = 100)
plt.xlim(0, 200)

In [None]:
pd.pivot_table(data, values = 'price', index = 'brand_name', columns = 'shipping', aggfunc=(np.median, len)).columns

In [None]:
# tuple to select multi-level index
pd.pivot_table(data, values = 'price', index = 'brand_name', columns = 'shipping', aggfunc=(np.median, len))[('len', 0)]

In [None]:
brand_tbl = pd.pivot_table(data, values = 'price', index = 'brand_name', aggfunc=('mean', np.median, len), dropna=False).sort_values('len', ascending=False)
brand_tbl[brand_tbl.len > 30].sort_values('median', ascending=False)

In [None]:
data.brand_name.isnull().sum()

In [None]:
data.brand_name.value_counts(dropna=False, normalize=True)

In [None]:
null_proportion = data.brand_name.value_counts(dropna=False, normalize=True)
1- data.brand_name.value_counts(dropna=False, normalize=True)[0]

In [None]:
data['brand_name_null'] = data.brand_name.isnull()*1
sns.kdeplot(data.price[data.brand_name.notnull()])

In [None]:


fig, ax = plt.subplots(sharex=True, sharey=True)

for data, label in zip([data.price[data.brand_name.isnull()], 
                       data.price[data.brand_name.notnull()]], 
                      ["0", "1"]):
     sns.kdeplot(data,  ax=ax, label = label, shade = True)
     #sns.distplot(data,  ax=ax, label = label)
   
#ax.set_xlim([-5, 250])

plt.show()

In [None]:
data.price.describe()

In [None]:
np.median(data.price)

In [None]:
pd.pivot_table(data.fillna(-1), values = 'price', index = 'brand_name', aggfunc=(np.median, len), dropna=False).sort_values('len', ascending=False)

In [None]:
# picks categorical columns, check for diff datasets
def get_categoricals(data):

    categorical_columns = []
    for dtype, idx in zip(data.dtypes, data.dtypes.index):
        if dtype == 'object':
            categorical_columns.append(idx)
    return categorical_columns

def encode_column(train_column, test_column, fillna = False):
    
    le = LabelEncoder()
    
    # le does not work with nan
    train_column = train_column.fillna('unknown')
    test_column = test_column.fillna('unknown')
    le.fit(train_column)
    train_column_le = le.transform(train_column)
    test_column_le = le.transform(test_column)
    translation_dict = {i:item for i, item in enumerate(le.classes_)}
    return train_column_le, test_column_le, translation_dict

# categorical to column arrays wide
def label_convert(train_column, test_column):
    
    encoder = LabelBinarizer()
    label_model = encoder.fit(train_column.fillna('unknown'))
    converted_train = label_model.transform(train_column.fillna('unknown'))
    converted_test = label_model.transform(test_column.fillna('unknown'))
    translation_dict = {i:item for i, item in enumerate(label_model.classes_)}
    return converted_train, converted_test, translation_dict

# keep cats in one column
def label_convert_cols(train, test):
    cols = get_categoricals(train)
    translation_dicts = {}
    train_column_stacks = []
    test_column_stacks = []
    for item in cols:
        train_column_labels, test_column_labels, translation_dict = encode_column(train[item], test[item])
        train_column_stacks.append(train_column_labels)
        test_column_stacks.append(test_column_labels)
        translation_dicts[item] = translation_dict
    train_arrays = np.vstack(train_column_stacks).T
    test_arrays = np.vstack(test_column_stacks).T
    return translation_dicts, train_arrays, test_arrays 

# converts all categorical columns into sparse format
def convert_columns(train, test, cols):
    
    translation_dicts = {}
    converted_train_arrays = []
    converted_test_arrays = []
    for column in cols:
        converted_train, converted_test, translation_dict = label_convert(train[column], test[column])
        translation_dicts[column] = translation_dict
        converted_train_arrays.append(converted_train)
        converted_test_arrays.append(converted_test)
    stacked_train_arrays = np.hstack(converted_train_arrays)
    stacked_test_arrays = np.hstack(converted_test_arrays)
    return translation_dicts, stacked_train_arrays, stacked_test_arrays

def process_categoricals(train, test):

    columns = get_categoricals(train)
    category_dicts, train_arrays, test_arrays = convert_columns(train, test, columns)
    return category_dicts, train_arrays, test_arrays

def combine_num_cats(train, test, wide=True):

    # combines numerical and categorical columns    
    
    column_dict = {}
    cols = get_categoricals(train)
    num_cols = [item for item in train.columns if item not in cols]
    for item in num_cols:
        column_dict[item]='numerical_column'
    if wide:    
        category_dicts, train_arrays, test_arrays = process_categoricals(train, test)
    else:
        category_dicts, train_arrays, test_arrays = label_convert_cols(train, test)

    # ** unpacks dict and allows extending dict
    combined_dict = {**column_dict, **category_dicts}
    train_matrix = np.hstack([train[num_cols].values, train_arrays])
    test_matrix = np.hstack([test[num_cols].values, test_arrays])
    
    return combined_dict, train_matrix, test_matrix


# needs fix for columns of 0 or 1 like sex# needs f 
def flatten_cols(column_dict):
    
    flattened_columns = {}
    f = 0
    
    for item in column_dict.items():    
        if type(item[1]) is dict:
            if len(item[1]) > 2:
                for name in item[1].values():
                    flattened_columns.update({f:item[0] + '_' + name})
                    f = f + 1
            else:
                flattened_columns.update({f:item[0]+ '_' + list(item[1].values())[1]})
                f = f + 1
        else:
            flattened_columns.update({f:item[0]})
            f = f + 1
    return flattened_columns

# def encode_column(column, fillna = False):
    
#     le = LabelEncoder()
    
#     # le does not work with nan, replace with string unknown
#     column = column.fillna('unknown')
#     le.fit(column)
#     column_le = le.transform(column)
    
#     if fillna == False:
#         # replace unknown with nan, nan must replace float value
#         column_le = column_le.astype('float')
#         idx = [i for i, value in enumerate(column_le) if column_le[i]=='unknown']
#         column_le[idx] = np.nan

#     zip_obj = zip(list(column), column_le)
#     return column_le, dict(zip_obj)

In [None]:
data.shape

In [None]:
brand_name_miss = {name for name in data.brand_name if name not in test.brand_name}

In [None]:
len(brand_name_miss)

In [None]:
brands_le, brand_dict = encode_column(data.brand_name, test.brand_name, fillna = False)
category_le, category_dict = encode_column(data.category_name, test.category_name, fillna = False)

In [None]:
data['brands_le'] = brands_le
data['category_le'] = category_le

In [None]:
data_x = data[['item_condition_id', 'category_le', 'brands_le', 'shipping']].as_matrix()
data_y = data.price.values

In [None]:
np.array(data.price)

In [None]:
?xgb.DMatrix

In [None]:
np.isnan(data_x).sum()

In [None]:
np.any(1)

In [None]:
def log_fix(array):
    
    tmp = np.zeros(len(array))
    
    for i, number in enumerate(array):
        if not np.any(number):
            tmp[i] = 0
        else:
            tmp[i] = np.log(number)
    return tmp        
        

In [None]:
data_y_ln = log_fix(data_y)

In [None]:
data_y_ln[0:10]

In [None]:
# dataing data splitting function
# def data_split(data, proportion = .2):
#     cutoff = round(len(data) * (1-proportion))
#     return data[0:cutoff], data[cutoff:]

train_x, val_x = train_test_split(data_x, data_y, test_size = .3, random_state = 42)
train_y, val_y = data_split(data_x, data_y_ln, test_size = .3)


In [None]:
train_x = xgb.DMatrix(train_x, train_y)
train_ln = xgb.DMatrix(train_x, train_y_ln)

In [None]:
np.count_nonzero(~np.isnan(data_x))


In [35]:
params = {'min_child_weight': 20, 'max_depth': 7,
            'subsample': 0.91, 'lambda': 2.01, 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear','tree_method': 'auto'}

model = xgb.data(params, ddata, 100)



AttributeError: module 'xgboost' has no attribute 'data'

In [None]:
model.get_score(importance_type='gain').values()

In [None]:
data

In [None]:
sns.barplot(x=list(model.get_score(importance_type='gain').keys()), y=list(model.get_score(importance_type='gain').values()))

In [None]:
pd.DataFrame(data={'a':[2,3]})

In [None]:
model_ln=xgb.data(params, ddata_ln, 100)

In [None]:
??xgb.data

In [None]:
dtest = xgb.DMatrix(data_x[0:10000])
true = data_y[0:10000].astype(float)

In [None]:
help(model)

In [None]:
# for linear non-transformed
print(mean_squared_error(val_data_y, model.predict(val_data_x)))

print(mean_squared_error(true, np.exp(model_ln.predict(dtest))))

In [None]:
model.predict(dtest)