In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)


from scipy import stats
from scipy.stats import norm, skew #for some statistics


pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points

In [2]:
"""
USEFUL PLOT FUNCTIONS

plot_histograms(df,variables,n_rows,n_cols)
plot_distribution(df,var,target,**kwargs)
plot_categories(df,cat,target,**kwargs)
plot_correlation_map(df)
describe_more(df)
plot_variable_importance(x,y)
plot_model_var_imp(model,x,y)

"""

def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , ) ) ) # + ' ' + var_name ) #var_name+" Distribution")
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    var = [] ; l = [] ; t = []
    for x in df:
        var.append( x )
        l.append( len( pd.value_counts( df[ x ] ) ) )
        t.append( df[ x ].dtypes )
    levels = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )
    levels.sort_values( by = 'Levels' , inplace = True )
    return levels

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp[ : 10 ].plot( kind = 'barh' )
    print (model.score( X , y ))

In [3]:
train = pd.read_csv("sales_train.csv")
test = pd.read_csv("test.csv")
items = pd.read_csv("items.csv")
itemsCat = pd.read_csv("item_categories.csv")
shops = pd.read_csv("shops.csv")

In [4]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [54]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [60]:
itemsCat.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [61]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [16]:
negativeNums = train[train["item_price"] <= 0]
negativeNums

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
484683,15.05.2013,4,32,2973,-1.0,1.0


In [5]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [51]:
negativeNums = train[train["item_cnt_day"] <= 0]
len(negativeNums)

7356

In [103]:
#I have to combine the type of category each item is to the item
train = train.merge(items, left_on='item_id', right_on='item_id', how='outer')
train = train.merge(itemsCat, left_on='item_category_id', right_on='item_category_id', how='outer')
train = train.merge(shops, left_on='shop_id', right_on='shop_id', how='outer')
train.drop(["item_id","item_category_id","shop_id"],axis=1, errors="ignore",inplace=True)
train.head()

Unnamed: 0,date,date_block_num,item_price,item_cnt_day,item_name,item_category_name,shop_name
0,02.01.2013,0.0,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
1,26.04.2013,3.0,150.0,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
2,26.06.2013,5.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
3,20.07.2013,6.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
4,14.09.2013,8.0,299.0,2.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""


In [75]:
train.item_category_name.unique()

array(['Кино - Blu-Ray', 'Музыка - Винил',
       'Музыка - CD фирменного производства', 'Музыка - Музыкальное видео',
       'Музыка - CD локального производства', 'Игры - XBOX 360',
       'Игры - PS3', 'Игры PC - Дополнительные издания',
       'Игры PC - Стандартные издания', 'Игры - PSP', 'Кино - DVD',
       'Программы - Для дома и офиса', 'Книги - Методические материалы 1С',
       'Игры PC - Коллекционные издания', 'Игры - PSVita',
       'Подарки - Развитие', 'Программы - 1С:Предприятие 8',
       'Программы - Обучающие', 'Музыка - MP3',
       'Музыка - Подарочные издания', 'Аксессуары - PSP',
       'Подарки - Гаджеты, роботы, спорт', 'Книги - Аудиокниги',
       'Игровые консоли - XBOX 360', 'Аксессуары - PS3',
       'Аксессуары - PS4', 'Аксессуары - PSVita', 'Карты оплаты - PSN',
       'Карты оплаты - Live!', 'Аксессуары - XBOX 360',
       'Кино - Blu-Ray 3D', 'Игры - Аксессуары для игр',
       'Игровые консоли - PSVita', 'Книги - Аудиокниги 1С',
       'Кино - Коллекц

google translate:

'Cinema - Blu-Ray', 'Music - Vinyl',
       'Music - CD of brand production', 'Music - Music video',
       'Music - CD of local production', 'Games - XBOX 360',
       'Games - PS3', 'PC Games - Additional Publications',
       'PC Games - Standard Edition', 'Games - PSP', 'Cinema - DVD',
       'Programs - For Home and Office', 'Books - Methodical Materials 1C',
       'PC games - Collector's editions', 'Games - PSVita',
       'Gifts - Development', 'Programs - 1C: Enterprise 8',
       'Programs - Teaching', 'Music - MP3',
       'Music - Gift Edition', 'Accessories - PSP',
       'Gifts - Gadgets, robots, sports', 'Books - Audiobooks',
       'Game consoles - XBOX 360', 'Accessories - PS3',
       'Accessories - PS4', 'Accessories - PSVita', 'Payment cards - PSN',
       'Payment cards - Live!', 'Accessories - XBOX 360',
       'Cinema - Blu-Ray 3D', 'Games - Accessories for games',
       'Game consoles - PSVita', 'Books - Audiobooks 1C',
       'Cinema - Collectible', 'Gifts - Postcards, stickers',
       'Game consoles - PS3', 'Gifts - Souvenirs',
       'Gifts - Board games (compact)',
       'Gifts - Board games', 'Official',
       'Gifts - Soft Toys', 'Gifts - Souvenirs (in a sample)',
       'Game consoles - PSP', 'Batteries', 'Gifts - Figures',
       'Gifts - Attributes', 'Gifts - Bags, Albums, Mouse Rugs',
       'Payment cards (Movies, Music, Games)', 'Movies - Blu-Ray 4K',
       'Game consoles - PS4', 'Games - PS4', 'Game consoles - Other',
       'Tickets (Number)', 'Game Consoles - XBOX ONE', 'Games - XBOX ONE',
       'Accessories - XBOX ONE', 'Books - Comics, manga',
       'Books - Artbooks, encyclopedias', 'Official - Tickets',
       'Gifts - Certificates, services', 'Clean media (piece)',
       'Pure media (spire)', 'PC - Headsets / Headphones',
       'Books - Business Literature', 'Books - Fiction',
       'Games - PS2', 'Books - Cognitive Literature',
       'Books - Computer Literature', 'Accessories - PS2',
       'Books - Postcards', 'Delivery of goods', 'Game consoles - PS2',
       'Books - Guides', 'PC Games - Number',
       'Programs - Home and Office (Figure)',
       'Programs - Teaching (Number)', 'Books - Number',
       'Books - Audiobooks (Figure)', 'Games MAC - Number',
       'Payment cards - Windows (Number)', 'Payment cards - Live! (Numeral)',
       'Programs - MAC (Number)', 'Android games - Number'

Each item is split up into two types of categories.

I should also note that someone buying a ps2 might also buy its accessories.
format: broad category, sub category

How do I deal with:
'Gifts - Board games (compact)',
'Gifts - Board games', 'Official',
and
'Music - CD of brand production'
'Music - CD of local production'

In [108]:
import math
def get_second(strings):
    if isinstance(strings, str):
        temp = strings.split(" - ")
        if len(temp) > 1:
            return temp[1].strip()
        return "NA"
    return "NA"
def get_first(strings):
    if isinstance(strings, str):
        temp = strings.split(" - ")
        return temp[0].strip()
    return "NA"

In [109]:
train["cat1"] = train.item_category_name.map(lambda x: x.split(" - ")[0].strip())
train["cat2"] = train.item_category_name.map(lambda x: get_second(x))

In [130]:
train.head()

Unnamed: 0,date,date_block_num,item_price,item_cnt_day,item_name,item_category_name,shop_name,cat1,cat2
0,02.01.2013,0.0,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray
1,26.04.2013,3.0,150.0,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray
2,26.06.2013,5.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray
3,20.07.2013,6.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray
4,14.09.2013,8.0,299.0,2.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray


In [110]:
test = test.merge(items, left_on='item_id', right_on='item_id', how='outer')
test = test.merge(itemsCat, left_on='item_category_id', right_on='item_category_id', how='outer')
test = test.merge(shops, left_on='shop_id', right_on='shop_id', how='outer')
test.drop(["item_id","item_category_id","shop_id"],axis=1, errors="ignore",inplace=True)
test["cat1"] = test.item_category_name.map(lambda x: get_first(x))
test["cat2"] = test.item_category_name.map(lambda x: get_second(x))
test.head()

Unnamed: 0,ID,item_name,item_category_name,shop_name,cat1,cat2
0,0.0,"NHL 15 [PS3, русские субтитры]",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3
1,2.0,"Need for Speed Rivals (Essentials) [PS3, русск...",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3
2,15.0,"Minecraft. Playstation 3 Edition [PS3, русская...",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3
3,21.0,"NBA 2K16 [PS3, русская документация]",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3
4,23.0,"Plants vs. Zombies Garden Warfare [PS3, русска...",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3


In [118]:
print(str(len(test[test.cat1.map(lambda x: x == "NA")])*100/len(test))+"%")
print(str(len(test[test.cat2.map(lambda x: x == "NA")])*100/len(test))+"%")
print(str(len(train[train.cat2.map(lambda x: x == "NA")])*100/len(train))+"%")

0.007782504928919788%
0.26330808342845285%
1.035449756352743%


In [119]:
train.shop_name.unique()

array(['Ярославль ТЦ "Альтаир"', 'Москва ТК "Буденовский" (пав.К7)',
       'Москва ТЦ "МЕГА Белая Дача II"', 'Москва ТРК "Атриум"',
       'Воронеж (Плехановская, 13)', 'Калуга ТРЦ "XXI век"',
       'Воронеж ТРЦ "Максимир"', 'Москва ТЦ "Семеновский"',
       'Химки ТЦ "Мега"', 'СПб ТК "Невский Центр"', 'Омск ТЦ "Мега"',
       'Новосибирск ТЦ "Мега"', 'Сергиев Посад ТЦ "7Я"',
       'Самара ТЦ "Мелодия"', 'Тюмень ТЦ "Зеленый Берег"',
       'Коломна ТЦ "Рио"', '!Якутск ТЦ "Центральный" фран',
       '!Якутск Орджоникидзе, 56 фран', 'Москва ТЦ "МЕГА Теплый Стан" II',
       'Якутск Орджоникидзе, 56', 'Москва ТК "Буденовский" (пав.А2)',
       'Якутск ТЦ "Центральный"', 'Чехов ТРЦ "Карнавал"',
       'Н.Новгород ТРЦ "Фантастика"', 'Сургут ТРЦ "Сити Молл"',
       'Москва МТРЦ "Афи Молл"', 'Москва Магазин С21',
       'Курск ТЦ "Пушкинский"', 'Красноярск ТЦ "Июнь"',
       'Воронеж ТРЦ Сити-Парк "Град"', 'Москва ТЦ "Перловский"',
       'РостовНаДону ТЦ "Мега"', 'Самара ТЦ "ПаркХаус"',


google translate:

'Yaroslavl ТЦ "Альтаир"', 'Moscow ТК "Буденовский" (пав.К7)',
       'Moscow TC "MEGA Belaya Dacha II", Moscow TRK Atrium,
       'Voronezh (Plekhanovskaya, 13)', 'Kaluga SEC "XXI century",
       'Voronezh SEC "Maksimir"', 'Moscow TC "Semenovsky"',
       'Khimki TC Mega', SPb TC Nevsky Center, Omsk TC Mega,
       'Novosibirsk ТЦ "Мега"', 'Sergiev Posad ТЦ "7Я"',
       'Samara ТЦ "Melody"', 'Tyumen ТЦ "Green Coast"',
       'Kolomna shopping center "Rio", Yakutsk Shopping center "Central" fr',
       '! Yakutsk Ordzhonikidze, 56 fr', 'Moscow TC "MEGA Teply Stan" II',
       'Yakutsk Ordzhonikidze, 56', 'Moscow TC Budenovskiy (Pavilion A2)',
       'Yakutsk TC "Central", Chekhov SEC "Carnival",
       'N.Novgorod SEC "Fantastica", Surgut SEC "City Mall",
       'Moscow МТРЦ "Афи Молл"', 'Москва Магазин С21',
       'Kursk TC "Pushkinsky", "Krasnoyarsk TC" June ",
       'Voronezh Shopping and entertainment center City-Park Grad, Moscow Transform center,
       'RostovNaDonu TC "Mega"', 'Samara Shopping Center' ParkHaus '',
       'Moscow ТЦ "Серебряный Дом"', 'Moscow ТЦ "Новый век" (Новокосино)',
       'Tyumen TC Goodwin', Ufa TC Family 2, SPb TK Sennaya,
       'Krasnoyarsk shopping center "Vzletka Plaza", Adygeya shopping center Mega,
       'Ufa TK "Tsentralny", Kazan "TC" Behetle ",
       Kazan shopping center ParkHouse II, Balashikha TRK October-Kinomir,
       'Zhukovsky st. Chkalov 39m? ',' Online store ChS ',
       'Moscow ТЦ "Areal" (Belyaevo)', 'Vologda ТРЦ "Marmalade"',
       Volzhsky shopping center Volga Moll, Rostov-na-Donu shopping and entertainment center Megacenter Horizon,
       'Tomsk SEC "Emerald City", "Tyumen SEC" Crystal ",
       'N.Novgorod TRC "RIO"', Zhukovsky st. Chkalov 39m²,
       'Novosibirsk TRC "Gallery Novosibirsk", Mytischi TRK XL-3,
       'Outbound Trade',
       'RostovNaDonu TRK "Megacenter Horizon" Island ", nan,
       'Moscow' Sale '', 'Digital warehouse 1C-Online'
       
notable mentions:

can group Digital warehouse 1C-Online **with** Online store ChS

can group all shops Moscow, Voronezh, Yakutsk, SEC, TC, TRK, ТЦ

can group Voronezh Shopping and entertainment center **with** Rostov-na-Donu shopping and entertainment center

can group Rostov-na-Donu shopping and entertainment center **with** RostovNaDonu TRK "Megacenter Horizon" Island "


I have an idea: I get a bag of all the unique words. Then I one hot encode everything and tick off the columns that match the OHE instead of manually coding each case

In [131]:
train_store_names = train.shop_name
test_store_names = test.shop_name
all_shop_names = []
for x in train_store_names + test_store_names:
    if isinstance(x, str):
        temp = x.split(" ")
        for a in temp:
            all_shop_names.append(a)
all_shop_names = list(set(all_shop_names))
all_shop_names[:7]

['II"Чехов',
 '"Гудвин"',
 '"Атриум"Жуковский',
 '"Волга',
 'Стан"',
 '"Новый',
 'Магазин']

In [135]:
#making a shop_cat variable

for x in all_shop_names:
    train[x] = 0
train.head()

Unnamed: 0,date,date_block_num,item_price,item_cnt_day,item_name,item_category_name,shop_name,cat1,cat2,"II""Чехов",...,"""МЕГА","Орджоникидзе,",Москва,"""Альтаир""",ТРК,"""Атриум""Воронеж","""Центральный""","""Сенная""",ТРЦ,фран
0,02.01.2013,0.0,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,0,0,0,0,0,0,0,0,0,0
1,26.04.2013,3.0,150.0,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,0,0,0,0,0,0,0,0,0,0
2,26.06.2013,5.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,0,0,0,0,0,0,0,0,0,0
3,20.07.2013,6.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,0,0,0,0,0,0,0,0,0,0
4,14.09.2013,8.0,299.0,2.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
copy_train = train
for index, row in copy_train.iterrows():
    the_shop_names = row["shop_name"]
    if isinstance(the_shop_names, str):
        the_shop_names = the_shop_names.split(" ")
        for x in the_shop_names:
            train.set_value(index,x, 1)
copy_train.head()

Unnamed: 0,date,date_block_num,item_price,item_cnt_day,item_name,item_category_name,shop_name,cat1,cat2,"II""Чехов",...,Казань,Балашиха,Жуковский,Интернет-магазин,Вологда,Волжский,Томск,Мытищи,Выездная,Цифровой
0,02.01.2013,0.0,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,,,,,,,,,,
1,26.04.2013,3.0,150.0,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,,,,,,,,,,
2,26.06.2013,5.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,,,,,,,,,,
3,20.07.2013,6.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,,,,,,,,,,
4,14.09.2013,8.0,299.0,2.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,0,...,,,,,,,,,,


In [141]:
copy_test = test
for index, row in copy_test.iterrows():
    the_shop_names = row["shop_name"]
    if isinstance(the_shop_names, str):
        the_shop_names = the_shop_names.split(" ")
        for x in the_shop_names:
            test.set_value(index,x, 1)
copy_test.head()

Unnamed: 0,ID,item_name,item_category_name,shop_name,cat1,cat2,Вологда,ТРЦ,"""Мармелад""",Волжский,...,"""Перловский""","""Серебряный","Дом""",Мытищи,"""XL-3""",Островной,"""Сенная""","""Зеленый","Берег""",Химки
0,0.0,"NHL 15 [PS3, русские субтитры]",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3,1.0,1.0,1.0,,...,,,,,,,,,,
1,2.0,"Need for Speed Rivals (Essentials) [PS3, русск...",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3,1.0,1.0,1.0,,...,,,,,,,,,,
2,15.0,"Minecraft. Playstation 3 Edition [PS3, русская...",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3,1.0,1.0,1.0,,...,,,,,,,,,,
3,21.0,"NBA 2K16 [PS3, русская документация]",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3,1.0,1.0,1.0,,...,,,,,,,,,,
4,23.0,"Plants vs. Zombies Garden Warfare [PS3, русска...",Игры - PS3,"Вологда ТРЦ ""Мармелад""",Игры,PS3,1.0,1.0,1.0,,...,,,,,,,,,,


In [143]:
copy_train.to_csv("copy_train.csv")
copy_test.to_csv("copy_train.csv")

In [3]:
a = pd.read_csv("copy_train.csv")
b = pd.read_csv("copy_train.csv")

In [4]:
a.head()

Unnamed: 0.1,Unnamed: 0,date,date_block_num,item_price,item_cnt_day,item_name,item_category_name,shop_name,cat1,cat2,...,Казань,Балашиха,Жуковский,Интернет-магазин,Вологда,Волжский,Томск,Мытищи,Выездная,Цифровой
0,0,02.01.2013,0.0,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,...,,,,,,,,,,
1,1,26.04.2013,3.0,150.0,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,...,,,,,,,,,,
2,2,26.06.2013,5.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,...,,,,,,,,,,
3,3,20.07.2013,6.0,199.5,1.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,...,,,,,,,,,,
4,4,14.09.2013,8.0,299.0,2.0,2012 (BD),Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",Кино,Blu-Ray,...,,,,,,,,,,


## Dates

cool link I found that describes how to do EDA with dates: https://datascience.stackexchange.com/questions/2368/machine-learning-features-engineering-from-date-time-data

I have to dummy the item_category_id
should I remove the months that aren't november? bc there is black friday that'll skew results
also cyber monday. It'll be interesting to see which shops 
it also seems like the data is pretty filled
I cna't just have a simgle model for every item because that doesn't keep into account the overall patterns of the month

In [24]:
train['year']=[d.split('.')[2] for d in train.date]
train['month']=[d.split('.')[1] for d in train.date]
train['day']=[d.split('.')[0] for d in train.date]

In [25]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day
0,02.01.2013,0,59,22154,999.0,1.0,2013,1,2
1,03.01.2013,0,25,2552,899.0,1.0,2013,1,3
2,05.01.2013,0,25,2552,899.0,-1.0,2013,1,5
3,06.01.2013,0,25,2554,1709.05,1.0,2013,1,6
4,15.01.2013,0,25,2555,1099.0,1.0,2013,1,15


In [26]:
train.drop("date",inplace = True,axis=1,errors="ignore")
train.rename(columns=lambda x: x.lower(), inplace=True)
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day
0,0,59,22154,999.0,1.0,2013,1,2
1,0,25,2552,899.0,1.0,2013,1,3
2,0,25,2552,899.0,-1.0,2013,1,5
3,0,25,2554,1709.05,1.0,2013,1,6
4,0,25,2555,1099.0,1.0,2013,1,15


In [27]:
#bad because it OHE the diff dates
#train = pd.get_dummies(theData)
train.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
       'date_01.01.2013', 'date_01.01.2014', 'date_01.01.2015',
       'date_01.02.2013', 'date_01.02.2014',
       ...
       'date_31.07.2014', 'date_31.07.2015', 'date_31.08.2013',
       'date_31.08.2014', 'date_31.08.2015', 'date_31.10.2013',
       'date_31.10.2014', 'date_31.10.2015', 'date_31.12.2013',
       'date_31.12.2014'],
      dtype='object', length=1039)

I have a problem here. Most items in all of the stores have 0 sold for the majority of the day. So this is hard to model.

In [None]:
sns.distplot(train['item_cnt_day'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['item_cnt_day'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['item_cnt_day'], plot=plt)
plt.show()

In [None]:
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
train['item_cnt_day'] = np.log1p(train['item_cnt_day'])

#Check the new distribution 
sns.distplot(train['item_cnt_day'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['item_cnt_day'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['item_cnt_day'], plot=plt)
plt.show()