In [134]:
import numpy as np
import pandas as pd

In [135]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [136]:
# Load the training data from the working directory and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [137]:
# Remove fully identical rows and renumber the index from 0.
# NOTE(review): `id` is still a column at this point, so two rows that differ
# only in `id` are not considered duplicates -- confirm that is intended.
df = df.drop_duplicates().reset_index(drop=True)

In [138]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB


## Handle missing values

In [139]:
# Discard rows containing any missing value, remove the `id` column,
# and renumber the index so it is contiguous again.
df = (
    df
    .dropna()
    .drop(columns='id')
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221259 entries, 0 to 221258
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      221259 non-null  object 
 1   country   221259 non-null  object 
 2   store     221259 non-null  object 
 3   product   221259 non-null  object 
 4   num_sold  221259 non-null  float64
dtypes: float64(1), object(4)
memory usage: 8.4+ MB


In [140]:
# Summary statistics of the numeric columns (count, mean, std, quartiles, min/max).
df.describe()

Unnamed: 0,num_sold
count,221259.0
mean,752.527382
std,690.165445
min,5.0
25%,219.0
50%,605.0
75%,1114.0
max,5939.0


## Test Data

In [141]:
# Read the test set and discard its `id` column in one step.
df_test = pd.read_csv('test.csv').drop(columns='id')

## Encode Features as Numbers

In [142]:
# Train: encode the 'YYYY-MM-DD' date string as the integer YYYYMMDD.
# The dashes are stripped in a single pass instead of splitting the column
# three times. The astype(str) guard keeps the cell re-runnable: once the
# column is already an integer, the `.str` accessor alone would raise.
df['date'] = df['date'].astype(str).str.replace('-', '', regex=False).astype(int)

In [143]:
# Test: encode the date string as the integer YYYYMMDD
# (assumes the same 'YYYY-MM-DD' format as the training file -- TODO confirm).
# The dashes are stripped in a single pass instead of splitting the column
# three times. The astype(str) guard keeps the cell re-runnable: once the
# column is already an integer, the `.str` accessor alone would raise.
df_test['date'] = df_test['date'].astype(str).str.replace('-', '', regex=False).astype(int)

In [144]:
# Preview the frame to confirm `date` is now an integer of the form YYYYMMDD.
df.head(3)

Unnamed: 0,date,country,store,product,num_sold
0,20100101,Canada,Discount Stickers,Kaggle,973.0
1,20100101,Canada,Discount Stickers,Kaggle Tiers,906.0
2,20100101,Canada,Discount Stickers,Kerneler,423.0


In [145]:
# Number of remaining rows per product (rows with missing values were dropped earlier).
df['product'].value_counts().to_frame()

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
Kaggle,46026
Kaggle Tiers,46026
Kerneler Dark Mode,46025
Kerneler,45962
Holographic Goose,37220


In [146]:
# One-hot encode the three categorical columns in a single call.
# Each source column is replaced by integer 0/1 indicator columns named
# '<column>_<category>', appended after the remaining columns.
df = pd.get_dummies(df, columns=['country', 'store', 'product'], dtype=int)

In [147]:
# Test: one-hot encode the same three categorical columns in a single call.
# Each source column is replaced by integer 0/1 indicator columns named
# '<column>_<category>', appended after the remaining columns.
df_test = pd.get_dummies(df_test, columns=['country', 'store', 'product'], dtype=int)

In [148]:
# Correct dtypes: the target was read as float; cast it to an integer column.
df = df.astype({'num_sold': int})
df.head()

Unnamed: 0,date,num_sold,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,store_Discount Stickers,store_Premium Sticker Mart,store_Stickers for Less,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,product_Kerneler,product_Kerneler Dark Mode
0,20100101,973,1,0,0,0,0,0,1,0,0,0,1,0,0,0
1,20100101,906,1,0,0,0,0,0,1,0,0,0,0,1,0,0
2,20100101,423,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,20100101,491,1,0,0,0,0,0,1,0,0,0,0,0,0,1
4,20100101,300,1,0,0,0,0,0,0,0,1,1,0,0,0,0


In [149]:
# Re-check dtypes and non-null counts after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221259 entries, 0 to 221258
Data columns (total 16 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   date                        221259 non-null  int32
 1   num_sold                    221259 non-null  int32
 2   country_Canada              221259 non-null  int32
 3   country_Finland             221259 non-null  int32
 4   country_Italy               221259 non-null  int32
 5   country_Kenya               221259 non-null  int32
 6   country_Norway              221259 non-null  int32
 7   country_Singapore           221259 non-null  int32
 8   store_Discount Stickers     221259 non-null  int32
 9   store_Premium Sticker Mart  221259 non-null  int32
 10  store_Stickers for Less     221259 non-null  int32
 11  product_Holographic Goose   221259 non-null  int32
 12  product_Kaggle              221259 non-null  int32
 13  product_Kaggle Tiers        221259 non-null 

In [150]:
# Correlation of every column with the target (`DataFrame.corr` defaults to Pearson).
df.corr()['num_sold']

date                         -0.040564
num_sold                      1.000000
country_Canada                0.054136
country_Finland              -0.002843
country_Italy                -0.139527
country_Kenya                -0.449873
country_Norway                0.444573
country_Singapore             0.073334
store_Discount Stickers      -0.325233
store_Premium Sticker Mart    0.231209
store_Stickers for Less       0.089933
product_Holographic Goose    -0.361666
product_Kaggle                0.356331
product_Kaggle Tiers          0.197389
product_Kerneler             -0.145131
product_Kerneler Dark Mode   -0.075360
Name: num_sold, dtype: float64

## RF Model

In [151]:
# Feature matrix: every column except the target.
X = df.drop(columns='num_sold')
# Target column.
y = df['num_sold']

# Train and test were one-hot encoded independently, so their indicator
# columns are only guaranteed to match if both files contain exactly the same
# categories. Reindex the test frame to the training columns: any indicator
# missing from test is filled with 0, any column unseen in training is
# dropped, and the column order matches what the model is fitted on.
X_test = df_test.reindex(columns=X.columns, fill_value=0)

In [152]:
# A fixed seed makes the fitted forest, and therefore the submission,
# reproducible across runs. n_jobs=-1 builds the trees on all available
# cores and does not change the result for a given seed.
rf = RandomForestRegressor(
    n_estimators=30,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
).fit(X, y)
predictions = rf.predict(X_test)

# NOTE: no hold-out evaluation is performed. The previously commented-out
# MAPE check referenced `y_test`, which is never defined in this notebook.
# To estimate the error, split (X, y) with train_test_split and score the
# held-out part with mean_absolute_percentage_error.

In [159]:
def make_submission(prediction, sub_name, test_path='test.csv'):
    """Write a submission CSV that pairs the test ids with predictions.

    Parameters
    ----------
    prediction : array-like
        Predicted `num_sold` values, one per row of the test file and in the
        same order as its rows.
    sub_name : str
        Output file name without extension; the file is written to
        '<sub_name>.csv' in the current working directory.
    test_path : str, default 'test.csv'
        CSV file whose `id` column supplies the submission ids.

    Returns
    -------
    None
        The function writes the file and prints a confirmation message.
    """
    # Only the id column is needed, so avoid parsing the rest of the file.
    ids = pd.read_csv(test_path, usecols=['id'])['id']
    my_submission = pd.DataFrame({'id': ids, 'num_sold': prediction})
    my_submission.to_csv('{}.csv'.format(sub_name), index=False)
    print('A submission file has been made')

# Round to the nearest integer before casting: a bare astype(int) truncates
# toward zero and would bias every fractional prediction downward.
make_submission(np.rint(predictions).astype(int), 'submission(rf)')

A submission file has been made
