In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pandas_profiling
from sklearn.preprocessing import OneHotEncoder

In [111]:
df = pd.read_csv('Dataset/Train.csv')
df_test = pd.read_csv('Dataset/Test.csv')

In [112]:
df_test.shape

(5763, 8)

In [113]:
df.drop(['Item_Id'], axis= 1, inplace=True)
df_test.drop(['Item_Id'], axis= 1, inplace=True)

In [114]:
df['Year'] = pd.to_datetime(df.Date).dt.year
df_test['Year'] = pd.to_datetime(df_test.Date).dt.year

In [115]:
df.drop(['Date'], axis= 1, inplace=True)
df_test.drop(['Date'], axis= 1, inplace=True)

In [116]:
top_mktcat = df.Market_Category.value_counts()[:50]

df['Market_Category_Updated'] = df['Market_Category'].apply(lambda x: str(x) if x in top_mktcat else 'others')
df_test['Market_Category_Updated'] = df_test['Market_Category'].apply(lambda x: str(x) if x in top_mktcat else 'others')

In [117]:
df.State_of_Country = df.State_of_Country.astype(str)#.astype('category')
df.Market_Category = df.Market_Category.astype(str)#.astype('category')
df.Product_Category = df.Product_Category.astype(str)#.astype('category')

df_test.State_of_Country = df_test.State_of_Country.astype(str)#.astype('category')
df_test.Market_Category = df_test.Market_Category.astype(str)#.astype('category')
df_test.Product_Category = df_test.Product_Category.astype(str)#.astype('category')

In [118]:
df['Demand_Log'] = np.log(df['Demand']+1)
df_test['Demand_Log'] = np.log(df_test['Demand']+1)

In [119]:
df.drop(['Market_Category', 'Demand'], inplace=True, axis=1)
df_test.drop(['Market_Category', 'Demand'], inplace=True, axis=1)

In [120]:
df.dtypes

State_of_Country            object
Product_Category            object
Grade                        int64
Low_Cap_Price                int64
High_Cap_Price               int64
Year                         int64
Market_Category_Updated     object
Demand_Log                 float64
dtype: object

In [121]:
df_test.dtypes

State_of_Country            object
Product_Category            object
Grade                        int64
High_Cap_Price               int64
Year                         int64
Market_Category_Updated     object
Demand_Log                 float64
dtype: object

In [122]:
categorical = ['State_of_Country', 'Product_Category', 'Market_Category_Updated']
categorical_idx = [0,1,6]

In [123]:
for col in categorical:
    i = df.columns.tolist().index(col)
    enc = OneHotEncoder(categories='auto', sparse = False, handle_unknown='ignore').fit(df.iloc[:,[i]])
    
    temp = pd.DataFrame(enc.transform(df.iloc[:,[i]]))
    df = pd.concat([df, temp], axis = 1)
    
    i_test = df_test.columns.tolist().index(col)
    temp = pd.DataFrame(enc.transform(df_test.iloc[:,[i_test]]))
    df_test = pd.concat([df_test, temp], axis = 1)

In [124]:
df.drop(['State_of_Country', 'Product_Category', 'Market_Category_Updated'], inplace=True, axis=1)
df_test.drop(['State_of_Country', 'Product_Category', 'Market_Category_Updated'], inplace=True, axis=1)

In [125]:
df_Y = df.loc[:,'Low_Cap_Price'].copy()
df_X = df.drop(['Low_Cap_Price'], axis = 1)

In [126]:
from sklearn.decomposition import PCA

In [127]:
pca = PCA(n_components=40, svd_solver='full')

In [128]:
pca.fit(df_X)

PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
    svd_solver='full', tol=0.0, whiten=False)

In [129]:
df_X = pca.transform(df_X)
df_test = pca.transform(df_test)

In [130]:
from sklearn.model_selection import train_test_split

In [131]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.05, random_state = 0)

In [132]:
import sklearn.metrics as metrics

In [133]:
from sklearn.tree import DecisionTreeRegressor

In [134]:
reg = DecisionTreeRegressor()

In [135]:
reg.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [136]:
preds = reg.predict(X_test)

In [137]:
metrics.r2_score(y_test, preds)

0.7172267315297758

In [138]:
submission_preds = reg.predict(df_test)

In [139]:
submission_tbl = pd.read_csv('Dataset/Test.csv')

In [140]:
submission_tbl

Unnamed: 0,Item_Id,Date,State_of_Country,Market_Category,Product_Category,Grade,Demand,High_Cap_Price
0,IT_265079,2014-01-19,0,3,0,0,0.1,5303
1,IT_265087,2014-01-19,0,268,0,2,1.4,10492
2,IT_265107,2014-01-19,0,320,0,0,13.4,24706
3,IT_265139,2014-01-19,17,358,0,3,5.5,23464
4,IT_265142,2014-01-19,18,23,5,2,0.0,6222
...,...,...,...,...,...,...,...,...
5758,IT_361691,2015-09-29,21,375,10,0,3.4,10925
5759,IT_361694,2015-09-29,21,310,13,0,1.8,9089
5760,IT_361699,2015-09-29,21,445,10,0,0.2,9656
5761,IT_361719,2015-09-29,21,452,13,0,3.4,8070


In [141]:
submission_preds.shape

(5763,)

In [142]:
submission = pd.DataFrame(submission_preds, index=submission_tbl.Item_Id.tolist()).reset_index()

In [143]:
submission.columns = ['Item_Id', 'Low_Cap_Price']

In [144]:
submission

Unnamed: 0,Item_Id,Low_Cap_Price
0,IT_265079,1565.0
1,IT_265087,1229.0
2,IT_265107,2253.0
3,IT_265139,2093.0
4,IT_265142,3460.0
...,...,...
5758,IT_361691,5780.0
5759,IT_361694,4532.0
5760,IT_361699,4851.0
5761,IT_361719,2227.0


In [145]:
submission.to_csv('submission.csv', index=False)