In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pandas_profiling
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training table and the test table from the local Dataset folder.
df, df_test = (pd.read_csv(csv_path) for csv_path in ('Dataset/Train.csv', 'Dataset/Test.csv'))

In [3]:
# Quick size check of the raw test table (rows, columns).
df_test.shape

(5763, 8)

In [4]:
# Remove the Item_Id column from both tables in place.
for frame in (df, df_test):
    frame.drop(columns=['Item_Id'], inplace=True)

In [5]:
# Parse the Date column and keep only its calendar year as a new Year column.
for frame in (df, df_test):
    frame['Year'] = pd.to_datetime(frame['Date']).dt.year

In [6]:
# Date has been reduced to Year above, so drop the original column in place.
for frame in (df, df_test):
    frame.drop(columns=['Date'], inplace=True)

In [7]:
# Counts of the first 50 entries of value_counts() (sorted most frequent
# first) for Market_Category, computed on the training table only.
top_mktcat = df['Market_Category'].value_counts()[:50]


def _bucket_market_category(value):
    """Return str(value) when value is one of the kept categories, else 'others'."""
    if value in top_mktcat.index:
        return str(value)
    return 'others'


# Apply the same training-derived bucketing to both tables.
for frame in (df, df_test):
    frame['Market_Category_Updated'] = frame['Market_Category'].apply(_bucket_market_category)

In [8]:
# Cast the three categorical columns to plain strings in both tables.
for frame in (df, df_test):
    for column in ('State_of_Country', 'Market_Category', 'Product_Category'):
        frame[column] = frame[column].astype(str)

In [9]:
# Add a log-transformed demand column, computed as log(Demand + 1).
for frame in (df, df_test):
    frame['Demand_Log'] = np.log(frame['Demand'] + 1)

In [10]:
# Market_Category and Demand have been replaced by Market_Category_Updated
# and Demand_Log, so drop the originals in place.
for frame in (df, df_test):
    frame.drop(columns=['Market_Category', 'Demand'], inplace=True)

In [11]:
# Inspect the training columns and their dtypes after the preprocessing above.
df.dtypes

State_of_Country            object
Product_Category            object
Grade                        int64
Low_Cap_Price                int64
High_Cap_Price               int64
Year                         int64
Market_Category_Updated     object
Demand_Log                 float64
dtype: object

In [12]:
# Same inspection for the test table; per the recorded output it has the same
# columns as the training table except Low_Cap_Price.
df_test.dtypes

State_of_Country            object
Product_Category            object
Grade                        int64
High_Cap_Price               int64
Year                         int64
Market_Category_Updated     object
Demand_Log                 float64
dtype: object

In [13]:
# Object-typed columns that are one-hot encoded below.
categorical = ['State_of_Country', 'Product_Category', 'Market_Category_Updated']

In [14]:
# One-hot encode each categorical column. The encoder is fitted on the
# training table only and then reused for the test table, so both tables get
# the same indicator columns in the same order; a category that appears only
# in the test table encodes as an all-zero row (handle_unknown='ignore').
for col in categorical:
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore').fit(df[[col]])

    # Name every indicator "<column>_<category>". Bare integer labels would
    # repeat across the encoded columns and mix int/str column names, which
    # recent scikit-learn releases reject when the frame is passed to fit().
    onehot_names = ['{}_{}'.format(col, category) for category in enc.categories_[0]]

    # .toarray() densifies the encoder's default sparse output; this avoids
    # the `sparse=` keyword, which newer scikit-learn releases renamed.
    # Reusing each frame's own index keeps the column-wise concat row-aligned.
    train_onehot = pd.DataFrame(enc.transform(df[[col]]).toarray(),
                                columns=onehot_names, index=df.index)
    df = pd.concat([df, train_onehot], axis=1)

    test_onehot = pd.DataFrame(enc.transform(df_test[[col]]).toarray(),
                               columns=onehot_names, index=df_test.index)
    df_test = pd.concat([df_test, test_onehot], axis=1)

In [15]:
# The raw categorical columns are now represented by their indicator columns,
# so remove them from both tables in place.
for frame in (df, df_test):
    frame.drop(columns=['State_of_Country', 'Product_Category', 'Market_Category_Updated'], inplace=True)

In [16]:
# Separate the regression target (Low_Cap_Price) from the feature columns.
df_Y = df['Low_Cap_Price'].copy()
df_X = df.drop(columns=['Low_Cap_Price'])

In [17]:
from sklearn.decomposition import PCA

In [18]:
# Reduce the feature matrix to 30 principal components using the full SVD solver.
# NOTE(review): no scaling step is visible before this PCA, so columns with
# large magnitudes may dominate the components — confirm this is intended.
pca = PCA(n_components=30, svd_solver='full')

In [19]:
# Fit the projection on the training features only (all training rows, i.e.
# before the train/validation split performed further down).
pca.fit(df_X)

PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
    svd_solver='full', tol=0.0, whiten=False)

In [20]:
# Project both tables onto the fitted components.
# Both names are rebound to the transformed output, so re-running this cell
# on its own would pass already-reduced data back into pca.transform;
# re-run from the cells above instead.
df_X = pca.transform(df_X)
df_test = pca.transform(df_test)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 25% of the training rows for validation; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.25, random_state = 0)

In [23]:
import sklearn.metrics as metrics

In [24]:
from sklearn.tree import DecisionTreeRegressor

In [25]:
def check_model(reg, X_train, y_train, X_test, y_test):
    """Fit ``reg`` on the training split and print its R^2 on the test split.

    The estimator is fitted in place (any earlier fit is replaced) and the
    score is only printed; the function returns None.
    """
    reg.fit(X_train, y_train)
    score = metrics.r2_score(y_test, reg.predict(X_test))
    print(score)

In [26]:
# A fixed random_state makes the fitted tree, and therefore the printed score,
# repeatable across runs. check_model fits the estimator itself, so no
# separate fit call is needed here.
dt_reg = DecisionTreeRegressor(random_state=0)
check_model(dt_reg, X_train, y_train, X_test, y_test)

0.71422259594359


In [27]:
from sklearn.linear_model import Ridge

In [28]:
# Ridge regression with default hyperparameters, scored on the hold-out split.
ridge_reg = Ridge()
check_model(ridge_reg, X_train, y_train, X_test, y_test)

0.6698649010035114


In [29]:
from sklearn.linear_model import Lasso

In [30]:
# Lasso regression with default hyperparameters, scored on the hold-out split.
lasso_reg = Lasso()
check_model(lasso_reg, X_train, y_train, X_test, y_test)

0.6699340428751426


In [31]:
from sklearn.svm import SVR
# Support vector regression with default hyperparameters, scored on the
# hold-out split.
svr_reg = SVR()
check_model(svr_reg, X_train, y_train, X_test, y_test)

0.3323126587879843


In [32]:
from sklearn.model_selection import cross_val_score

In [33]:
# Mean 10-fold cross-validation score of the decision tree on the full
# PCA-reduced training set.
# NOTE(review): the recorded mean (~0.49) is well below the single hold-out
# score (~0.71). An integer cv presumably yields contiguous, unshuffled folds,
# so row ordering in the data may matter — confirm before trusting either number.
cv = cross_val_score(dt_reg, df_X, df_Y, cv=10)
cv.mean()

0.4870788253517541

In [34]:
# Mean 10-fold cross-validation score for the ridge model.
cv = cross_val_score(ridge_reg, df_X, df_Y, cv=10)
cv.mean()

0.5928947725766527

In [35]:
# Mean 10-fold cross-validation score for the lasso model.
cv = cross_val_score(lasso_reg, df_X, df_Y, cv=10)
cv.mean()

0.5944992640004457

In [36]:
# Mean 10-fold cross-validation score for the SVR model.
cv = cross_val_score(svr_reg, df_X, df_Y, cv=10)
cv.mean()

0.19027957326930683

In [52]:
def get_submission(reg, df_X, df_y, df_test, filename, test_csv_path='Dataset/Test.csv'):
    """Fit ``reg`` on the full training data and write a submission CSV.

    Parameters
    ----------
    reg : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    df_X, df_y : training features and target, passed to ``reg.fit``.
    df_test : test features passed to ``reg.predict``. Its rows must be in
        the same order as the rows of ``test_csv_path``, because ids and
        predictions are paired by position.
    filename : path of the CSV file to write.
    test_csv_path : CSV file that supplies the ``Item_Id`` column. Defaults
        to the path that was previously hard-coded, so existing calls are
        unaffected.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    """
    reg.fit(df_X, df_y)
    # Predictions are rounded to whole numbers before being written.
    submission_preds = np.around(np.ravel(reg.predict(df_test)), decimals=0)

    # Only the id column is needed from the raw test file.
    item_ids = pd.read_csv(test_csv_path, usecols=['Item_Id'])['Item_Id'].tolist()
    if len(item_ids) != len(submission_preds):
        raise ValueError(
            'Got {} predictions for {} Item_Id values'.format(len(submission_preds), len(item_ids))
        )

    submission = pd.DataFrame({'Item_Id': item_ids, 'Low_Cap_Price': submission_preds},
                              columns=['Item_Id', 'Low_Cap_Price'])
    submission.to_csv(filename, index=False)

In [53]:
# Decision-tree submission, refitted on all training rows. The fixed
# random_state makes the written file repeatable across runs.
get_submission(DecisionTreeRegressor(random_state=0), df_X, df_Y, df_test, 'dt_submission.csv')

In [54]:
# Ridge submission: a fresh default Ridge is fitted on all training rows.
get_submission(Ridge(), df_X, df_Y, df_test, 'ridge_submission.csv')

In [55]:
# Lasso submission: a fresh default Lasso is fitted on all training rows.
get_submission(Lasso(), df_X, df_Y,df_test, 'lasso_submission.csv')

In [56]:
# SVR submission: a fresh default SVR is fitted on all training rows.
get_submission(SVR(), df_X, df_Y, df_test, 'svr_submission.csv')