In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import time
from sklearn.metrics import mean_squared_error
pd.set_option('display.max_rows', 500)

In [2]:
train_raw= pd.read_csv('Train.csv',parse_dates=['InvoiceDate'])
test_raw =pd.read_csv('Test.csv',parse_dates=['InvoiceDate'])

In [3]:
train_raw['log_unitprice']=np.log(train_raw['UnitPrice']+1)

In [4]:
combine= train_raw.append(test_raw)

In [5]:
combine.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'log_unitprice'],
      dtype='object')

In [6]:
combine['Quantity']=[i*-1 if i<0 else i for i in combine['Quantity']]

In [7]:
combine['Invoice_encode']=combine['InvoiceNo'].astype('category').cat.codes
combine['CustomerID_encode']=combine['CustomerID'].astype('category').cat.codes
combine['log_quantity']=np.log(combine['Quantity']+1)


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [9]:
train=combine[combine['UnitPrice'].isnull()!=True]
test=combine[combine['UnitPrice'].isnull()==True]
test=test.drop(['UnitPrice','log_unitprice'], axis=1)

In [10]:
train.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'log_unitprice', 'Invoice_encode',
       'CustomerID_encode', 'log_quantity'],
      dtype='object')

In [11]:
test.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'CustomerID', 'Country', 'Invoice_encode', 'CustomerID_encode',
       'log_quantity'],
      dtype='object')

In [12]:
X= train[[ 'StockCode', 'Description', 'Quantity',  'Country', 'Invoice_encode','CustomerID_encode']]
y=train[['UnitPrice']]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=12)

In [13]:
lr=LinearRegression()

In [14]:
lr.fit(X_train,y_train)

LinearRegression()

In [15]:
train_pred=lr.predict(X_train)

In [16]:
print("RMSE of train data:",np.sqrt(mean_squared_error(train_pred,y_train)))

RMSE of train data: 90.67332655387267


#MOdel 2 with Quantity log transformation

In [17]:
X1= train[[ 'StockCode', 'Description','log_quantity' ,  'Country', 'Invoice_encode','CustomerID_encode']]
y1=train[['log_unitprice']]

In [18]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y1, test_size=0.3,random_state=12)
lr1=LinearRegression()
lr1.fit(X_train1,y_train1)
train_pred1=lr1.predict(X_train1)
print("RMSE of train data:",np.sqrt(mean_squared_error(train_pred1,y_train1)))

RMSE of train data: 0.5514757510200429


MODEL 3

In [19]:
X2= train[[ 'StockCode', 'Description','log_quantity' , 'Invoice_encode','CustomerID_encode']]
y2=train[['log_unitprice']]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2, test_size=0.3,random_state=12)
lr2=LinearRegression()
lr2.fit(X_train2,y_train2)
train_pred2=lr2.predict(X_train2)
print("RMSE of train data:",np.sqrt(mean_squared_error(train_pred2,y_train2)))

RMSE of train data: 0.5538283449608424


model 4

In [20]:
X3= train[[ 'log_quantity' , 'Invoice_encode','CustomerID_encode']]
y3=train[['log_unitprice']]
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3,y3, test_size=0.3,random_state=12)
lr3=LinearRegression()
lr3.fit(X_train3,y_train3)
train_pred3=lr3.predict(X_train3)
print("RMSE of train data:",np.sqrt(mean_squared_error(train_pred3,y_train3)))

RMSE of train data: 0.5576370556894406


In [21]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

In [24]:
cb=CatBoostRegressor(iterations=1000,
                           loss_function='RMSE',
                           depth=6,
                           cat_features=['StockCode', 'Description','Invoice_encode'],
                           
                           leaf_estimation_iterations=10,
                          
                           logging_level='Silent',
                           random_seed=42
                          )

In [None]:
cb.fit(X_train1,y_train1)
cb_pred1=cb.predict(X_train1)
print("RMSE of train data:",np.sqrt(mean_squared_error(cb_pred1,y_train1)))

In [24]:
testdata= test[[ 'StockCode', 'Description','log_quantity' ,   'CustomerID_encode']]
result=cb.predict(testdata)

In [25]:
result=np.exp(result)

In [28]:
df=pd.DataFrame(result)
df=df.round(0)
df=df.astype(int)


Unnamed: 0,0
0,2
1,2
2,5
3,2
4,9
...,...
122044,2
122045,3
122046,2
122047,4


In [29]:
df.to_csv('result.csv')

In [None]:
cb.fit(X_train2,y_train2)
cb_pred2=cb.predict(X_train2)
print("RMSE of train data:",np.sqrt(mean_squared_error(cb_pred2,y_train2)))

In [None]:
cb.fit(X_train3,y_train3)
cb_pred3=cb.predict(X_train3)
print("RMSE of train data:",np.sqrt(mean_squared_error(cb_pred3,y_train3)))