In [None]:
 from google.colab import drive
# Mount Google Drive at /content/gdrive; the data paths used below live under it.
drive.mount('/content/gdrive')

In [None]:
!pip install -q catboost
!pip install -q --upgrade seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names; prefer the new name and fall back to the old one on older installs.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(_whitegrid_style if _whitegrid_style in plt.style.available else 'seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from math import sqrt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# None disables column-width truncation. The former sentinel -1 was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

import warnings
# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Load the sample submission, the training data and the test data (in that order)
# from the mounted Google Drive folder.
Sub, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/Sample Submission.csv',
        '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/Train.csv',
        '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/Test.csv',
    )
)

In [None]:
# Preview the first rows of the training data, then print its per-column summary.
train.head()
train.info()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Preview the first rows of the test data, then print its per-column summary.
test.head()
test.info()

In [None]:
# Count of missing values per column, for train and then for test.
train.isnull().sum()
test.isnull().sum()

In [None]:
# Summary statistics of the train and test columns.
train.describe()
test.describe()

In [None]:
# Histograms of the training columns; the trailing ';' suppresses the text repr.
train.hist(figsize=(12,8));

In [None]:
# Histograms of the test columns; the trailing ';' suppresses the text repr.
test.hist(figsize=(10,13));

In [None]:
#Finding out duplicate rows

# Boolean mask over the training rows as returned by DataFrame.duplicated();
# its sum is the number of flagged rows.
train_row_dup = train.duplicated() 
print('Duplicate rows in train dataset : ', train_row_dup.sum())


# The same check for the test set is left disabled:
# test_row_dup = test.duplicated() 
# print('Duplicate rows in test dataset : ', test_row_dup.sum())

In [None]:
# Drop duplicate rows from the training data only.
# The test frame is left untouched (its de-duplication was disabled in the original
# cell) — presumably because every test row needs a prediction; confirm.
train = train.drop_duplicates()

In [None]:
# Parse InvoiceDate into datetimes in both frames, train first and then test.
for _frame in (train, test):
    _frame['InvoiceDate'] = pd.to_datetime(_frame['InvoiceDate'])

In [None]:
# Converting dates into individual types 

# Extracting Date features

def extract_time_features(df):
    """Replace the ``InvoiceDate`` column with calendar and time-of-day features.

    The frame is modified in place: the columns ``year``, ``month``,
    ``day_of_week``, ``Day``, ``DayOfyear``, ``Week``, ``Quarter``, ``hour``
    and ``minute`` are appended (in that order) and ``InvoiceDate`` is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``InvoiceDate`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    stamps = pd.to_datetime(df['InvoiceDate'])
    df['year'] = stamps.dt.year
    df['month'] = stamps.dt.month
    df['day_of_week'] = stamps.dt.dayofweek
    df['Day'] = stamps.dt.day
    df['DayOfyear'] = stamps.dt.dayofyear
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; the ISO week from
    # isocalendar() is its replacement. isocalendar() yields a nullable UInt32 column, so
    # cast back to what .dt.week used to return: int64, or float64 when dates are missing.
    iso_week = stamps.dt.isocalendar().week
    df['Week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['Quarter'] = stamps.dt.quarter
    df['hour'] = stamps.dt.hour
    df['minute'] = stamps.dt.minute
    del df['InvoiceDate']
    return df

In [None]:
# Swap InvoiceDate for the derived date/time columns in both frames.
train = extract_time_features(train)
test = extract_time_features(test)

In [None]:
# Shapes of both frames after the date features were added.
print(train.shape)
print(test.shape)

In [None]:
# Preview both frames with the new date/time columns.
train.head()
test.head()

In [None]:
# Correlation Coefficient Matrix => Train Dataset

# Pairwise correlations of the training columns, drawn as an annotated heatmap.
corr = train.corr()
plt.figure(figsize = (15,10))
sns.heatmap(corr, cmap = 'YlGnBu', annot = True, linewidths = 0.5);

In [None]:
# Bar plot over the whole training frame (wide-form input: one bar per column).
plt.figure(figsize = (15,8))
sns.barplot(palette = 'BrBG', data = train);

## Identifying and capping outliers

In [None]:
#Train dataset

# Columns inspected for outliers.
cols = ['StockCode', 'Description', 'Quantity', 'UnitPrice', 'CustomerID', 'Country']

# One box plot per column, placed left-to-right on a 3x3 grid.
plt.figure(figsize=[25,15])
for i, column in enumerate(cols):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(train[column])

In [None]:
#Test dataset

# Same columns as for train, minus the target UnitPrice.
cols1 = ['StockCode', 'Description', 'Quantity', 'CustomerID', 'Country']

# One box plot per column, placed left-to-right on a 3x3 grid.
plt.figure(figsize=[25,15])
for j, column in enumerate(cols1):
    plt.subplot(3, 3, j + 1)
    sns.boxplot(test[column])

In [None]:
#IQR fences used to cap outliers

def remove_outlier(cols, whisker=1.5):
    """Return the (lower, upper) IQR fences of a numeric column.

    Despite its name this function removes nothing: it only computes the
    bounds ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``. Clipping or
    filtering with those bounds is left to the caller.

    Parameters
    ----------
    cols : pandas.Series
        Numeric values; ``quantile`` does not need them to be sorted.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)``.
    """
    # The original called sorted(cols) and discarded the result; quantile() needs no
    # pre-sorting, so that wasted O(n log n) pass is gone.
    Q1, Q3 = cols.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (whisker * IQR)
    upper_range = Q3 + (whisker * IQR)
    return lower_range, upper_range

In [None]:
#Train Dataset

def _cap_to_fences(values, lower, upper):
    """Pull values above `upper` down to it, then values below `lower` up to it."""
    capped = np.where(values > upper, upper, values)
    return np.where(capped < lower, lower, capped)


# Each column is capped to its own IQR fences; the fences stay available as globals.
lrQuantity, urQuantity = remove_outlier(train['Quantity'])
train['Quantity'] = _cap_to_fences(train['Quantity'], lrQuantity, urQuantity)

lrUnitPrice, urUnitPrice = remove_outlier(train['UnitPrice'])
train['UnitPrice'] = _cap_to_fences(train['UnitPrice'], lrUnitPrice, urUnitPrice)

# NOTE(review): Country is capped like a continuous column. If it holds category
# codes, this folds distinct countries into the fence value — confirm this is intended.
lrCountry, urCountry = remove_outlier(train['Country'])
train['Country'] = _cap_to_fences(train['Country'], lrCountry, urCountry)


train.shape


In [None]:
#Test Dataset

# Cap the test features with the fences already estimated on the TRAINING data
# (lrQuantity/urQuantity and lrCountry/urCountry from the train cell). Re-estimating
# them on the test set, as before, made the same raw value map to different model
# inputs in train and test.
test['Quantity'] = np.where(test['Quantity']>urQuantity, urQuantity, test['Quantity'])
test['Quantity'] = np.where(test['Quantity']<lrQuantity, lrQuantity, test['Quantity'])


# NOTE(review): Country is capped like a continuous column, mirroring the train cell;
# if it holds category codes this merges distinct countries — confirm this is intended.
test['Country'] = np.where(test['Country']>urCountry,urCountry,test['Country'])
test['Country'] = np.where(test['Country']<lrCountry,lrCountry,test['Country'])


test.shape


In [None]:
# Re-draw the training box plots, one per entry of `cols`, on a 3x3 grid.
plt.figure(figsize=[25,15])
for i, column in enumerate(cols):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(train[column])

In [None]:
# Distribution of the target. sns.distplot is deprecated and scheduled for removal in
# seaborn 0.14 (this notebook upgrades seaborn unpinned); histplot with a KDE overlay
# on a density scale is the documented replacement.
sns.histplot(train['UnitPrice'], kde=True, stat='density');

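**The target variable is highly skewed, so we reduce the skew by applying a log transformation with `np.log1p`. Models trained on this target predict on the log1p scale, so their predictions must be mapped back with `np.expm1` to obtain prices.**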

In [None]:
# Replace the target with log(1 + UnitPrice). Every model fitted afterwards therefore
# predicts on this log scale.
train['UnitPrice'] = np.log1p(train['UnitPrice'])

# sns.distplot is deprecated and scheduled for removal in seaborn 0.14; histplot with
# a KDE overlay on a density scale is the documented replacement.
sns.histplot(train['UnitPrice'], kde=True, stat='density');

In [None]:
# Separate the training frame into features (X) and target (y); InvoiceNo is dropped
# from the features of both the training and the test frame.
y = train['UnitPrice']
X = train.drop(columns=['UnitPrice', 'InvoiceNo'])
test_v = test.drop(columns=['InvoiceNo'])

In [None]:
# Shapes of the test features, the training features and the training target.
print(test_v.shape)
print(X.shape)
print(y.shape)

In [None]:
# Hold out 33% of the labelled rows for validation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.33,
    random_state = 1253631,
)

In [None]:
# Feature Scaling

# Fit the scaler on the training split only, then apply that same fitted scaler to
# the training split, the validation split and the competition test features.
sc = StandardScaler()
sc_fit = sc.fit(X_train)

X_train, X_test, test_v = (
    sc_fit.transform(part) for part in (X_train, X_test, test_v)
)

In [None]:
# Importing GridSearch and RandomSearch

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Different parameters we want to test

# AdaBoostRegressor is otherwise only imported by a later cell of this notebook, so
# without this import the cell raises NameError on "Restart & Run All".
from sklearn.ensemble import AdaBoostRegressor

# Candidate values sampled by the randomized search.
params = {'learning_rate' : [0.8, 0.9, 1, 1.2, 1.4],
          'n_estimators' : [400, 500, 600, 700]}


ada = AdaBoostRegressor(random_state=10)

In [None]:
# Randomized search over `params`: `param_comb` sampled settings, each scored by
# negative MSE on `folds` shuffled K-fold splits of the training data.
folds = 5
param_comb = 10

kf = KFold(n_splits = folds, shuffle = True, random_state = 1231)

random_search = RandomizedSearchCV(
    ada,
    param_distributions = params,
    n_iter = param_comb,
    scoring = 'neg_mean_squared_error',
    n_jobs = 4,
    cv = kf.split(X_train, y_train),
    verbose = 3,
    random_state = 1231,
)

random_search.fit(X_train, y_train)

In [None]:
# Best cross-validated score (negative MSE, per the `scoring` argument), the
# parameters that achieved it, and the corresponding estimator.
random_search.best_score_
random_search.best_params_
random_search.best_estimator_

In [None]:
# AdaBoost with fixed hyper-parameters.
# NOTE(review): presumably the values picked by the random search — confirm.
ada = AdaBoostRegressor(learning_rate = 1.2, n_estimators = 400, random_state = 1110)
ada_fit = ada.fit(X_train, y_train)

# Predictions on the validation and training splits
y_ada_pred_test = ada_fit.predict(X_test)
y_ada_pred_train = ada_fit.predict(X_train)

# RMSE on the (log1p-transformed) target: validation split first, then training split
for _label, _truth, _pred in (('Test RMSE:', y_test, y_ada_pred_test),
                              ('Train RMSE:', y_train, y_ada_pred_train)):
    print(_label, sqrt(mean_squared_error(_truth, _pred)))

In [None]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import AdaBoostRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
#  Random Forest Model 

# Seed the forest: without random_state every run grows a different forest, so neither
# the RMSE below nor the submission written from rf_fit can be reproduced.
rf = RandomForestRegressor(random_state = 1253631)
rf_fit = rf.fit(X_train, y_train)

# Predictions

y_rf_pred_test = rf_fit.predict(X_test)
y_rf_pred_train = rf_fit.predict(X_train)


# RMSE is measured on the log1p-transformed target.
print('Test RMSE:', sqrt(mean_squared_error(y_test, y_rf_pred_test))) #Test RMSE
print('Train RMSE:', sqrt(mean_squared_error(y_train, y_rf_pred_train))) #Train RMSE 

In [None]:
#  XGBR  Model

xgb = XGBRegressor(learning_rate = 0.01, max_depth = 6, colsample_bytree = 0.8, seed = 100)
xgb_fit = xgb.fit(X_train, y_train)

# Predictions on the validation and training splits
y_xgb_pred_test = xgb_fit.predict(X_test)
y_xgb_pred_train = xgb_fit.predict(X_train)

# RMSE on the (log1p-transformed) target: validation split first, then training split
for _label, _truth, _pred in (('Test RMSE:', y_test, y_xgb_pred_test),
                              ('Train RMSE:', y_train, y_xgb_pred_train)):
    print(_label, sqrt(mean_squared_error(_truth, _pred)))

In [None]:
#  LGBMR  Model

# The model runs on library defaults. An explicit configuration used to sit here,
# commented out: gbdt / regression / rmse, bagging_fraction 0.8, bagging_freq 1,
# lambda_l1 = lambda_l2 = 0.01, learning_rate 0.01, max_bin 255, max_depth 6,
# min_data_in_bin 1, min_data_in_leaf 1, num_leaves 31, n_estimators 10000,
# random_state 123456789, n_jobs -1.
lgb = LGBMRegressor()
lgb_fit = lgb.fit(X_train, y_train)

# Predictions on the validation and training splits
y_lgb_pred_test = lgb_fit.predict(X_test)
y_lgb_pred_train = lgb_fit.predict(X_train)

# RMSE on the (log1p-transformed) target: validation split first, then training split
for _label, _truth, _pred in (('Test RMSE:', y_test, y_lgb_pred_test),
                              ('Train RMSE:', y_train, y_lgb_pred_train)):
    print(_label, sqrt(mean_squared_error(_truth, _pred)))

In [None]:
#  CatBoost  Model

cat = CatBoostRegressor()
cat_fit = cat.fit(X_train, y_train)

# Predictions on the validation and training splits
y_cat_pred_test = cat_fit.predict(X_test)
y_cat_pred_train = cat_fit.predict(X_train)

# RMSE on the (log1p-transformed) target: validation split first, then training split
for _label, _truth, _pred in (('Test RMSE:', y_test, y_cat_pred_test),
                              ('Train RMSE:', y_train, y_cat_pred_train)):
    print(_label, sqrt(mean_squared_error(_truth, _pred)))

In [None]:
# Random Forest File Submission

# The model was fitted on log1p(UnitPrice), so its raw output is on the log scale;
# np.expm1 maps it back to price units before it is written out.
test_pred = np.expm1(rf_fit.predict(test_v))

# save results to csv
subm = pd.DataFrame({'UnitPrice': test_pred})


filename='/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/RandomForest.csv'
subm.to_csv(filename, index=False)

In [None]:
# XGBM File Submission

# The model was fitted on log1p(UnitPrice), so its raw output is on the log scale;
# np.expm1 maps it back to price units before it is written out.
test_pred = np.expm1(xgb_fit.predict(test_v))

# save results to csv
subm = pd.DataFrame({'UnitPrice': test_pred})


filename='/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/XGB.csv'
subm.to_csv(filename, index=False)

In [None]:
# LGBM File Submission

# The model was fitted on log1p(UnitPrice), so its raw output is on the log scale;
# np.expm1 maps it back to price units before it is written out.
test_pred = np.expm1(lgb_fit.predict(test_v))

# save results to csv
subm = pd.DataFrame({'UnitPrice': test_pred})


filename='/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/LGBM.csv'
subm.to_csv(filename, index=False)

In [None]:
# CatBoost File Submission

# The model was fitted on log1p(UnitPrice), so its raw output is on the log scale;
# np.expm1 maps it back to price units before it is written out.
test_pred = np.expm1(cat_fit.predict(test_v))

# save results to csv
subm = pd.DataFrame({'UnitPrice': test_pred})


filename='/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/CATBOOST.csv'
subm.to_csv(filename, index=False)