In [None]:
 from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV paths used later in this
# notebook point inside this mount.
drive.mount('/content/gdrive')

In [None]:
!pip install -q catboost
!pip install -q --upgrade seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from math import sqrt
from sklearn.preprocessing import StandardScaler

from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the result of every expression statement in a cell,
# not only the last one. Cells below that list several bare expressions
# (e.g. train.head() followed by train.info()) rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

# Show full cell contents without truncation. None is the supported
# "no limit" value; the former -1 sentinel is deprecated and rejected by
# current pandas releases.
pd.set_option('display.max_colwidth', None)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below. Consider
# narrowing the filter to specific categories or modules.
warnings.simplefilter('ignore')

In [None]:
# All competition files live in the same Drive folder; keep that folder in a
# single constant so the location only has to be changed in one place.
DATA_DIR = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH'

Sub = pd.read_csv(DATA_DIR + '/Sample Submission.csv')
train = pd.read_csv(DATA_DIR + '/Train.csv')
test = pd.read_csv(DATA_DIR + '/Test.csv')

In [None]:
# Preview the first rows of the training data, then print its column summary.
# Both outputs appear because ast_node_interactivity is set to "all" above.
train.head()
train.info()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Same overview for the competition test set: first rows, then column summary.
test.head()
test.info()

In [None]:
# Count missing values per column in the training and test frames.
train.isnull().sum()
test.isnull().sum()

In [None]:
# Summary statistics for the training and test frames.
train.describe()
test.describe()

In [None]:
# Parse the InvoiceDate column into pandas datetimes in both frames.
for frame in (train, test):
    frame['InvoiceDate'] = pd.to_datetime(frame['InvoiceDate'])

In [None]:
# Break the invoice timestamp into separate calendar and clock components

# Extracting Date features

def extract_time_features(df):
    """Expand the ``InvoiceDate`` column into numeric calendar/clock columns.

    The frame is modified in place: nine feature columns are appended and the
    ``InvoiceDate`` column is removed. The same object is returned, so the
    call can also be used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``InvoiceDate`` column that ``pd.to_datetime`` can
        parse.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the added columns ``year``, ``month``,
        ``day_of_week`` (Monday=0), ``Day``, ``DayOfyear``, ``weekofyear``
        (ISO week number), ``weekend`` (1 for Saturday/Sunday, else 0),
        ``hour`` and ``minute``, and without ``InvoiceDate``.

    Raises
    ------
    KeyError
        If ``df`` has no ``InvoiceDate`` column (for example when the function
        is applied a second time to the same frame).
    """
    invoice_ts = pd.to_datetime(df['InvoiceDate'])
    parts = invoice_ts.dt

    df['year'] = parts.year
    df['month'] = parts.month
    df['day_of_week'] = parts.dayofweek
    df['Day'] = parts.day
    df['DayOfyear'] = parts.dayofyear

    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. Keep the old accessor only as a
    # fallback for pandas versions that predate isocalendar().
    if hasattr(parts, 'isocalendar'):
        iso_week = parts.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; cast back to the
        # plain dtype the old accessor produced (float only if dates are missing).
        has_missing = iso_week.isna().any()
        df['weekofyear'] = iso_week.astype('float64' if has_missing else 'int64')
    else:
        df['weekofyear'] = parts.weekofyear

    df['weekend'] = (parts.weekday >= 5).astype(int)
    df['hour'] = parts.hour
    df['minute'] = parts.minute

    # The raw timestamp is dropped once it has been decomposed.
    del df['InvoiceDate']
    return df

In [None]:
# Replace the timestamp with its derived features in both frames (train first).
train, test = (extract_time_features(frame) for frame in (train, test))

In [None]:
# Row/column counts after feature extraction, one frame per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Inspect the first rows of both frames after the time features were added.
train.head()
test.head()

In [None]:
# Re-check column dtypes and non-null counts now that the new columns exist.
train.info()

In [None]:
# Correlation Coefficient Matrix => Train Dataset

# Pairwise correlations between the training columns, drawn as an annotated heatmap.
corr = train.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, cmap='YlGnBu', linewidths=0.5, ax=ax);

In [None]:
# One bar per training column on an explicitly created 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=train, palette='BrBG', ax=ax);

In [None]:
# Box plot of every training column on a shared axes.
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=train, ax=ax);

In [None]:
# Columns to remove from both frames before modelling; consumed by the
# drop step in the next cell.
drop_col_array = ['InvoiceNo' ]
print(drop_col_array)

In [None]:
# drop InvoiceNo

# Remove the listed columns from both frames and confirm the new shapes.
train = train.drop(columns=drop_col_array)
test = test.drop(columns=drop_col_array)
print(train.shape, test.shape)

In [None]:
# Separate the regression target (UnitPrice) from the feature matrix.
y = train['UnitPrice']
X = train.drop(columns=['UnitPrice']).values

In [None]:
# Convert the competition test frame to a plain array. np.array accepts both a
# DataFrame and an ndarray, so re-running this cell no longer fails with
# AttributeError on the already-converted array.
test = np.array(test)

In [None]:
# Shapes of the full training frame, the feature matrix and the target.
print(train.shape, X.shape, y.shape, sep='\n')

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Feature Scaling
# Fit the scaler on the training split only, then apply the same transform to
# the validation split and to the competition test set.
sc = StandardScaler()
sc_fit = sc.fit(X_train)
X_train, X_test = sc_fit.transform(X_train), sc_fit.transform(X_test)
test_v = sc.transform(test)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
#  Linear Regression Model 

lr = LinearRegression()
lr_fit = lr.fit(X_train, y_train)

# Predict on the held-out split and on the data the model was fitted on
y_lr_pred_test = lr_fit.predict(X_test)
y_lr_pred_train = lr_fit.predict(X_train)

# Root-mean-squared error for each split
rmse_lr_test = sqrt(mean_squared_error(y_test, y_lr_pred_test))
rmse_lr_train = sqrt(mean_squared_error(y_train, y_lr_pred_train))
print('Test RMSE:', rmse_lr_test)
print('Train RMSE:', rmse_lr_train)

In [None]:
#  Decision Tree Model 

# random_state pins the estimator's internal randomness so the RMSE values
# below are reproducible across runs; 101 matches the seed used for the
# train/validation split.
dt = DecisionTreeRegressor(random_state=101)
dt_fit = dt.fit(X_train, y_train)

# Predictions on the held-out split and on the training split

y_dt_pred_test = dt_fit.predict(X_test)
y_dt_pred_train = dt_fit.predict(X_train)


print('Test RMSE:', sqrt(mean_squared_error(y_test, y_dt_pred_test))) #Test RMSE
print('Train RMSE:', sqrt(mean_squared_error(y_train, y_dt_pred_train))) #Train RMSE

In [None]:
#  KNN Model 

knn = KNeighborsRegressor()
knn_fit = knn.fit(X_train, y_train)

# Predict on the held-out split and on the data the model was fitted on
y_knn_pred_test = knn_fit.predict(X_test)
y_knn_pred_train = knn_fit.predict(X_train)

# Root-mean-squared error for each split
rmse_knn_test = sqrt(mean_squared_error(y_test, y_knn_pred_test))
rmse_knn_train = sqrt(mean_squared_error(y_train, y_knn_pred_train))
print('Test RMSE:', rmse_knn_test)
print('Train RMSE:', rmse_knn_train)

In [None]:
#  Random Forest Model 

# random_state fixes the bootstrap sampling and feature selection so the RMSE
# values below are reproducible across runs; 101 matches the seed used for the
# train/validation split.
rf = RandomForestRegressor(random_state=101)
rf_fit = rf.fit(X_train, y_train)

# Predictions on the held-out split and on the training split

y_rf_pred_test = rf_fit.predict(X_test)
y_rf_pred_train = rf_fit.predict(X_train)


print('Test RMSE:', sqrt(mean_squared_error(y_test, y_rf_pred_test))) #Test RMSE
print('Train RMSE:', sqrt(mean_squared_error(y_train, y_rf_pred_train))) #Train RMSE

In [None]:
#  AdaBoost Model 

# random_state pins the estimator's internal randomness so the RMSE values
# below are reproducible across runs; 101 matches the seed used for the
# train/validation split.
ada = AdaBoostRegressor(random_state=101)
ada_fit = ada.fit(X_train, y_train)

# Predictions on the held-out split and on the training split

y_ada_pred_test = ada_fit.predict(X_test)
y_ada_pred_train = ada_fit.predict(X_train)


print('Test RMSE:', sqrt(mean_squared_error(y_test, y_ada_pred_test))) #Test RMSE
print('Train RMSE:', sqrt(mean_squared_error(y_train, y_ada_pred_train))) #Train RMSE

In [None]:
#  GBM  Model 

# random_state pins the estimator's internal randomness so the RMSE values
# below are reproducible across runs; 101 matches the seed used for the
# train/validation split.
gbm = GradientBoostingRegressor(random_state=101)
gbm_fit = gbm.fit(X_train, y_train)

# Predictions on the held-out split and on the training split

y_gbm_pred_test = gbm_fit.predict(X_test)
y_gbm_pred_train = gbm_fit.predict(X_train)


print('Test RMSE:', sqrt(mean_squared_error(y_test, y_gbm_pred_test))) #Test RMSE
print('Train RMSE:', sqrt(mean_squared_error(y_train, y_gbm_pred_train))) #Train RMSE

In [None]:
#  MLPR  Model 

# random_state fixes the weight initialisation and batch shuffling so the RMSE
# values below are reproducible across runs; 101 matches the seed used for the
# train/validation split.
mlp = MLPRegressor(random_state=101)
mlp_fit = mlp.fit(X_train, y_train)

# Predictions on the held-out split and on the training split

y_mlp_pred_test = mlp_fit.predict(X_test)
y_mlp_pred_train = mlp_fit.predict(X_train)


print('Test RMSE:', sqrt(mean_squared_error(y_test, y_mlp_pred_test))) #Test RMSE
print('Train RMSE:', sqrt(mean_squared_error(y_train, y_mlp_pred_train))) #Train RMSE

In [None]:
#  XGBoost Regressor Model

xgb = XGBRegressor()
xgb_fit = xgb.fit(X_train, y_train)

# Predict on the held-out split and on the data the model was fitted on
y_xgb_pred_test = xgb_fit.predict(X_test)
y_xgb_pred_train = xgb_fit.predict(X_train)

# Root-mean-squared error for each split
rmse_xgb_test = sqrt(mean_squared_error(y_test, y_xgb_pred_test))
rmse_xgb_train = sqrt(mean_squared_error(y_train, y_xgb_pred_train))
print('Test RMSE:', rmse_xgb_test)
print('Train RMSE:', rmse_xgb_train)

In [None]:
#  LGBMR  Model 

lgb = LGBMRegressor()
lgb_fit = lgb.fit(X_train, y_train)

# Predict on the held-out split and on the data the model was fitted on
y_lgb_pred_test = lgb_fit.predict(X_test)
y_lgb_pred_train = lgb_fit.predict(X_train)

# Root-mean-squared error for each split
rmse_lgb_test = sqrt(mean_squared_error(y_test, y_lgb_pred_test))
rmse_lgb_train = sqrt(mean_squared_error(y_train, y_lgb_pred_train))
print('Test RMSE:', rmse_lgb_test)
print('Train RMSE:', rmse_lgb_train)

In [None]:
#  CatBoost  Model 

cat = CatBoostRegressor()
cat_fit = cat.fit(X_train, y_train)

# Predict on the held-out split and on the data the model was fitted on
y_cat_pred_test = cat_fit.predict(X_test)
y_cat_pred_train = cat_fit.predict(X_train)

# Root-mean-squared error for each split
rmse_cat_test = sqrt(mean_squared_error(y_test, y_cat_pred_test))
rmse_cat_train = sqrt(mean_squared_error(y_train, y_cat_pred_train))
print('Test RMSE:', rmse_cat_test)
print('Train RMSE:', rmse_cat_train)

In [None]:
# Linear Regression File Submission

# Predict the scaled competition test set with the fitted linear model
test_pred = lr_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/LR.csv'
subm.to_csv(filename, index=False)

In [None]:
# Decision Tree File Submission

# Predict the scaled competition test set with the fitted decision tree
test_pred = dt_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/DT.csv'
subm.to_csv(filename, index=False)

In [None]:
# KNN File Submission

# Predict the scaled competition test set with the fitted KNN model
test_pred = knn_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/KNN.csv'
subm.to_csv(filename, index=False)

In [None]:
# Random Forest File Submission

# Predict the scaled competition test set with the fitted random forest
test_pred = rf_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/RandomForest.csv'
subm.to_csv(filename, index=False)

In [None]:
# AdaBoost File Submission

# Predict the scaled competition test set with the fitted AdaBoost model
test_pred = ada_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/Adaboost.csv'
subm.to_csv(filename, index=False)

In [None]:
# GBM File Submission

# Predict the scaled competition test set with the fitted gradient boosting model
test_pred = gbm_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/GBM.csv'
subm.to_csv(filename, index=False)

In [None]:
# MLP File Submission

# Predict the scaled competition test set with the fitted MLP model
test_pred = mlp_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/MLP.csv'
subm.to_csv(filename, index=False)

In [None]:
# XGBM File Submission

# Predict the scaled competition test set with the fitted XGBoost model
test_pred = xgb_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/XGB.csv'
subm.to_csv(filename, index=False)

In [None]:
# LGBM File Submission

# Predict the scaled competition test set with the fitted LightGBM model
test_pred = lgb_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/LGBM.csv'
subm.to_csv(filename, index=False)

In [None]:
# CatBoost File Submission

# Predict the scaled competition test set with the fitted CatBoost model
test_pred = cat_fit.predict(test_v)

# Build the single-column 'UnitPrice' frame in one step and write it out
subm = pd.DataFrame({'UnitPrice': test_pred}, columns=['UnitPrice'])

filename = '/content/gdrive/My Drive/Machine Hack - Great Indian Hiring Hackathon/Participants_Data_TGIH/CATBOOST.csv'
subm.to_csv(filename, index=False)