In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv',
        "/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv",
    )
)

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

# EDA

In [None]:
# Column dtypes of the training set.
train_data.dtypes

In [None]:
# Per-column summary of the training set as reported by DataFrame.info().
train_data.info()

In [None]:
# Summary statistics of the target variable.
target = train_data['total_amount']
print(target.describe())

# Histogram of the target with the most populated bin highlighted and annotated.
import matplotlib.pyplot as plt

plt.style.use('ggplot')
fig, ax = plt.subplots()
counts, edges, bars = ax.hist(target, bins=70, density=False)
peak = counts.argmax()
bars[peak].set_facecolor('blue')
# Lower and upper edge of the fullest bin, rounded for display.
class_range = (round(edges[peak], 2), round(edges[peak + 1], 2))
ax.text(edges[peak], counts[peak], f'Max: {counts[peak]:.2f}', ha='center', va='bottom')
ax.text(edges[peak], counts[peak], f'Class Range: {class_range}', ha='left', va='top')
ax.set_xlabel('Total Amount')
ax.set_ylabel('Frequency')
ax.set_title("Histogram for total amount")
plt.show()



**Light data preparation**

In [None]:
 # Expand both trip timestamps into calendar/time features (pickup first, then dropoff).
 for prefix in ("pickup", "dropoff"):
     stamp_col = f"tpep_{prefix}_datetime"
     train_data[stamp_col] = pd.to_datetime(train_data[stamp_col])
     parts = train_data[stamp_col].dt
     train_data[f"{prefix}_year"] = parts.year
     train_data[f"{prefix}_month"] = parts.month
     train_data[f"{prefix}_day_of_week"] = parts.dayofweek
     train_data[f"{prefix}_day"] = parts.day
     train_data[f"{prefix}_hour"] = parts.hour
     train_data[f"{prefix}_minute"] = parts.minute

 # The raw timestamp columns are no longer needed once the parts are extracted.
 train_data = train_data.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

 # Move the target to the last column.
 column_order = [name for name in train_data.columns if name != "total_amount"]
 column_order.append("total_amount")
 train_data = train_data[column_order]


In [None]:
# Number of distinct (non-null) values per column of the training frame.
for column_name, distinct_count in train_data.nunique().items():
    print(column_name, "->", distinct_count, "unique values")

**Is Payment Type Necessary?**

In [None]:
# Keep only rows that have a payment type, then plot it against the target.
train_data_test = train_data.dropna(subset=['payment_type'])

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(train_data_test['payment_type'], train_data_test['total_amount'])
ax.set(xlabel="Payment Type", ylabel="Total Amount")
plt.show()

It looks like payment type is not an important variable. So I'll drop it.

In [None]:
# Remove the payment type column from the training frame.
train_data = train_data.drop(columns='payment_type')

**What is store_and_fwd_flag?**

*The store_and_fwd_flag column indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server*

In [None]:
# Keep only rows that have a store-and-forward flag, then plot it against the target.
train_data_test = train_data.dropna(subset=['store_and_fwd_flag'])

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(train_data_test['store_and_fwd_flag'], train_data_test['total_amount'])
ax.set(xlabel="Store and Fwd Flag", ylabel="Total Amount")
plt.show()

There seems to be some influence of this variable on Total Amount

**Some more data preprocessing**

In [None]:
# Partition the columns by dtype: object columns are categorical, everything else numerical.
is_object_column = {
    name: train_data[name].dtype == "object" for name in train_data.columns
}
categorical_list = [name for name, is_object in is_object_column.items() if is_object]
numerical_list = [name for name, is_object in is_object_column.items() if not is_object]

print("Categorical variables:", categorical_list)
print("Numerical variables:", numerical_list)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Numerical columns: fill gaps with the column mean.
numerical_pipe = Pipeline(steps=[("imp", SimpleImputer(strategy="mean"))])

ct = ColumnTransformer(transformers=[
    ('cat_pipe', categorical_pipe, categorical_list),
    ("num_pipe", numerical_pipe, numerical_list),
])

# Fit on the full training frame (target included) and transform it in one step.
transformed_data = ct.fit_transform(train_data)

# Column names of the output: one-hot names first, then the numerical names.
fitted_encoder = ct.named_transformers_['cat_pipe'].named_steps['ohe']
columns = list(fitted_encoder.get_feature_names_out(categorical_list)) + numerical_list

transformed_df = pd.DataFrame(transformed_data, columns=columns)



**Finding Correlations**

In [None]:
correlation_matrix = transformed_df.corr()

# Correlation of every column with the target, largest value first.
correlation_with_total_amount = correlation_matrix["total_amount"]
print(correlation_with_total_amount.sort_values(ascending=False))

# Mask built from the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(correlation_matrix)
import seaborn as sns
fig, ax = plt.subplots()
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.1f',
    square=True,
    xticklabels=3,
    yticklabels=3,
    mask=mask,
    ax=ax,
)
ax.set_title('Correlation Heat Map')
plt.show()

**Dropping some columns**

In [None]:
# Drop the year, month, day and minute parts of both timestamps from the EDA frame.
transformed_df = transformed_df.drop(
    columns=[
        'pickup_month', 'pickup_year',
        'dropoff_month', 'dropoff_year',
        'pickup_minute', 'dropoff_minute',
        'pickup_day', 'dropoff_day',
    ]
)

*Now let's make a cluster map*

In [None]:
# Hierarchically clustered view of the correlations among the remaining columns.
correlation_matrix = transformed_df.corr()

cluster_grid = sns.clustermap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', xticklabels=3, yticklabels=3)
# clustermap builds its own multi-axes figure, and plt.title() would only label
# whichever Axes happens to be current; title the whole figure instead.
cluster_grid.figure.suptitle('Correlation Cluster Map', y=1.02)

plt.show()

In [None]:
# Date/time parts excluded from modelling; reused below when preparing the
# train and test frames.
dropped_columns = ['pickup_month','pickup_year',
                         'dropoff_month', 'dropoff_year',
                         'pickup_minute','dropoff_minute',
                        'pickup_day','dropoff_day']

In [None]:
# Numerical feature names for modelling: everything in `columns` except the
# dropped date parts, the target, and the one-hot flag columns.
excluded_names = set(dropped_columns) | {'total_amount', 'store_and_fwd_flag_N', 'store_and_fwd_flag_Y'}
updated_columns = [name for name in columns if name not in excluded_names]

**Removing outliers**

In [None]:
# Number of distinct (non-null) values per column of the transformed frame.
for column_name, distinct_count in transformed_df.nunique().items():
    print(column_name, "->", distinct_count, "unique values")

*Outlier detection only makes sense for continuous variables, so the categorical and ordinal columns are left out.*

In [None]:
# Columns fed to the outlier detector (and, later, to the scaler/PCA pipeline).
numerical_columns = ['trip_distance','tip_amount', 'tolls_amount']
numerical_columns

In [None]:
# Preview the first rows of the transformed EDA frame.
transformed_df.head()

In [None]:
from sklearn.ensemble import IsolationForest

# Isolation forest over the three continuous columns; contamination=0.1 sets the
# share of rows flagged as outliers.
iforest = IsolationForest(n_estimators=100, max_samples=0.5, contamination=0.1, random_state=42)
y = transformed_df['total_amount'].copy()
X = transformed_df.drop(columns='total_amount')
X_numerical = transformed_df[numerical_columns].copy()

y_pred_train = iforest.fit(X_numerical).predict(X_numerical)

# Rows predicted as 1 are kept; everything else is discarded.
keep_rows = y_pred_train == 1
df_no_outliers = X[keep_rows].copy()
total_amount = y[keep_rows]
df_no_outliers['total_amount'] = total_amount

**Some more Visualizations**

In [None]:
import matplotlib.pyplot as plt

# Trip distance against the target, outlier rows excluded.
fig, ax = plt.subplots()
ax.scatter(df_no_outliers["trip_distance"], total_amount)
ax.set(
    xlabel="Trip Distance",
    ylabel="Total Amount",
    title="Scatterplot of Total Amount and Trip Distance",
)

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Improvement surcharge against the target, using the full transformed frame.
plt.scatter(transformed_df["improvement_surcharge"], transformed_df['total_amount'])

plt.xlabel("Improvement_surcharge")
plt.ylabel("Total Amount")
# The title names the two plotted variables (no airport fee is plotted here).
plt.title("Scatterplot of Total Amount and Improvement Surcharge")

plt.show()

# Data Preparation

**Dealing with Time in test.csv**

In [None]:
 # Expand both trip timestamps of the test set into the same calendar/time
 # features created for the training set (pickup first, then dropoff).
 for prefix in ("pickup", "dropoff"):
     stamp_col = f"tpep_{prefix}_datetime"
     test_data[stamp_col] = pd.to_datetime(test_data[stamp_col])
     parts = test_data[stamp_col].dt
     test_data[f"{prefix}_year"] = parts.year
     test_data[f"{prefix}_month"] = parts.month
     test_data[f"{prefix}_day_of_week"] = parts.dayofweek
     test_data[f"{prefix}_day"] = parts.day
     test_data[f"{prefix}_hour"] = parts.hour
     test_data[f"{prefix}_minute"] = parts.minute

 # The raw timestamp columns are no longer needed once the parts are extracted.
 test_data = test_data.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])


**Dropping Columns**

In [None]:
# Copy of the test frame without the payment type column.
test_data_new = test_data.drop(columns='payment_type')


In [None]:
# Remove the unused date parts from both frames.
train_data_new = train_data.drop(dropped_columns, axis=1)
# Build the test frame from `test_data` with payment_type removed as well, so it
# mirrors the training frame (which lost payment_type earlier) instead of
# silently reintroducing that column.
test_data_new = test_data.drop(columns=['payment_type', *dropped_columns])

In [None]:
# Preview the first rows of the reduced training frame.
train_data_new.head()

**Train-test Split**

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the features, then hold out 20% of the rows
# (fixed seed) for evaluation.
y = train_data_new['total_amount'].copy()
X = train_data_new.drop(columns="total_amount")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Column Transformer**

In [None]:
# Columns routed to the categorical and numerical branches of the transformer.
print(f"Categorical list: {categorical_list}")
print(f"Numerical list: {updated_columns}")

In [None]:
# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Numerical branch: mean imputation only.
numerical_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="mean")),
])

# Modelling transformer: categorical columns first, then the selected numerical ones.
ct_2 = ColumnTransformer(transformers=[
    ('cat_pipe', categorical_pipe, categorical_list),
    ("num_pipe", numerical_pipe, updated_columns),
])



In [None]:
# Fit the imputers and the encoder on the training split only.
ct_2.fit(X_train)

In [None]:
# Apply the fitted transformer to the training split, the hold-out split and
# the competition test set.
X_train_trf, X_test_trf, test_data_trf = (
    ct_2.transform(frame) for frame in (X_train, X_test, test_data)
)

In [None]:
# Column names of the transformed arrays: one-hot names first, then the numerical names.
fitted_encoder = ct_2.named_transformers_['cat_pipe'].named_steps['ohe']
columns = [*fitted_encoder.get_feature_names_out(categorical_list), *updated_columns]

# Wrap each transformed array in a DataFrame carrying those names.
X_train_trf_df, X_test_trf_df, test_data_trf_df = (
    pd.DataFrame(array, columns=columns)
    for array in (X_train_trf, X_test_trf, test_data_trf)
)

**Removing Outliers**

In [None]:
from sklearn.ensemble import IsolationForest

# Isolation forest fitted on the continuous columns of the training split only;
# contamination=0.05 sets the share of rows flagged as outliers.
iforest = IsolationForest(n_estimators=100, max_samples=0.5, contamination=0.05, random_state=42)
X_numerical = X_train_trf_df[numerical_columns].copy()
# `y` is intentionally left alone here: the target for these rows is `y_train`,
# not the target column of the unrelated EDA frame.

iforest.fit(X_numerical)

y_pred_train = iforest.predict(X_numerical)

# Keep the rows predicted as 1; the same boolean mask is applied positionally to
# the features and to the target so the two stay row-aligned.
inlier_mask = y_pred_train == 1
X_train_df_no_outliers = X_train_trf_df[inlier_mask]
y_train_final = y_train[inlier_mask]

**Scaling and PCA**

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA


# Robust-scale the continuous columns (all other columns are not passed on),
# then keep enough principal components to explain 95% of the variance.
ct_3 = ColumnTransformer(
    transformers=[("scaler", RobustScaler(), numerical_columns)],
)
pipe_2 = Pipeline(
    steps=[
        ("scaler_col_trf", ct_3),
        ("pca", PCA(n_components=0.95)),
    ],
)

In [None]:
# Fit the scaler and PCA on the outlier-free training rows.
pipe_2.fit(X_train_df_no_outliers)

In [None]:
# Project the three data sets with the fitted scaler + PCA pipeline.
X_train_final, X_test_final, test_data_final = (
    pipe_2.transform(frame)
    for frame in (X_train_df_no_outliers, X_test_trf_df, test_data_trf_df)
)

**We'll observe how the models perform**

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score

# Score book filled in by the model sections below:
#   metric['r2' | 'MSE']['train' | 'test'][model name] -> score
#   metric['CV']['score_mean' | 'score_stand_dev'][model name] -> statistic
metric = {score_name: {'train': {}, 'test': {}} for score_name in ('r2', 'MSE')}
metric['CV'] = {'score_mean': {}, 'score_stand_dev': {}}

For each model:

    R2
    MSE
    CV score

# Dummy Model Submission

In [None]:
from sklearn.dummy import DummyRegressor
# Baseline: DummyRegressor with default settings, fitted on the training split.
model = DummyRegressor()
model.fit(X_train_trf, y_train)

**R2 and MSE**

In [None]:
# Score the baseline once per split and reuse the predictions for both metrics.
pred_train = model.predict(X_train_trf)
pred_test = model.predict(X_test_trf)

r2_train = r2_score(y_train, pred_train)
r2_test = r2_score(y_test, pred_test)
print("R2 score in train and test:", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record the baseline under its own name so it is not mistaken for (and later
# overwritten by) the Linear Regression entry. R2 and MSE are stored together
# so both score tables list the same number of models.
metric['r2']['train']['Dummy Regressor'] = r2_train
metric['r2']['test']['Dummy Regressor'] = r2_test
metric['MSE']['train']['Dummy Regressor'] = MSE_train
metric['MSE']['test']['Dummy Regressor'] = MSE_test

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of `model` on the training split.
cv_scores = cross_val_score(model, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

**Submission Code**

In [None]:
# y_pred = model.predict( test_data)
# y_pred[:5]
# submission = pd.DataFrame(columns = ["ID","total_amount"])
# submission["ID"] = [i for i in range(1,len(y_pred)+1)]
# submission["total_amount"] = y_pred
# submission.to_csv('submission.csv',index=False)

In [None]:
# submission.head()

In [None]:
# import csv
# with open('submission.csv', 'r') as file:
#     csv_reader = csv.reader(file)
#     for line in csv_reader:
#         print(line)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression


# Ordinary least-squares regression on the transformed training split.
model = LinearRegression()
model.fit(X_train_trf, y_train)

**R2 Score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = model.predict(X_train_trf)
pred_test = model.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record all four scores for the model comparison.
metric['r2']['train']['Linear Regression'] = r2_train
metric['r2']['test']['Linear Regression'] = r2_test
metric['MSE']['train']['Linear Regression'] = MSE_train
metric['MSE']['test']['Linear Regression'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of `model` on the training split.
cv_scores = cross_val_score(model, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['Linear Regression'] = mean
metric['CV']['score_stand_dev']['Linear Regression'] = stand_dev

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial expansion followed by a linear model trained with SGD.
poly_features = PolynomialFeatures(degree=2)
# SGD is sensitive to feature scale, and squared/interaction terms of raw
# features span very different magnitudes, so standardise them before fitting.
feature_scaler = StandardScaler()
# Fixed seed so the fit is reproducible across runs.
linear_regression = SGDRegressor(random_state=42)
model = make_pipeline(poly_features, feature_scaler, linear_regression)

model.fit(X_train_trf, y_train)


**r2 Score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = model.predict(X_train_trf)
pred_test = model.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

# metric['r2']['train']['Polynomial Regression'] = r2_train
# metric['r2']['test']['Polynomial Regression'] = r2_test

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# metric['MSE']['train']['Polynomial Regression'] = MSE_train
# metric['MSE']['test']['Polynomial Regression'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of `model` on the training split.
cv_scores = cross_val_score(model, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

# metric['CV']['score_mean']['Polynomial Regression'] = mean
# metric['CV']['score_stand_dev']['Polynomial Regression'] = stand_dev

# KNN as a Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using the k closest training rows.
k = 3
model = KNeighborsRegressor(n_neighbors=k)
model.fit(X_train_trf, y_train)

**R2 score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = model.predict(X_train_trf)
pred_test = model.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record all four scores for the model comparison.
metric['r2']['train']['KNN Regression'] = r2_train
metric['r2']['test']['KNN Regression'] = r2_test
metric['MSE']['train']['KNN Regression'] = MSE_train
metric['MSE']['test']['KNN Regression'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of `model` on the training split.
cv_scores = cross_val_score(model, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['KNN Regression'] = mean
metric['CV']['score_stand_dev']['KNN Regression'] = stand_dev

**Grid Search CV**

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {'n_neighbors': [1,2, 3, 5, 7, 9]}
# grid_search = GridSearchCV(model, param_grid, cv=5)

# grid_search.fit(X_train_trf, y_train)

# best_k = grid_search.best_params_['n_neighbors']

# print("Best k for KNN", best_k)

# Support Vector Regression

In [None]:
# from sklearn.svm import SVR
# model = SVR(kernel='linear')
# model.fit(X_train_trf, y_train)

**r2 score and MSE**

In [None]:
# r2_train = r2_score(y_train, model.predict(X_train_trf))
# r2_test = r2_score(y_test, model.predict(X_test_trf))
# print("R2 score on train and test", (r2_train, r2_test))

# metric['r2']['train']['SVRegression'] = r2_train
# metric['r2']['test']['SvRegression'] = r2_test

# MSE_train = mean_squared_error(y_train, model.predict(X_train_trf))
# MSE_test = mean_squared_error(y_test, model.predict(X_test_trf))

# metric['MSE']['train']['SVRegression'] = r2_train
# metric['MSE']['test']['SVRegression'] = r2_test

# print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# cv_scores = cross_val_score(model, cv=4, X=X_train_trf, y=y_train, scoring="r2")
# mean = cv_scores.mean()
# stand_dev = cv_scores.std()

# print("Score mean:", mean)
# print("Score variance:", stand_dev)

# metric['CV']['score_mean']['SVRegression'] = mean
# metric['CV']['score_stand_dev']['SVRegression'] = stand_dev

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with a fixed seed and otherwise default settings.
model = DecisionTreeRegressor(random_state=3)
model.fit(X_train_trf,y_train)


**r2 score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = model.predict(X_train_trf)
pred_test = model.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

# Only the R2 scores are recorded for this model.
metric['r2']['train']['Decision Tree'] = r2_train
metric['r2']['test']['Decision Tree'] = r2_test

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# metric['MSE']['train']['Decision Tree'] = MSE_train
# metric['MSE']['test']['Decision Tree'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of `model` on the training split.
cv_scores = cross_val_score(model, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['Decision Tree'] = mean
metric['CV']['score_stand_dev']['Decision Tree'] = stand_dev

**Grid Search CV**

In [None]:
# from sklearn.model_selection import GridSearchCV

# rgr = DecisionTreeRegressor(random_state=3)

# param_grid = {
#     "max_depth": [2, 4, 6, 8, 10],
#     "min_samples_split": [2, 5, 10, 20],
#     "criterion": ["squared_error", "absolute_error"],
#     "max_features": [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
# }

# grid_search = GridSearchCV(rgr, param_grid, cv=5, scoring="r2")

# # fit the grid search on the training data
# grid_search.fit(X_train_trf, y_train)

# # print the best parameters and score
# print("Best parameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)


In [None]:
# r2_score(y_train, grid_search.best_estimator_.predict(X_train_trf)), r2_score(y_test, grid_search.best_estimator_.predict(X_test_trf))

# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: 100 trees, depth capped at 10, squared-error criterion, fixed seed.
rfr = RandomForestRegressor(n_estimators = 100, criterion = "squared_error", max_depth = 10, random_state = 42)
rfr.fit(X_train_trf, y_train)


**R2 and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = rfr.predict(X_train_trf)
pred_test = rfr.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record all four scores for the model comparison.
metric['r2']['train']['Random Forest'] = r2_train
metric['r2']['test']['Random Forest'] = r2_test
metric['MSE']['train']['Random Forest'] = MSE_train
metric['MSE']['test']['Random Forest'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of the random forest on the training split.
cv_scores = cross_val_score(rfr, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['Random Forest'] = mean
metric['CV']['score_stand_dev']['Random Forest'] = stand_dev

**Submission**

In [None]:
# y_pred = rfr.predict(test_data_trf)

# submission = pd.DataFrame(columns = ["ID","total_amount"])
# submission["ID"] = [i for i in range(1,len(y_pred)+1)]
# submission["total_amount"] = y_pred

# submission.to_csv('submission.csv',index=False)

# Bagging Regressor

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score

# Bagging ensemble with default settings and a fixed seed.
bagging = BaggingRegressor(random_state=42)

bagging.fit(X_train_trf, y_train)

**R2 and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = bagging.predict(X_train_trf)
pred_test = bagging.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record all four scores for the model comparison.
metric['r2']['train']['Bagging Regressor'] = r2_train
metric['r2']['test']['Bagging Regressor'] = r2_test
metric['MSE']['train']['Bagging Regressor'] = MSE_train
metric['MSE']['test']['Bagging Regressor'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validate**

In [None]:
# Cross-validated R2 (cv=4) of the bagging ensemble on the training split.
cv_scores = cross_val_score(bagging, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['Bagging Regressor'] = mean
metric['CV']['score_stand_dev']['Bagging Regressor'] = stand_dev

**Submission**

In [None]:
# y_pred = bagging.predict(test_data_trf)

# submission = pd.DataFrame(columns = ["ID","total_amount"])
# submission["ID"] = [i for i in range(1,len(y_pred)+1)]
# submission["total_amount"] = y_pred

# submission.to_csv('submission.csv',index=False)

# Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting: 100 boosting stages, learning rate 0.1, fixed seed.
gradient_boost = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gradient_boost.fit(X_train_trf, y_train)

**r2 score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = gradient_boost.predict(X_train_trf)
pred_test = gradient_boost.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record all four scores for the model comparison.
metric['r2']['train']['Gradient Boost'] = r2_train
metric['r2']['test']['Gradient Boost'] = r2_test
metric['MSE']['train']['Gradient Boost'] = MSE_train
metric['MSE']['test']['Gradient Boost'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of the gradient boosting model on the training split.
cv_scores = cross_val_score(gradient_boost, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['Gradient Boost'] = mean
metric['CV']['score_stand_dev']['Gradient Boost'] = stand_dev

**Submission Code**

In [None]:
# y_pred = gradient_boost.predict(test_data_trf)

# submission = pd.DataFrame(columns = ["ID","total_amount"])
# submission["ID"] = [i for i in range(1,len(y_pred)+1)]
# submission["total_amount"] = y_pred

# submission.to_csv('submission.csv',index=False)

# XGBoost

In [None]:
from xgboost import XGBRegressor
# XGBoost regressor: 100 boosting rounds, fixed seed; also used for the final submission.
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train_trf, y_train)

**r2 score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = xgb.predict(X_train_trf)
pred_test = xgb.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Record all four scores for the model comparison.
metric['r2']['train']['XGBoost'] = r2_train
metric['r2']['test']['XGBoost'] = r2_test
metric['MSE']['train']['XGBoost'] = MSE_train
metric['MSE']['test']['XGBoost'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

*Output recorded from an earlier run (presumably the R2 scores on train and test):* (0.9775557906761233, 0.9596372533370908)

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of the XGBoost model on the training split.
cv_scores = cross_val_score(xgb, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

metric['CV']['score_mean']['XGBoost'] = mean
metric['CV']['score_stand_dev']['XGBoost'] = stand_dev

**GridSearchCV**

In [None]:
# from sklearn.model_selection import GridSearchCV 

# param_grid = {
#     'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
#     'max_depth': [3, 4, 5,6,7,8],
#     "min_child_weight": [1, 3, 5, 7],
#     "gamma": [0.0, 0.1, 0.2]
# }

# grid_search = GridSearchCV(xgb, param_grid, cv=4, scoring='r2', verbose=1)

# grid_search.fit(X_train_trf, y_train)

In [None]:
# r2_score(y_train, grid_search.best_estimator_.predict(X_train_trf)), r2_score(y_test, grid_search.best_estimator_.predict(X_test_trf))

**Submission**

In [None]:
# y_pred = xgb.predict(test_data_trf)

# submission = pd.DataFrame(columns = ["ID","total_amount"])
# submission["ID"] = [i for i in range(1,len(y_pred)+1)]
# submission["total_amount"] = y_pred

# submission.to_csv('submission.csv',index=False)

# Multi-layer Perceptron

In [None]:
from sklearn.neural_network import MLPRegressor
# Multi-layer perceptron with two hidden layers (100 and 50 units), up to
# 1000 iterations, fixed seed.
perceptron_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

perceptron_regressor.fit(X_train_trf, y_train)

**r2 score and MSE**

In [None]:
# Predict once per split; both metrics are computed from the same predictions.
pred_train = perceptron_regressor.predict(X_train_trf)
pred_test = perceptron_regressor.predict(X_test_trf)

r2_train, r2_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
print("R2 score on train and test", (r2_train, r2_test))

# metric['r2']['train']['Multi-Layer Perceptron'] = r2_train
# metric['r2']['test']['Multi-Layer Perceptron'] = r2_test

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Only the MSE scores are recorded for this model.
metric['MSE']['train']['Multi-Layer Perceptron'] = MSE_train
metric['MSE']['test']['Multi-Layer Perceptron'] = MSE_test

print("MSE score on train and test", (MSE_train, MSE_test))

**Cross Validation**

In [None]:
# Cross-validated R2 (cv=4) of the multi-layer perceptron on the training split.
cv_scores = cross_val_score(perceptron_regressor, cv=4, X=X_train_trf, y=y_train, scoring="r2")
mean = cv_scores.mean()
stand_dev = cv_scores.std()

print("Score mean:", mean)
# cv_scores.std() is a standard deviation, so it is labelled as one.
print("Score standard deviation:", stand_dev)

# metric['CV']['score_mean']['Multi-Layer Perceptron'] = mean
# metric['CV']['score_stand_dev']['Multi-Layer Perceptron'] = stand_dev

**Submission**

In [None]:
# y_pred = perceptron_regressor.predict(test_data_trf)

# submission = pd.DataFrame(columns = ["ID","total_amount"])
# submission["ID"] = [i for i in range(1,len(y_pred)+1)]
# submission["total_amount"] = y_pred

# submission.to_csv('submission.csv',index=False)

# Comparing the Models

**R2 Scores**

*R2 train scores*

In [None]:
# One point per model: R2 on the training split. Spaces in model names become
# line breaks so the tick labels stay narrow.
scores_by_model = metric['r2']['train']
labels = [name.replace(" ", "\n") for name in scores_by_model]
scores = list(scores_by_model.values())

fig, ax = plt.subplots()
ax.scatter(labels, scores)
ax.set(
    xlabel="Models",
    ylabel="R2 train scores",
    title="R2 train scores for different models",
)
plt.show()

*R2 test scores*

In [None]:
# One point per model: R2 on the hold-out split. Spaces in model names become
# line breaks so the tick labels stay narrow.
scores_by_model = metric['r2']['test']
labels = [name.replace(" ", "\n") for name in scores_by_model]
scores = list(scores_by_model.values())

fig, ax = plt.subplots()
ax.scatter(labels, scores)
ax.set(
    xlabel="Models",
    ylabel="R2 test scores",
    title="Scatter plot of R2 test scores for different models",
)
plt.show()

**MSE Scores**

*MSE train scores*

In [None]:
# One point per model: MSE on the training split. Labels and values are read
# from the same dictionary so every point shows the MSE of the model it is
# labelled with.
mse_train_by_model = metric['MSE']['train']
models = [name.replace(" ", "\n") for name in mse_train_by_model]
train_scores = list(mse_train_by_model.values())

plt.scatter(models, train_scores)
plt.xlabel("Models")
plt.ylabel("MSE train scores")
plt.title("MSE train scores for different models")
plt.show()

*MSE Test Scores*

In [None]:
# One point per model: MSE on the hold-out split. Spaces in model names become
# line breaks so the tick labels stay narrow.
scores_by_model = metric['MSE']['test']
labels = [name.replace(" ", "\n") for name in scores_by_model]
scores = list(scores_by_model.values())

fig, ax = plt.subplots()
ax.scatter(labels, scores)
ax.set(
    xlabel="Models",
    ylabel="MSE test scores",
    title="MSE test scores for different models",
)
plt.show()

**Cross Validation Scores**


*Mean Scores*

In [None]:
# One point per model: mean of the cross-validation scores. Spaces in model
# names become line breaks so the tick labels stay narrow.
scores_by_model = metric['CV']['score_mean']
labels = [name.replace(" ", "\n") for name in scores_by_model]
scores = list(scores_by_model.values())

fig, ax = plt.subplots()
ax.scatter(labels, scores)
ax.set(
    xlabel="Models",
    ylabel="CV Scores Means",
    title="Means of CV scores for different models",
)
plt.show()

*Standard Deviation of CV Scores*

In [None]:
# One point per model: standard deviation of the cross-validation scores.
# Spaces in model names become line breaks so the tick labels stay narrow.
scores_by_model = metric['CV']['score_stand_dev']
labels = [name.replace(" ", "\n") for name in scores_by_model]
scores = list(scores_by_model.values())

fig, ax = plt.subplots()
ax.scatter(labels, scores)
ax.set(
    xlabel="Models",
    ylabel="CV Scores Standard Deviations",
    title="Standard Deviations of CV scores for different models",
)
plt.show()

# Final Submission

In [None]:
# Predict the competition test set with the XGBoost model.
y_pred = xgb.predict(test_data_trf)

# Submission file: 1-based row IDs alongside the predicted total amounts.
submission = pd.DataFrame({
    "ID": list(range(1, len(y_pred) + 1)),
    "total_amount": y_pred,
})

submission.to_csv('submission.csv', index=False)