## Importing relevant libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings as w
w.filterwarnings(action='ignore')

## Loading the datasets

In [None]:
train_df = pd.read_csv('train_F3fUq2S.csv')
test_df = pd.read_csv('test_Bk2wfZ3.csv')

## Inspecting the datasets

In [None]:
train_df.shape, test_df.shape

#### * __We have 1888 rows and 22 columns in Train set whereas Test set has 762 rows and 21 columns.__

In [None]:
# Ratio of null values of training set
train_df.isnull().sum()/train_df.shape[0]*100

In [None]:
# Ratio of null values of testing set
test_df.isnull().sum()/test_df.shape[0]*100

In [None]:
train_df.info()

In [None]:
test_df.info()

### Remark
* All the features have either int or float datatype except 'times_of_day' feature, which has object datatype
* Neither the training nor the testing dataset has any null values


## EDA

In [None]:
train_df.head(3)

In [None]:
test_df.head(3)

In [None]:
train_df.columns


In [None]:
test_df.columns

In [None]:
train_df.drop_duplicates().shape

In [None]:
test_df.drop_duplicates().shape

In [None]:

plt.hist(train_df['click_rate'])
plt.title("Histogram of original target", size=17)
plt.xlabel("Click rate", size=13)
plt.ylabel("Frequencies", size=13)
plt.show()

### Encoding 'times_of_day' as integers (Morning=1, Noon=2, Evening=3) for error-free visualization

In [None]:
# Encode the time-of-day labels as integers (Morning=1, Noon=2, Evening=3).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on a column selection: that form is chained
# assignment and is not guaranteed to write through to the parent DataFrame.
times_of_day_codes = {'Morning': 1, 'Noon': 2, 'Evening': 3}
train_df['times_of_day'] = train_df['times_of_day'].replace(times_of_day_codes)
test_df['times_of_day'] = test_df['times_of_day'].replace(times_of_day_codes)

In [None]:
train_df.head(2)

## Visualisation of the data set

### Plotting Histogram of each features

In [None]:
# Plot the distribution of every training column on a grid whose number of rows
# is derived from the column count, so the grid can never run out of slots
# (a fixed 5x4 grid only holds 20 plots).
n_grid_cols = 4
n_grid_rows = -(-len(train_df.columns) // n_grid_cols)  # ceiling division
plt.figure(figsize=(18, 3 * n_grid_rows))
for n, column in enumerate(train_df.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, n)
    sns.distplot(train_df[column], hist=True, kde=True,
                 color='darkblue',
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'linewidth': 1})

# tight_layout has to be *called*; referencing it without parentheses does
# nothing. Calling it once after all axes exist is sufficient.
plt.tight_layout()
plt.show()


### Remark
* From the above plots it seems that features given below should be analyzed deeply
1. is_price
2. is_timer
3. is_discount
4. is_personalised
5. is_urgency
6. is_emoticons 


### is_price


In [None]:
train_df['is_price'].value_counts()

In [None]:
plt.hist(train_df['is_price'])
plt.show()

In [None]:
test_df['is_price'].value_counts()

In [None]:
plt.hist(test_df['is_price'])
plt.show()

##

### is_timer

In [None]:
train_df['is_timer'].value_counts()

In [None]:
test_df['is_timer'].value_counts()

### is_discount

In [None]:
train_df['is_discount'].value_counts()

In [None]:
test_df['is_discount'].value_counts()

### is_personalised

In [None]:
train_df['is_personalised'].value_counts()

In [None]:
test_df['is_personalised'].value_counts()

###  is_urgency 

In [None]:
train_df['is_urgency'].value_counts()

In [None]:
test_df['is_urgency'].value_counts()

### is_emoticons

In [None]:
train_df['is_emoticons'].value_counts()

In [None]:
test_df['is_emoticons'].value_counts()

### Remark :

* `is_price`, `is_timer`: these columns will be dropped.
* The other features need further analysis.



## Feature engineering

In [None]:
train_df_new = train_df.copy()
test_df_new = test_df.copy()

In [None]:
train_df['sender'].value_counts()

In [None]:
train_df['sender'].describe()

In [None]:
train_df_new['sender_map'] = 0
test_df_new['sender_map'] = 0

In [None]:
train_df_new.loc[train_df_new['sender'] != 3, 'sender_map'] = 1
test_df_new.loc[test_df_new['sender'] != 3, 'sender_map'] = 1


In [None]:
columns_deleted = []
columns_deleted.extend(['sender'])

In [None]:
train_df_new['subject_len'].describe()

In [None]:
train_df_new['subject_len'].value_counts()

In [None]:
def subject_map(series):
    """Bucket a single subject length into an ordinal bin (1-4).

    Bins: <73 -> 1, [73, 137) -> 2, [137, 201) -> 3, >=201 -> 4.
    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    if series < 73:
        return 1
    if series < 137:
        return 2
    if series < 201:
        return 3
    if series >= 201:
        return 4
    return None

In [None]:
train_df_new['subject_len_map'] = train_df_new['subject_len'].apply(subject_map)
test_df_new['subject_len_map'] = test_df_new['subject_len'].apply(subject_map)

In [None]:
train_df_new['body_len'].describe()

In [None]:
train_df_new['body_len'].value_counts()

In [None]:
def body_map(series):
    """Bucket a single body length into an ordinal bin (1-4).

    Bins: <9554 -> 1, [9554, 12689) -> 2, [12689, 17351) -> 3, >=17351 -> 4.
    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    # (exclusive upper bound, bin label) pairs, checked in ascending order.
    for upper_bound, label in ((9554, 1), (12689, 2), (17351, 3)):
        if series < upper_bound:
            return label
    if series >= 17351:
        return 4
    return None

In [None]:
train_df_new['body_len_map'] = train_df_new['body_len'].apply(body_map)
test_df_new['body_len_map'] = test_df_new['body_len'].apply(body_map)

In [None]:
train_df_new['mean_paragraph_len'].describe()

In [None]:
train_df_new['mean_paragraph_len'].value_counts()

In [None]:
def mean_paragraph_len_map(series):
    """Bucket a single mean paragraph length into an ordinal bin (1-4).

    Bins: <74 -> 1, [74, 144) -> 2, [144, 214) -> 3, >=214 -> 4.
    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    if series < 74:
        return 1
    if series < 144:
        return 2
    if series < 214:
        return 3
    if series >= 214:
        return 4
    return None

In [None]:
train_df_new['mean_paragraph_len_map'] = train_df_new['mean_paragraph_len'].apply(mean_paragraph_len_map)
test_df_new['mean_paragraph_len_map'] = test_df_new['mean_paragraph_len'].apply(mean_paragraph_len_map)

In [None]:
# New feature total_num_of_paragraph: body length divided by mean paragraph length,
# computed identically for the train and test frames.
# NOTE(review): assumes mean_paragraph_len is never 0; a zero would produce
# inf/NaN values here — TODO confirm against the data.
train_df_new['total_num_of_paragraph'] = train_df_new['body_len'] / train_df_new['mean_paragraph_len']
test_df_new['total_num_of_paragraph'] = test_df_new['body_len'] / test_df_new['mean_paragraph_len']

In [None]:
train_df_new['total_length'] = train_df_new['subject_len'] + train_df_new['body_len']
test_df_new['total_length'] = test_df_new['subject_len'] + test_df_new['body_len']

In [None]:
train_df_new = train_df_new.join(pd.get_dummies(train_df_new['times_of_day'], prefix='times_of_day'))
test_df_new = test_df_new.join(pd.get_dummies(test_df_new['times_of_day'], prefix='times_of_day'))

In [None]:
columns_deleted.extend(['times_of_day'])

In [None]:
train_df_new['total_length_cta'] = train_df_new['no_of_CTA'] * train_df_new['mean_CTA_len']
test_df_new['total_length_cta'] = test_df_new['no_of_CTA'] * test_df_new['mean_CTA_len']

In [None]:
def no_of_CTA_map(series):
    """Bucket a single CTA count into an ordinal bin (1-4).

    Bins: <12 -> 1, [12, 24) -> 2, [24, 36) -> 3, >=36 -> 4.
    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    # (exclusive upper bound, bin label) pairs, checked in ascending order.
    for upper_bound, label in ((12, 1), (24, 2), (36, 3)):
        if series < upper_bound:
            return label
    if series >= 36:
        return 4
    return None

In [None]:
train_df_new['no_of_CTA_bin'] = train_df_new['no_of_CTA'].apply(no_of_CTA_map)
test_df_new['no_of_CTA_bin'] = test_df_new['no_of_CTA'].apply(no_of_CTA_map)

In [None]:
train_df_new['mean_CTA_len'].describe()

In [None]:
def mean_CTA_len_map(series):
    """Bucket a single mean CTA length into an ordinal bin (1-3).

    Bins: <40 -> 1, [40, 120) -> 2, >=120 -> 3.
    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    if series < 40:
        return 1
    if series < 120:
        return 2
    if series >= 120:
        return 3
    return None

In [None]:
train_df_new['mean_CTA_len_bin'] = train_df_new['mean_CTA_len'].apply(mean_CTA_len_map)
test_df_new['mean_CTA_len_bin'] = test_df_new['mean_CTA_len'].apply(mean_CTA_len_map)

In [None]:
train_df_new['is_image'].value_counts()

In [None]:
train_df_new['is_image_map'] = 0 
test_df_new['is_image_map'] = 0 

In [None]:
train_df_new.loc[train_df_new['is_image'] > 0, 'is_image_map'] = 1
test_df_new.loc[test_df_new['is_image'] > 0, 'is_image_map'] = 1

In [None]:
train_df_new['is_quote_exist'] = 0 
test_df_new['is_quote_exist'] = 0

In [None]:
train_df_new.loc[train_df_new['is_quote'] > 0, 'is_quote_exist'] = 1
test_df_new.loc[test_df_new['is_quote'] > 0, 'is_quote_exist'] = 1

In [None]:
train_df_new['is_emoticons_exist'] = 0 
test_df_new['is_emoticons_exist'] = 0 

In [None]:
train_df_new.loc[train_df_new['is_emoticons'] > 0, 'is_emoticons_exist'] = 1
test_df_new.loc[test_df_new['is_emoticons'] > 0, 'is_emoticons_exist'] = 1

In [None]:
# Train set: keep category values 15, 6, 1 and 9 as-is, collapse every other
# category into the catch-all code 300, then one-hot encode the result into
# `category_mapped_<value>` columns joined onto the frame.
train_df_new['category_mapped'] = train_df_new['category']
train_df_new.loc[~(train_df_new['category_mapped'].isin([15, 6, 1, 9])),'category_mapped'] = 300
train_df_new = train_df_new.join(pd.get_dummies(train_df_new['category_mapped'], prefix='category_mapped'))

In [None]:
# Test set: same bucketing as the train set — category values 15, 6, 1 and 9
# are kept, everything else becomes 300 — followed by one-hot encoding.
# NOTE(review): the dummies are derived from the test values only, so the
# resulting columns match the train frame only if the same levels occur in both.
test_df_new['category_mapped'] = test_df_new['category']
test_df_new.loc[~(test_df_new['category_mapped'].isin([15, 6, 1, 9])),'category_mapped'] = 300
test_df_new = test_df_new.join(pd.get_dummies(test_df_new['category_mapped'], prefix='category_mapped'))

In [None]:
# Train set: keep product values 9, 5 and 34 as-is, collapse every other
# product into the catch-all code 300, then one-hot encode the result into
# `product_mapped_<value>` columns joined onto the frame.
train_df_new['product_mapped'] = train_df_new['product']
train_df_new.loc[~(train_df_new['product_mapped'].isin([9, 5, 34])),'product_mapped'] = 300
train_df_new = train_df_new.join(pd.get_dummies(train_df_new['product_mapped'], prefix='product_mapped'))

In [None]:
# Test set: same bucketing as the train set — product values 9, 5 and 34 are
# kept, everything else becomes 300 — followed by one-hot encoding.
# NOTE(review): the dummies are derived from the test values only, so the
# resulting columns match the train frame only if the same levels occur in both.
test_df_new['product_mapped'] = test_df_new['product']
test_df_new.loc[~(test_df_new['product_mapped'].isin([9, 5, 34])),'product_mapped'] = 300
test_df_new = test_df_new.join(pd.get_dummies(test_df_new['product_mapped'], prefix='product_mapped'))

In [None]:
plt.figure(figsize=(15,8))
train_df_new.corr()['click_rate'].sort_values(ascending=False).plot(kind='bar')
plt.show()

In [None]:
columns_deleted.extend(['is_timer','is_emoticons','is_price'])

In [None]:
columns_deleted.extend(['campaign_id', 'click_rate'])

In [None]:
columns_deleted

In [None]:
# Columns removed from the training frame to form the feature matrix:
# raw columns that were re-encoded into derived features, the flags chosen for
# removal, the identifier (`campaign_id`) and the target (`click_rate`).
columns_deleted_1 = [
    'sender', 'times_of_day',
    'category', 'category_mapped',
    'product', 'product_mapped',
    'is_timer', 'is_emoticons', 'is_price',
    'campaign_id', 'click_rate',
]

In [None]:
columns_deleted_1

In [None]:
train_df_new.info()

In [None]:
X = train_df_new.drop(columns_deleted_1, axis=1)
y = train_df_new['click_rate']

In [None]:
X.shape

In [None]:
columns_deleted_1

In [None]:
# Keep the campaign ids for the submission file, then build the test feature
# matrix by dropping the same non-feature columns that were removed from the
# training frame (`click_rate` is the training target and is not dropped here).
submit_df = pd.DataFrame()
submit_df['campaign_id'] = test_df_new['campaign_id']
test_X = test_df_new.drop(columns=[
    'sender',
    'times_of_day',
    'category',
    'category_mapped',
    'product',
    'product_mapped',
    'is_timer',
    'is_emoticons',
    'is_price',
    'campaign_id',
])

# The one-hot columns were generated separately for train and test, so a level
# that appears in only one of them would leave the two matrices with different
# columns (or a different column order). Align the test matrix to the training
# features: missing dummy columns are filled with 0, extras are discarded.
test_X = test_X.reindex(columns=X.columns, fill_value=0)

In [None]:
test_X.info()

In [None]:
X.info()

## Checking Different model performance


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.15 , random_state = 0)

In [None]:
X_train.head(2)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRFRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [None]:
regressors = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), 
              ExtraTreeRegressor(),HistGradientBoostingRegressor(),XGBRFRegressor(n_estimators=500, max_depth=12, eta=0.1, subsample=0.7, colsample_bytree=0.8),CatBoostRegressor(),AdaBoostRegressor(),XGBRegressor()]
names = ['Linear_Regression', 'DecisionTree', 'Random_Forest','ExtraTree', 'HistGradientBoosting','XGboostRF','catboost','adaboost','xgboost']

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

In [None]:

# Fit every candidate regressor on the training split and record its hold-out
# metrics as [R^2, MSE], keyed by the model's display name.
model_dict = {}
for name, reg in zip(names, regressors):
    reg.fit(X_train, y_train)

    # Predict once and reuse the result for both metrics (previously the model
    # predicted twice and an unused `reg.score` value was also computed).
    holdout_pred = reg.predict(X_test)
    model_dict[name] = [
        r2_score(y_test.values, holdout_pred),
        mean_squared_error(y_test.values, holdout_pred),
    ]

In [None]:
model_df = pd.DataFrame(model_dict, index = ['R2_score','mean_squared_error']).T
model_df= model_df.sort_values(by=['R2_score', 'mean_squared_error'], ascending=False)
model_df

In [None]:
submit_df = pd.DataFrame()

In [None]:
submit_df['campaign_id'] = test_df_new['campaign_id']

## Catboost

In [None]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(loss_function='RMSE')
cat=model.fit( X, y)


In [None]:
y_train_pred_cat = cat.predict(X)
train_score = r2_score(y, y_train_pred_cat)
print(f'catboost train accuracy 'f'{train_score:.3f}')

In [None]:
from catboost import CatBoostRegressor
from catboost import Pool
train_data = Pool(data=X,
                  label=y)

In [None]:
df_feature_importance = pd.DataFrame({'feature_importance': cat.get_feature_importance(train_data), 
              'feature_names': X.columns}).sort_values(by=['feature_importance'], 
                                                       ascending=False)

plt.figure(figsize=(12, 6));
feature_plot= sns.barplot(x="feature_importance", y="feature_names", data=df_feature_importance,palette="cool")
plt.title('features importance')

## XGBoost

In [None]:
xgb = XGBRFRegressor(n_estimators=500, max_depth=12, eta=0.1, subsample=0.7, colsample_bytree=0.8)


In [None]:
xgb.fit(X, y)
y_train_pred_xgb = xgb.predict(X)
train_score = r2_score(y, y_train_pred_xgb)
print(f'xgboost train accuracy 'f'{train_score:.3f}')


In [None]:
ft = xgb.get_booster().get_score(importance_type='weight')
{k: v for k, v in sorted(ft.items(), key=lambda item: item[1])}

In [None]:
from xgboost import plot_importance
plot_importance(xgb) 
plt.show()

## Random Forest

In [None]:
rf = RandomForestRegressor()
rf.fit(X, y)
y_train_pred_rf = rf.predict(X)
train_score = r2_score(y, y_train_pred_rf)
print(f'randomforest train accuracy 'f'{train_score:.3f}')

## Ensemble of 3 Regressors

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

In [None]:
def get_oof(clf):
    """Compute out-of-fold train predictions and fold-averaged test predictions.

    Uses the notebook-level globals `x_train`, `y_train`, `x_test` and `kf`,
    which must be defined before this function is called.

    Parameters
    ----------
    clf : estimator
        Any object exposing `fit(X, y)` and `predict(X)`; it is refit on every fold.

    Returns
    -------
    tuple of numpy.ndarray
        `(oof_train, oof_test)` as column vectors of shape `(n_train, 1)` and
        `(n_test, 1)`. `oof_train` holds, for each training row, the prediction
        made by the fold model that did not see that row; `oof_test` is the mean
        of the per-fold predictions on `x_test`.
    """
    # Size the per-fold buffer from the splitter instead of hard-coding 3, so
    # changing `n_splits` on `kf` cannot overflow or leave uninitialised rows.
    n_folds = kf.get_n_splits()
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_skf = np.empty((n_folds, x_test.shape[0]))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        # `kf.split` yields positional indices, so rows are selected with
        # `.iloc`; `.loc` is label-based and only works by coincidence when the
        # frame has a default 0..n-1 index.
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
from sklearn.model_selection import KFold

In [None]:
kf = KFold(n_splits = 3, shuffle=True, random_state=10)

In [None]:
xg = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
rf = RandomForestRegressor()
cb = CatBoostRegressor()

In [None]:
# Inputs read by get_oof(): the full training features/target and the
# competition test features (copies, so the originals stay untouched).
# NOTE(review): `y_train` rebinds the name created earlier by train_test_split,
# so re-running the model-comparison cell after this one would pair the 85%
# `X_train` split with the full-length target.
x_train = X.copy()
x_test = test_X.copy()
y_train = y.copy()

In [None]:
xg_oof_train, xg_oof_test = get_oof(xg)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

In [None]:
y_pred_l = np.concatenate((xg_oof_test, rf_oof_test, cb_oof_test), axis=1)

In [None]:
y_pred = np.mean(y_pred_l,axis=1)

In [None]:
submit_df['click_rate'] = y_pred

In [None]:
submit_df[['campaign_id', 'click_rate']].to_csv('final_cat_xg.csv', index=False)