# Feature Engineering

## Import Libraries

In [None]:
from datetime import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
import pickle
# Let pandas display up to 120 columns so wide frames are not truncated in cell output
pd.set_option("display.max_columns", 120)

## Import Datasets

In [None]:
# Load the cleaned train and test CSVs; the first CSV column (the unnamed
# column written by a previous export) is used as the index of each frame.
dataset, dataset_test = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('data/cleaned_train_v2.csv', 'data/cleaned_test_v2.csv')
)

In [None]:
# Print column dtypes, non-null counts and memory usage for the cleaned train dataset
dataset.info()

In [None]:
# Print column dtypes, non-null counts and memory usage for the cleaned test dataset
dataset_test.info()

### Dealing with POSIX time

In [None]:
# Convert the POSIX timestamps (seconds) in the visitStartTime column to
# 'YYYY-MM-DD' strings stored in the date column, then drop visitStartTime.
train_visit_start = pd.to_datetime(dataset['visitStartTime'], unit='s')
dataset['date'] = train_visit_start.dt.strftime('%Y-%m-%d')
dataset = dataset.drop(columns='visitStartTime')

# Same conversion for the test dataset.
test_visit_start = pd.to_datetime(dataset_test['visitStartTime'], unit='s')
dataset_test['date'] = test_visit_start.dt.strftime('%Y-%m-%d')
dataset_test = dataset_test.drop(columns='visitStartTime')

In [None]:
# Derive separate Week / Year / Month columns from the 'date' string column.
# The dates are parsed once per frame instead of once per derived column.
train_dates = pd.to_datetime(dataset['date'])
dataset = dataset.assign(
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the documented replacement. It returns UInt32,
    # so cast back to int64, the dtype .dt.week used to produce
    # (assumes every date parses, i.e. there are no NaT values).
    Week=train_dates.dt.isocalendar().week.astype('int64'),
    Year=train_dates.dt.year,
    Month=train_dates.dt.month,
)
print(f'Start of year: {dataset.Year.min()}')
print(f'End of year: {dataset.Year.max()}')

# Same derived columns for the test dataset.
test_dates = pd.to_datetime(dataset_test['date'])
dataset_test = dataset_test.assign(
    Week=test_dates.dt.isocalendar().week.astype('int64'),
    Year=test_dates.dt.year,
    Month=test_dates.dt.month,
)
print(f'Start of year: {dataset_test.Year.min()}')
print(f'End of year: {dataset_test.Year.max()}')


In [None]:
# Preview the first rows of the train dataset, now including the Week/Year/Month columns
dataset.head()

In [None]:
# Preview the first rows of the test dataset, now including the Week/Year/Month columns
dataset_test.head()

## Categorical Features

In [None]:
# Categorical features: every object- or bool-typed train column whose name
# does not start with 'total'.
categorical_cols = [
    name
    for name in dataset.columns
    if dataset[name].dtype in ('object', 'bool') and not name.startswith('total')
]

### Removing Customers ID and Date from Categorical Columns

In [None]:
# The visitor identifier and the date string are not treated as categorical
# features, so take them out of the list (raises ValueError if either is absent).
for excluded_col in ('fullVisitorId', 'date'):
    categorical_cols.remove(excluded_col)

In [None]:
# Display the categorical feature names that remain after the removals
categorical_cols

## Numerical Features

In [None]:
# Numerical features: every train column whose dtype is neither object nor bool.
numerical_cols = [
    name
    for name in dataset.columns
    if dataset[name].dtype not in ['object', 'bool']
]

In [None]:
# Display the numerical feature names collected from the train dataset
numerical_cols

### Removing Year, Visit ID and Transaction Revenue from the Numerical Columns

In [None]:
# Year is kept in the dataset but excluded from the numerical feature list
numerical_cols.remove('Year')

In [None]:
# visitId is an identifier, not a measurement, so it is excluded from the numerical feature list
numerical_cols.remove('visitId')

In [None]:
# Exclude the transaction revenue from the numerical feature list
# NOTE(review): presumably this is the prediction target - confirm
numerical_cols.remove('totals.transactionRevenue')

### Adding "Average Hits per City" and "Average Pageviews per City" to the Numerical Features

In [None]:
# For each frame, add the per-city mean of hits and of pageviews as new columns.
# Each mean is computed within its own frame and truncated to an integer.
for frame in (dataset, dataset_test):
    for source_col, target_col in (
        ('totals.hits', 'hits_mean_city'),
        ('totals.pageviews', 'pageviews_mean_city'),
    ):
        city_mean = frame.groupby('geoNetwork.city')[source_col].transform('mean')
        frame[target_col] = city_mean.astype('int')

In [None]:
# Register the two per-city mean columns as numerical features
numerical_cols.extend(['hits_mean_city', 'pageviews_mean_city'])

In [None]:
# Display the numerical feature names, now including the per-city mean columns
numerical_cols

### Casting the Numerical Features to float type

In [None]:
# Cast every numerical feature to float in both the train and the test frame
for frame in (dataset, dataset_test):
    for col in numerical_cols:
        frame[col] = frame[col].astype('float')

## Label Encoding the Categorical Features

In [None]:
# Label-encode every categorical feature in place, timing the whole pass.
start_time = datetime.now()
for feature in categorical_cols:
    # Work on the string form of the values so mixed-type columns encode consistently
    train_labels = dataset[feature].values.astype('str')
    test_labels = dataset_test[feature].values.astype('str')

    # Fit one encoder on the train and test values together so that both
    # frames share the same label -> integer mapping
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(list(train_labels) + list(test_labels))

    # Replace the original values with their integer codes
    dataset[feature] = label_encoder.transform(train_labels)
    dataset_test[feature] = label_encoder.transform(test_labels)

    print("for this feature : {0} label-encoding was done succesfully".format(feature))
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

### Checking for Dataset Features that don't belong to the Numerical or Categorical Features

In [None]:
# Columns of the train dataset that are in neither feature list
set(dataset.columns).difference(numerical_cols, categorical_cols)

### Saving the Numerical and Categorical Feature Names as NumPy Arrays

In [None]:
# Persist both feature-name lists as .npy files (np.save appends the extension)
nrccl, ctgcl = np.array(numerical_cols), np.array(categorical_cols)
for target_path, column_names in (
    ("data/Numerical_Columns", nrccl),
    ("data/Categorical_Columns", ctgcl),
):
    np.save(target_path, column_names)

## HeatMap for Features Correlation

In [None]:
# Correlation heatmap of the engineered train features, saved as a PNG.
x_dum = dataset
# Restrict to numeric/bool columns: the object-typed 'fullVisitorId' and 'date'
# columns cannot be correlated, and DataFrame.corr() raises on them in
# pandas >= 2.0 instead of silently dropping them. Compute the matrix once.
corr = x_dum.select_dtypes(include=['number', 'bool']).corr()
# Hide the upper triangle (diagonal included). The mask is built from the
# matrix shape rather than its values, so zero correlations are masked too.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Correlations are scaled by 10 and rounded so each annotation is a short integer
ax = sns.heatmap((corr * 10).round(0), cmap="coolwarm", annot=True, mask=mask)
plt.savefig('images/correlogram.png')

## Export Datasets

In [None]:
# Save the feature-engineered train data (index included) to a new .csv file
path = 'data/feat_train_v2.csv'
dataset.to_csv(path)

In [None]:
# Save the feature-engineered test data (index included) to a new .csv file
path1 = 'data/feat_test_v2.csv'
dataset_test.to_csv(path1)