# <b> Recoding </b>
___

<b> Import modules </b>

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing

<b> Read csv file to a pandas dataframe </b>

In [2]:
# Load the online shoppers intention dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="data/online_shoppers_intention.csv")

<b> Features grouped by data type </b>

In [3]:
# Map each dtype to the column names that carry it.
column_names = df.columns.to_series()
column_names.groupby(df.dtypes).groups

{dtype('bool'): Index(['Weekend', 'Revenue'], dtype='object'),
 dtype('int64'): Index(['Administrative', 'Informational', 'ProductRelated', 'OperatingSystems',
        'Browser', 'Region', 'TrafficType'],
       dtype='object'),
 dtype('float64'): Index(['Administrative_Duration', 'Informational_Duration',
        'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues',
        'SpecialDay'],
       dtype='object'),
 dtype('O'): Index(['Month', 'VisitorType'], dtype='object')}

### 1. Recode Features/Target
___

<b>  1.1 Boolean Type: </b> 
<br> False/True into 0/1
> - Weekend (-> Feature)
> - Revenue (-> Target Variable)

In [4]:
# define function for converting Boolean (True/False) into (1's and 0's)
def convert_bool_int(columnlst, frame=None):
    """Cast the given columns to ``int`` in place (False/True -> 0/1).

    Parameters
    ----------
    columnlst : list of str
        Names of the columns to convert.
    frame : pandas.DataFrame, optional
        DataFrame to modify. When omitted, the notebook-level ``df`` is used,
        so existing ``convert_bool_int([...])`` calls keep working.

    Returns
    -------
    None
        The selected columns of the target DataFrame are overwritten.
    """
    # Only fall back to the notebook global when no frame was passed explicitly.
    target = df if frame is None else frame
    target[columnlst] = target[columnlst].astype(int)
    
# Weekend is a feature and Revenue is the target variable; recode both in a single call.
boolean_columns = ['Weekend', 'Revenue']
convert_bool_int(boolean_columns)
# show unique value counts for each category (0/1)
#df['Weekend'].value_counts()
#df['Revenue'].value_counts()

<b> 1.2 Object Type </b>
<br> String into Integer (categorical)
> - VisitorType (-> Feature)
> - Month (-> Feature)

#### 1.2.1 VisitorType

In [5]:
# Convert VisitorType from object to integer (categorical)
visitor_type_codes = {'New_Visitor': 0, 'Returning_Visitor': 1, 'Other': 2}
df['VisitorType_lbl'] = df['VisitorType'].map(visitor_type_codes)

# Series.map yields NaN for any value missing from the dict, so fail loudly
# instead of carrying silent NaNs (and a float column) into later cells.
unmapped_visitor_types = df.loc[df['VisitorType_lbl'].isna(), 'VisitorType'].unique()
assert len(unmapped_visitor_types) == 0, "Unmapped VisitorType values: {}".format(list(unmapped_visitor_types))

<b> Show the result of remapping VisitorType </b>

In [6]:
# One row per original VisitorType value next to the integer label it received.
visitor_type_lookup = df.groupby(by=['VisitorType'], as_index=False)['VisitorType_lbl'].first()
visitor_type_lookup.sort_values('VisitorType_lbl').reset_index(drop=True)

Unnamed: 0,VisitorType,VisitorType_lbl
0,New_Visitor,0
1,Returning_Visitor,1
2,Other,2


In [7]:
# Sanity check: the relabeled feature 'VisitorType_lbl' should contain only 0, 1 and 2.
pd.unique(df['VisitorType_lbl'])

array([1, 0, 2])

#### 1.2.2 Month

In [8]:
# Distinct month labels present in the data.
pd.unique(df['Month'])

array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
       'Dec'], dtype=object)

In [9]:
# Recode Month - defensive coding: every month of the year is mapped although
# January and April do not occur in this dataset. The data spells June as
# 'June'; the abbreviated 'Jun' is mapped as well so both spellings are handled.
month_numbers = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'June': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}
df['Month_lbl'] = df['Month'].map(month_numbers)

# Series.map turns values missing from the dict into NaN; stop here if that happened.
unmapped_months = df.loc[df['Month_lbl'].isna(), 'Month'].unique()
assert len(unmapped_months) == 0, "Unmapped Month values: {}".format(list(unmapped_months))

<b> _Show the result of remapping the Month feature_ </b>

In [10]:
# Pair every original Month value with its Month_lbl to confirm that the mapping worked.
# source: https://stackoverflow.com/questions/48131812/get-unique-values-of-multiple-columns-as-a-new-dataframe-in-pandas
month_lookup = df.groupby(by=['Month'], as_index=False)['Month_lbl'].first()
month_lookup.sort_values('Month_lbl').reset_index(drop=True)

Unnamed: 0,Month,Month_lbl
0,Feb,2
1,Mar,3
2,May,5
3,June,6
4,Jul,7
5,Aug,8
6,Sep,9
7,Oct,10
8,Nov,11
9,Dec,12


In [11]:
# Sanity check: Month_lbl should hold only the mapped month numbers.
pd.unique(df['Month_lbl'])

array([ 2,  3,  5, 10,  6,  7,  8, 11,  9, 12])

<b>  1.3 Encode Categorical Features: One Hot Encoding </b> 
<br> _Note_: categorical features with more than 5 categories can be One Hot Encoded
> - Month                   (10 categories)
> - SpecialDay                (6 categories)
> - OperatingSystems          (8 categories)
> - Browser                   (13 categories)
> - Region                    (9 categories)
> - TrafficType               (20 categories)

<b> Show the number of unique values for our categorical features </b>

In [12]:
def feature_unique_values(df_name, lst_columns):
    """Print the sorted unique values, and how many there are, for each column.

    Parameters
    ----------
    df_name : pandas.DataFrame
        DataFrame holding the columns to inspect.
    lst_columns : str or list of str
        Column name(s) to report on. A single name may be given as a plain string.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    banner = '***********************************************************************************'
    # A bare string would otherwise select a Series and iterate its values, not column names.
    if isinstance(lst_columns, str):
        lst_columns = [lst_columns]

    print(banner)
    print('                    Unique values for Categorical Feature(s):')
    print(banner)
    for column in lst_columns:
        # np.sort returns a sorted copy and also accepts the non-ndarray results
        # that .unique() gives for extension dtypes, which have no in-place .sort().
        sorted_uniques = np.sort(df_name[column].unique())
        print(column, ':', sorted_uniques, '\n', '- Unique Values:', len(sorted_uniques))
    print(banner)
# Columns treated as categorical in this notebook.
lst_features = [
    'Month', 'Weekend', 'SpecialDay', 'VisitorType',
    'OperatingSystems', 'Browser', 'Region', 'TrafficType',
]
feature_unique_values(df, lst_features)

***********************************************************************************
                    Unique values for Categorical Feature(s):
***********************************************************************************
Month : ['Aug' 'Dec' 'Feb' 'Jul' 'June' 'Mar' 'May' 'Nov' 'Oct' 'Sep'] 
 - Unique Values: 10
Weekend : [0 1] 
 - Unique Values: 2
SpecialDay : [0.  0.2 0.4 0.6 0.8 1. ] 
 - Unique Values: 6
VisitorType : ['New_Visitor' 'Other' 'Returning_Visitor'] 
 - Unique Values: 3
OperatingSystems : [1 2 3 4 5 6 7 8] 
 - Unique Values: 8
Browser : [ 1  2  3  4  5  6  7  8  9 10 11 12 13] 
 - Unique Values: 13
Region : [1 2 3 4 5 6 7 8 9] 
 - Unique Values: 9
TrafficType : [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20] 
 - Unique Values: 20
***********************************************************************************


<b> One Hot Encoding </b> for categorical features with >5 categories, plus VisitorType (3 categories)
<br> _source:_ https://pbpython.com/categorical-encoding.html

In [13]:
# Columns to expand into integer indicator columns; the same list is used as the column-name prefixes.
# NOTE: df is overwritten here and the source columns are replaced, so run this cell once per kernel session.
one_hot_columns = ['Month', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType']
df = pd.get_dummies(df, columns=one_hot_columns, prefix=one_hot_columns, dtype=int)

In [18]:
# Verify the encoding: print every column (including the new indicator columns) with its
# non-null count and data type.
# Optional alternative: list only the column names
#list(df.columns)
df.info()
# Optional alternative: show the first 5 rows of the one hot encoded features
# NOTE(review): in the df.info() listing the first indicator column (Month_Aug) sits at
# position 13, so the slice 14: appears to skip it - confirm before enabling this line.
#df.iloc[:, 14:].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 82 columns):
Administrative                   12330 non-null int64
Administrative_Duration          12330 non-null float64
Informational                    12330 non-null int64
Informational_Duration           12330 non-null float64
ProductRelated                   12330 non-null int64
ProductRelated_Duration          12330 non-null float64
BounceRates                      12330 non-null float64
ExitRates                        12330 non-null float64
PageValues                       12330 non-null float64
Weekend                          12330 non-null int64
Revenue                          12330 non-null int64
VisitorType_lbl                  12330 non-null int64
Month_lbl                        12330 non-null int64
Month_Aug                        12330 non-null int64
Month_Dec                        12330 non-null int64
Month_Feb                        12330 non-null int64
Month_Jul      

### __Save Recoded Dataset to a CSV File__ #
___

In [17]:
# Build the frame to export under the name 'df_recoded'.
# VisitorType_lbl and Month_lbl are the integer-label helper columns created earlier;
# drop them because the one-hot columns already carry the same information.
# note: get_dummies already removed the original Month and VisitorType columns
df_recoded = df.drop(columns=['VisitorType_lbl', 'Month_lbl'])

# define name of the csv file
file_name = 'df_recoded.csv'

# Build the target path with pathlib instead of string concatenation and make
# sure the folder exists; to_csv fails when the parent directory is missing.
output_dir = Path('../project2-mtb')
output_dir.mkdir(parents=True, exist_ok=True)

# export pandas dataframe to csv (index=False: do not write the row index)
df_recoded.to_csv(output_dir / file_name, index=False)