In [1]:
pip install -U pandas-profiling;


The following command must be run outside of the IPython shell:

    $ pip install -U pandas-profiling;

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [None]:
import pandas_profiling
import pandas as pd
from sklearn import datasets
import numpy as np
import time

In [None]:
data = datasets.load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)

In [None]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [None]:
#https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/index.html 
report = pandas_profiling.ProfileReport(df)

In [None]:
report

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# lifted from https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            '''
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True) 
                
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True
            '''

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [None]:
reduce_mem_usage(df)

Memory usage of properties dataframe is : 0.0503082275390625  MB
******************************
Column:  CRIM
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  ZN
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  INDUS
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  CHAS
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  NOX
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  RM
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  AGE
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  DIS
dtype before:  float64
dtype after:  float32
****

(        CRIM    ZN  INDUS  CHAS  ...    TAX    PTRATIO           B  LSTAT
 0    0.00632  18.0   2.31   0.0  ...  296.0  15.300000  396.899994   4.98
 1    0.02731   0.0   7.07   0.0  ...  242.0  17.799999  396.899994   9.14
 2    0.02729   0.0   7.07   0.0  ...  242.0  17.799999  392.829987   4.03
 3    0.03237   0.0   2.18   0.0  ...  222.0  18.700001  394.630005   2.94
 4    0.06905   0.0   2.18   0.0  ...  222.0  18.700001  396.899994   5.33
 ..       ...   ...    ...   ...  ...    ...        ...         ...    ...
 501  0.06263   0.0  11.93   0.0  ...  273.0  21.000000  391.989990   9.67
 502  0.04527   0.0  11.93   0.0  ...  273.0  21.000000  396.899994   9.08
 503  0.06076   0.0  11.93   0.0  ...  273.0  21.000000  396.899994   5.64
 504  0.10959   0.0  11.93   0.0  ...  273.0  21.000000  393.450012   6.48
 505  0.04741   0.0  11.93   0.0  ...  273.0  21.000000  396.899994   7.88
 
 [506 rows x 13 columns], [])

# Feature Scaling

### Motivation

Why do we apply feature scaling?

 - Preprocessing technique on features to normalize the data into a specific range:- example `[0, 1]`, `[-1, 1]`

 - The ability to visualize data with different ranges at the same time

 - In some ML algorithms the error is calculated using norms (L1, L2 etc.) and the impact of features with a larger range will be higher.

 - In some algorithms, applying on data with smaller ranger is computationally  more efficient. (example Faster conversion for Gradient Descent)

## Implementations

### Min - Max Normalization

Normalizing the data between a specific range using the min & max values. Usaually conversion into `[0,1]` range.
\begin{equation*}
Normalized = \frac{x - min(x)}{max(x) - min(x)}
\end{equation*}

And for the general equation to normalize into `[a,b]` range:

\begin{equation*}
Normalized = a + \frac{(x - min(x))(b - a)}{max(x) - min(x)}
\end{equation*}

In [None]:
# Normal Approach
#%%time 
start_time = time.time()
df_minmax_norm = (df - df.min()) / (df.max() - df.min())
print(f"Execution Time = {time.time() - start_time} using normal approach")
df_minmax_norm.head()


Execution Time = 0.018030881881713867 using normal approach
CPU times: user 17.2 ms, sys: 1.02 ms, total: 18.2 ms
Wall time: 18.6 ms


In [None]:
# Lambda Approach
start_time = time.time()
df_minmax_norm = df.apply(lambda x : (x - x.min()) / (x.max() - x.min()))
print(f"Execution Time = {time.time() - start_time} using lambda approach")
df_minmax_norm.head()

Execution Time = 0.012943029403686523 using lambda approach


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,1.0,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.0,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.0,0.099338


In [None]:
# Skrlearn Approach
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
start_time = time.time()
df_minmax_norm = pd.DataFrame(data = scaler.fit_transform(df), columns = df.columns)
print(f"Execution Time = {time.time() - start_time} using sklearn approach")
df_minmax_norm.head()

Execution Time = 0.0029668807983398438 using sklearn approach


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,1.0,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.0,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.0,0.099338


### Z score Normalization (Standardisation)

Normalizing the data to a standerdized score and the data has a **mean of 0** & **std of 1**.
\begin{equation*}
Normalized = \frac{x - \mu}{\sigma}
\end{equation*}


In [None]:
# Normal Approach
#%%time 
start_time = time.time()
df_zscore_norm = (df - df.mean()) / (df.std())
print(f"Execution Time = {time.time() - start_time} using normal approach")
df_zscore_norm.head()

Execution Time = 0.015212297439575195 using normal approach


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419367,0.284549,-1.286639,-0.27233,-0.144079,0.413261,-0.119896,0.140075,-0.98187,-0.665949,-1.457581,0.44061,-1.074499
1,-0.416927,-0.487242,-0.592797,-0.27233,-0.739534,0.194081,0.366803,0.556609,-0.867024,-0.986353,-0.302818,0.44061,-0.491952
2,-0.416929,-0.487242,-0.592797,-0.27233,-0.739534,1.281444,-0.26555,0.556609,-0.867024,-0.986353,-0.302818,0.396029,-1.207532
3,-0.416338,-0.487242,-1.305588,-0.27233,-0.834462,1.015296,-0.809088,1.076671,-0.752177,-1.105021,0.112898,0.415746,-1.360171
4,-0.412074,-0.487242,-1.305588,-0.27233,-0.834462,1.22736,-0.510675,1.076671,-0.752177,-1.105021,0.112898,0.44061,-1.025487


In [None]:
# Lambda Approach
start_time = time.time()
df_zscore_norm = df.apply(lambda x : (x - x.mean()) / (x.std()))
print(f"Execution Time = {time.time() - start_time} using lambda approach")
df_zscore_norm.head()

Execution Time = 0.016427993774414062 using lambda approach


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419367,0.284549,-1.286639,-0.27233,-0.144079,0.413261,-0.119896,0.140075,-0.98187,-0.665949,-1.457581,0.44061,-1.074499
1,-0.416927,-0.487242,-0.592797,-0.27233,-0.739534,0.194081,0.366803,0.556609,-0.867024,-0.986353,-0.302818,0.44061,-0.491952
2,-0.416929,-0.487242,-0.592797,-0.27233,-0.739534,1.281444,-0.26555,0.556609,-0.867024,-0.986353,-0.302818,0.396029,-1.207532
3,-0.416338,-0.487242,-1.305588,-0.27233,-0.834462,1.015296,-0.809088,1.076671,-0.752177,-1.105021,0.112898,0.415746,-1.360171
4,-0.412074,-0.487242,-1.305588,-0.27233,-0.834462,1.22736,-0.510675,1.076671,-0.752177,-1.105021,0.112898,0.44061,-1.025487


In [None]:
# Skrlearn Approach
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
start_time = time.time()
df_zscore_norm = pd.DataFrame(data = scaler.fit_transform(df), columns = df.columns)
print(f"Execution Time = {time.time() - start_time} using sklearn approach")
df_zscore_norm.head()

Execution Time = 0.003198862075805664 using sklearn approach


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.28791,-0.272599,-0.144218,0.413672,-0.120014,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867882,-0.987329,-0.303095,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867882,-0.987329,-0.303095,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228576,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026502


### Normalization by decimal scaling

Normalizing the data by moving the decimal place until the max_value after scaling < 1, max(|x`|) < 1
\begin{equation*}
Normalized = \frac{x}{10^j}
\end{equation*}


In [None]:
start_time = time.time()
max_values = df.abs().max().astype(int)
max_values = max_values.astype(str).apply(lambda x : len(x))
df_decimel_norm = (df) / (10**max_values)
print(f"Execution Time = {time.time() - start_time} using normal approach")
df_decimel_norm.head()

Execution Time = 0.016589641571044922 using normal approach


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,6.3e-05,0.018,0.0231,0.0,0.0538,0.6575,0.0652,0.0409,0.01,0.296,0.153,0.3969,0.0498
1,0.000273,0.0,0.0707,0.0,0.0469,0.6421,0.0789,0.049671,0.02,0.242,0.178,0.3969,0.0914
2,0.000273,0.0,0.0707,0.0,0.0469,0.7185,0.0611,0.049671,0.02,0.242,0.178,0.39283,0.0403
3,0.000324,0.0,0.0218,0.0,0.0458,0.6998,0.0458,0.060622,0.03,0.222,0.187,0.39463,0.0294
4,0.000691,0.0,0.0218,0.0,0.0458,0.7147,0.0542,0.060622,0.03,0.222,0.187,0.3969,0.0533


# List Comprehension

## Motivation

Why do we use list comprehensions?

 - Faster than for loops, given the interpreter can optimize patterns more efficiently.

 - Less code to be written

 - An elegent approach for many functionalities 

## Examples

### Basic Usage

In [None]:
nums = [1, 1, 2, 3, 5, 8]

# Expression, member, iterable
num_list = [el**2 for el in nums]
num_list

[1, 1, 4, 9, 25, 64]

### Using if

In [None]:
num_list = [el / 2 for el in nums if el % 2 == 0]
num_list

[1.0, 4.0]

### Using if-else

In [None]:
num_list = [el / 2 if el % 2 == 0 else ((el + 1) / 2) for el in nums]
num_list

[1.0, 1.0, 1.0, 2.0, 3.0, 4.0]

### Nested conditions

In [None]:
num_list = [el / 2 if el % 2 == 0 else ((el + 1) / 2) if el % 2 == 1 else 0 for el in nums]
num_list

[1.0, 1.0, 1.0, 2.0, 3.0, 4.0]

### Set Comprehension

In [None]:
num_list = {el / 2 if el % 2 == 0 else ((el + 1) / 2) for el in nums}
num_list

{1.0, 2.0, 3.0, 4.0}

### Dictionary Comprehension

In [None]:
num_list = {el: el**2 for el in nums}
num_list

{1: 1, 2: 4, 3: 9, 5: 25, 8: 64}

### List Comprehension in pandas

In [None]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.199997,4.09,1.0,296.0,15.3,396.899994,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.900002,4.9671,2.0,242.0,17.799999,396.899994,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.099998,4.9671,2.0,242.0,17.799999,392.829987,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.799999,6.0622,3.0,222.0,18.700001,394.630005,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.200001,6.0622,3.0,222.0,18.700001,396.899994,5.33


In [None]:
df['B_YEAR'] = [int(2020 - el) for el in df['AGE']]

In [None]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,B_YEAR
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.199997,4.09,1.0,296.0,15.3,396.899994,4.98,1954
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.900002,4.9671,2.0,242.0,17.799999,396.899994,9.14,1941
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.099998,4.9671,2.0,242.0,17.799999,392.829987,4.03,1958
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.799999,6.0622,3.0,222.0,18.700001,394.630005,2.94,1974
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.200001,6.0622,3.0,222.0,18.700001,396.899994,5.33,1965
