# Transforming Numerical Variables


# Transforming variables with the logarithm function

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import  scipy.stats as stats
from sklearn.datasets import fetch_california_housing

In [2]:
# Load the California housing predictors (X) and target (y) as pandas objects.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# One 30-bin histogram per column, to inspect each variable's distribution.
X.hist(figsize=(12, 12), bins=30)
plt.show()

In [3]:
def diagnostic_plots(df, variable):
    """Show a histogram and a normal Q-Q plot of one column, side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    variable : str
        Name of the column in ``df`` to plot.
    """
    values = df[variable]
    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: 30-bin histogram of the column.
    values.hist(bins=30, ax=ax_hist)
    ax_hist.set_title(f"Histogram of {variable}")

    # Right panel: sample quantiles against the quantiles of a normal
    # distribution.
    stats.probplot(values, dist="norm", plot=ax_qq)
    ax_qq.set_title(f"Q-Q plot of {variable}")

    plt.show()


diagnostic_plots(X, "MedInc")

In [4]:
# Columns to be log-transformed.
variables = ["MedInc", "AveRooms", "AveBedrms", "Population"]

# Work on a copy (pandas copy()) so the original DataFrame stays untouched.
X_tf = X.copy()

# Log transformation: replace each selected column with its natural logarithm.
X_tf[variables] = np.log(X[variables])

# MedInc looked right-skewed in the earlier plots; check whether the log
# transformation brings it closer to a normal distribution.
diagnostic_plots(X_tf, "MedInc")

X_tf

In [5]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the logarithm in a scikit-learn transformer; exp is registered as the
# inverse function so the transformation can be undone with
# inverse_transform().
tranform = FunctionTransformer(func=np.log, inverse_func=np.exp)

log_values = tranform.transform(X[variables])
X_tf[variables] = log_values



In [6]:
# Undo the log transformation with the transformer's inverse function (exp).
# The result goes into a separate frame instead of overwriting X_tf, so
# re-running this cell cannot apply exp a second time to values that were
# already restored.
X_recovered = X_tf.copy()
X_recovered[variables] = tranform.inverse_transform(X_tf[variables])

# The round trip log -> exp should give back the original values.
assert np.allclose(X_recovered[variables], X[variables])


In [7]:
#using feature_engine

from feature_engine.transformation import LogTransformer


# Set up feature-engine's LogTransformer for the selected variables, then fit
# it on X and apply it to X in a single call.
lt = LogTransformer(variables=variables)
X_tf = lt.fit_transform(X)

X_tf

# Transforming variables with the reciprocal function

In [8]:
# Reload the California housing predictors (X) and target (y).
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# Distribution of AveOccup before any transformation.
diagnostic_plots(X, "AveOccup")

In [9]:
# Copy the data and replace AveOccup with its element-wise reciprocal.
X_tf = X.copy()
ave_occup_reciprocal = np.reciprocal(X["AveOccup"])
X_tf["AveOccup"] = ave_occup_reciprocal

# Inspect the transformed variable with the diagnostic plots.
diagnostic_plots(X_tf, "AveOccup")

In [10]:
#using sklearn
from sklearn.preprocessing import FunctionTransformer

# Same reciprocal transformation, this time through scikit-learn's
# FunctionTransformer wrapper.
X_tf = X.copy()
transformer = FunctionTransformer(func=np.reciprocal)
X_tf["AveOccup"] = transformer.transform(X["AveOccup"])


In [11]:
from feature_engine.transformation import ReciprocalTransformer

# Reciprocal transformation of AveOccup with feature-engine: fit on X and
# transform X in one step.
rt = ReciprocalTransformer(variables="AveOccup")
data = rt.fit_transform(X)

# Inspect the transformed variable.
diagnostic_plots(data, "AveOccup")

# Using the square root to transform variables

Related transformations for count data are the Anscombe transformation, $2\sqrt{x + 3/8}$, and the Freeman-Tukey transformation, $\sqrt{x} + \sqrt{x + 1}$.

The Poisson distribution is a probability distribution that describes the number of times an event occurs in a fixed interval of time or space. In other words, it is a count distribution. It is right-skewed and its variance equals its mean. Examples of variables that could follow a Poisson distribution are the number of financial items held by a customer, such as current accounts or credit cards, the number of passengers in a vehicle, and the number of occupants in a household.

In [12]:
import numpy as np ;import pandas as pd

import scipy.stats as stats

# Simulate two Poisson-distributed count variables (means 3 and 2).
# A single seeded generator feeds both draws, so the notebook produces the
# same sample on every run; sharing one generator (rather than reusing the
# same integer seed twice) keeps the two columns from being drawn from the
# same underlying random stream.
rng = np.random.default_rng(42)

df = pd.DataFrame(
    {
        "counts1": stats.poisson.rvs(mu=3, size=10000, random_state=rng),
        "counts2": stats.poisson.rvs(mu=2, size=10000, random_state=rng),
    }
)


def diagnostic_plots(df, variable, max_bar_levels=50):
    """Show the distribution and a normal Q-Q plot of one column, side by side.

    A column with at most ``max_bar_levels`` distinct values is treated as
    count data and drawn as a bar chart with one bar per value, in sorted
    order. A column with more distinct values is drawn as a 30-bin histogram
    instead, so that calling this function on a continuous variable does not
    produce one bar per distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    variable : str
        Name of the column in ``df`` to plot.
    max_bar_levels : int, default 50
        Largest number of distinct values for which the bar chart is used.
    """
    values = df[variable]
    fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: frequency of each value (few distinct values) or a binned
    # histogram (many distinct values).
    if values.nunique() <= max_bar_levels:
        values.value_counts().sort_index().plot.bar(ax=ax_dist)
    else:
        values.hist(bins=30, ax=ax_dist)
    ax_dist.set_title(f"Histogram of {variable}")

    # Right panel: sample quantiles against the quantiles of a normal
    # distribution.
    stats.probplot(values, dist="norm", plot=ax_qq)
    ax_qq.set_title(f"Q-Q plot of {variable}")

    plt.show()


diagnostic_plots(df, "counts2")

In [13]:
count_columns = ["counts1", "counts2"]

# Square-root transform both count columns on a copy of the data. The result
# is rounded to 2 decimals, presumably so that the per-value bars in the
# diagnostic plot get short labels.
df_tf = df.copy()
df_tf[count_columns] = np.sqrt(df[count_columns]).round(2)

diagnostic_plots(df_tf, "counts1")

In [14]:
from sklearn.preprocessing import FunctionTransformer

# Square root via scikit-learn: wrap np.sqrt and apply it to both count
# columns.
transformer = FunctionTransformer(func=np.sqrt)
count_data = df[["counts1", "counts2"]]
df_tf = transformer.transform(count_data)

In [15]:
from feature_engine.transformation import PowerTransformer

# feature-engine's PowerTransformer with exponent 0.5, i.e. the square root.
# No variables are listed, so column selection is left to the transformer's
# default (presumably all numerical columns -- confirm in the feature-engine
# documentation).
root_t = PowerTransformer(exp=0.5)
df_tf = root_t.fit_transform(df)



In [16]:

# Anscombe transformation of the two count columns: 2 * sqrt(x + 3/8).
# counts1 -> x_anscombe and counts2 -> y_anscombe, added to df as new columns.
for source, target in (("counts1", "x_anscombe"), ("counts2", "y_anscombe")):
    df[target] = 2 * np.sqrt(df[source] + 3 / 8)

# Print the transformed columns, then inspect one of them.
print(df[["x_anscombe", "y_anscombe"]])

diagnostic_plots(df, "y_anscombe")

# Using power transformations

In [None]:
# Look at Population before it is transformed.
diagnostic_plots(X, "Population")

# Fresh copy of the data to receive the power-transformed variables.
X_tf = X.copy()

In [None]:
# Raise the selected variables to the power 0.3 and store the result in X_tf.
variables = ["MedInc", "Population"]
exponent = 0.3
X_tf[variables] = np.power(X[variables], exponent)