In [1]:
import pandas as pd

# Load the Dataset 

In [2]:
# Read the raw records from the CSV file into the working DataFrame `df`,
# which every later cell in this notebook builds on.
df = pd.read_csv('data_large.csv')

# Emit a banner line followed by the frame's text representation.
print("Dataset Loaded:", df, sep="\n")

Dataset Loaded:
          date  price  units_sold region category  age  target
0   2023-01-17    435           2  South        C   35       0
1   2023-12-04    178           3   West        A   37       0
2   2023-08-20    493           7  South        B   48       1
3   2023-03-05    401          10  North        B   45       0
4   2023-03-01    380           9  South        A   20       1
5   2023-04-11    322           3  North        C   50       0
6   2023-02-07    456           7  South        C   56       0
7   2023-07-26    406           4  South        B   66       1
8   2023-05-25    366           7  North        C   61       0
9   2023-12-01    133           1   West        B   20       1
10  2023-03-30    226           1  South        A   22       0
11  2023-11-04    449           2   West        B   23       0
12  2023-11-25    167           9   East        A   68       0
13  2023-04-10    275           1  South        B   69       1
14  2023-02-14    316           7  Sout

# Handle Date-Time Features

In [3]:
# Parse the 'date' column into datetimes so calendar fields can be extracted.
df['date'] = pd.to_datetime(df['date'])

# Split the parsed date into separate year / month / day columns
# (added to df in that order).
date_parts = df['date'].dt
df['year'], df['month'], df['day'] = date_parts.year, date_parts.month, date_parts.day

print(df.head())

        date  price  units_sold region category  age  target  year  month  day
0 2023-01-17    435           2  South        C   35       0  2023      1   17
1 2023-12-04    178           3   West        A   37       0  2023     12    4
2 2023-08-20    493           7  South        B   48       1  2023      8   20
3 2023-03-05    401          10  North        B   45       0  2023      3    5
4 2023-03-01    380           9  South        A   20       1  2023      3    1


# Create Interaction Features

In [4]:
# Interaction feature: price per unit (price divided by units_sold).
# A row with units_sold == 0 would make the plain division yield +/-inf.
# Masking the zero denominator turns those rows into NaN (missing) instead,
# so they can be handled by ordinary missing-value logic. Rows with non-zero
# units_sold are unaffected.
units_nonzero = df['units_sold'].where(df['units_sold'] != 0)
df['price_per_unit'] = df['price'] / units_nonzero

print(df[['price', 'units_sold', 'price_per_unit']].head())


   price  units_sold  price_per_unit
0    435           2      217.500000
1    178           3       59.333333
2    493           7       70.428571
3    401          10       40.100000
4    380           9       42.222222


# Perform Binning

In [5]:
# Age binning: bucket the numeric 'age' column into three labelled groups.
# pd.cut uses right-closed intervals by default, so the groups are
# (0, 30] -> 'Young', (30, 50] -> 'Middle-aged', (50, 70] -> 'Senior'.
# Ages outside (0, 70] fall in no bin and become NaN.
age_edges = [0, 30, 50, 70]
age_labels = ['Young', 'Middle-aged', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)

print(df[['age', 'age_group']].head())


   age    age_group
0   35  Middle-aged
1   37  Middle-aged
2   48  Middle-aged
3   45  Middle-aged
4   20        Young


# Encode Categorical Data

In [6]:
# One-hot encoding: expand 'region' into one indicator column per value.
# The result is a separate frame created before 'category_encoded' is added
# to df below, so df_one_hot does not contain that column.
df_one_hot = pd.get_dummies(df, columns=['region'])

# Target encoding: map each 'category' value to the mean of 'target' over the
# rows that share it.
# NOTE(review): the means are computed over every row of df, including each
# row's own target value — confirm this is acceptable before using the
# encoded column as a model input.
category_means = df['target'].groupby(df['category']).mean()
df['category_encoded'] = df['category'].map(category_means)

print(df_one_hot.head())
print(df.loc[:, ['category', 'category_encoded']].head())


        date  price  units_sold category  age  target  year  month  day  \
0 2023-01-17    435           2        C   35       0  2023      1   17   
1 2023-12-04    178           3        A   37       0  2023     12    4   
2 2023-08-20    493           7        B   48       1  2023      8   20   
3 2023-03-05    401          10        B   45       0  2023      3    5   
4 2023-03-01    380           9        A   20       1  2023      3    1   

   price_per_unit    age_group  region_East  region_North  region_South  \
0      217.500000  Middle-aged        False         False          True   
1       59.333333  Middle-aged        False         False         False   
2       70.428571  Middle-aged        False         False          True   
3       40.100000  Middle-aged        False          True         False   
4       42.222222        Young        False         False          True   

   region_West  
0        False  
1         True  
2        False  
3        False  
4        False  

# Apply Polynomial Features

In [7]:
from sklearn.preprocessing import PolynomialFeatures

# Select features for polynomial transformation
numerical_features = df[['price', 'units_sold']]

# Expand the two inputs into all terms up to degree 2: the original columns,
# their squares, and the price * units_sold cross term. include_bias=False
# omits the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(numerical_features)

# Wrap the resulting array in a DataFrame with readable column names.
# Reusing the source index keeps each row aligned with its row in df, so a
# later join/concat on the index stays correct even if df's index is not the
# default 0..n-1 range.
poly_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(numerical_features.columns),
    index=numerical_features.index,
)

print(poly_df.head())


   price  units_sold   price^2  price units_sold  units_sold^2
0  435.0         2.0  189225.0             870.0           4.0
1  178.0         3.0   31684.0             534.0           9.0
2  493.0         7.0  243049.0            3451.0          49.0
3  401.0        10.0  160801.0            4010.0         100.0
4  380.0         9.0  144400.0            3420.0          81.0
