In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from datetime import datetime, date, timezone

To extract data direct from github:

In [6]:
customers = pd.read_csv("https://raw.githubusercontent.com/dpalacioj/mlops-essentials/udemy-notes/section-02-feature-store/raw/customers.csv")

In [8]:
customers.head(2)

Unnamed: 0,customer_id,name,sex,state,age,is_married,active_since,event_time
0,C1,brooke williams,F,alabama,76,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z
1,C2,jim reese,M,oregon,76,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z


In [9]:
label_encoder = LabelEncoder()
min_max_scaler = MinMaxScaler()

In [10]:
customers.drop(columns = ['name', 'state'], inplace=True)

In [12]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   customer_id   10000 non-null  object
 1   sex           10000 non-null  object
 2   age           10000 non-null  int64 
 3   is_married    10000 non-null  bool  
 4   active_since  10000 non-null  object
 5   event_time    10000 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 400.5+ KB


In [17]:
customers['age'].value_counts(normalize=True, sort=True, ascending =False)

age
24    0.0168
42    0.0162
54    0.0161
81    0.0158
31    0.0158
       ...  
80    0.0118
91    0.0116
18    0.0112
65    0.0111
84    0.0105
Name: proportion, Length: 74, dtype: float64

In [21]:
bins = [18, 30, 40, 50, 60, 70, 90]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']
customers['age_range'] = pd.cut(customers['age'], bins, labels=labels, include_lowest=True)
customers.head(3)

Unnamed: 0,customer_id,sex,age,is_married,active_since,event_time,age_range
0,C1,F,76,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,70-plus
1,C2,M,76,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,70-plus
2,C3,M,40,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,30-39


The `cut` function is useful for going from a continuous variable to a categorical variable.

In [22]:
df_age_group = pd.get_dummies(customers['age_range'], prefix="age", dtype='int')
df_age_group.head(3)

Unnamed: 0,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,1,0,0,0,0


Once the categorical age features are obtained, they can be `concat` to the original dataframe

In [23]:
customers = pd.concat([customers, df_age_group], axis=1)
customers.head(3)

Unnamed: 0,customer_id,sex,age,is_married,active_since,event_time,age_range,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,C1,F,76,True,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,70-plus,0,0,0,0,0,1
1,C2,M,76,True,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,70-plus,0,0,0,0,0,1
2,C3,M,40,True,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,30-39,0,1,0,0,0,0


In [26]:
customers.drop(columns = ['age', 'age_range'], inplace=True) # Useless columns

Now, it is necesary to convert the `sex` feature from str to int. THe same applies to `is_married`

In [27]:
customers['sex'] = customers['sex'].map({"F":0, "M":1})
customers['is_married'] = customers['is_married'].astype('int') # Remember that 1=True and 0=False
customers.head()

Unnamed: 0,customer_id,sex,is_married,active_since,event_time,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus
0,C1,0,1,2019-03-09 14:43:26,2024-05-02T05:39:10.965Z,0,0,0,0,0,1
1,C2,1,1,2016-07-24 05:21:59,2024-05-02T05:39:10.966Z,0,0,0,0,0,1
2,C3,1,1,2017-11-21 01:14:51,2024-05-02T05:39:10.967Z,0,1,0,0,0,0
3,C4,1,0,2016-11-16 04:44:55,2024-05-02T05:39:10.967Z,0,0,0,0,1,0
4,C5,1,0,2017-03-27 21:09:31,2024-05-02T05:39:10.968Z,0,0,0,0,1,0


In [28]:
customers['active_since'] = pd.to_datetime(customers['active_since'], format ="%Y-%m-%d %H:%M:%S")

In [29]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   customer_id   10000 non-null  object        
 1   sex           10000 non-null  int64         
 2   is_married    10000 non-null  int64         
 3   active_since  10000 non-null  datetime64[ns]
 4   event_time    10000 non-null  object        
 5   age_18-29     10000 non-null  int64         
 6   age_30-39     10000 non-null  int64         
 7   age_40-49     10000 non-null  int64         
 8   age_50-59     10000 non-null  int64         
 9   age_60-69     10000 non-null  int64         
 10  age_70-plus   10000 non-null  int64         
dtypes: datetime64[ns](1), int64(8), object(2)
memory usage: 859.5+ KB
