# Install and Instantiate Featureform

In [62]:
!pip install featureform==1.10.3

[0mCollecting featureform==1.10.3
  Downloading featureform-1.10.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[0mInstalling collected packages: featureform
  Attempting uninstall: featureform
[0m    Found existing installation: featureform 1.10.3rc0
    Uninstalling featureform-1.10.3rc0:
      Successfully uninstalled featureform-1.10.3rc0
[0mSuccessfully installed featureform-1.10.3


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [63]:
from featureform import local
import featureform as ff

# Featureform client created with local=True; every later cell that registers,
# applies, or serves resources goes through this object.
client = ff.Client(local=True)

# Set a run name, to group everything we build in this notebook

Everything in Featureform is versioned. We generate a default version when it's not specified, but you can set the default version by using `set_run()`.

In [64]:
# Name the run "webinar"; ff.get_run() in the cells below returns this value,
# and it is used as the version for the resources registered in this notebook.
ff.set_run("webinar")

# Register and read our primary dataset

In [65]:
# Register the transactions CSV as the primary source named "transactions".
transactions = local.register_file(
    "transactions",
    path="./transactions_new.csv",
)

In [66]:
# Read the "transactions" source back for the current run and display it.
client.dataframe(
    "transactions",
    ff.get_run(),
)

Applying Run: webinar
Creating provider local-mode 
Creating source transactions  webinar


Unnamed: 0.1,Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionAmount,Timestamp,IsFraud
0,0,T1,C5841053,10/1/94,JAMSHEDPUR,17819.05,25.0,2022-04-09 11:33:09,False
1,1,T2,C2142763,4/4/57,JHAJJAR,2270.69,27999.0,2022-03-27 01:04:21,True
2,2,T3,C4417068,26/11/96,MUMBAI,17874.44,459.0,2022-04-07 00:48:14,False
3,3,T4,C5342380,14/9/73,MUMBAI,866503.21,2060.0,2022-04-14 07:56:59,True
4,4,T5,C9031234,24/3/88,NAVI MUMBAI,6714.43,1762.5,2022-04-13 07:39:19,False
...,...,...,...,...,...,...,...,...,...
1046193,1048562,T1048563,C8020229,8/4/90,NEW DELHI,7635.19,799.0,2022-04-02 05:12:36,False
1046194,1048563,T1048564,C6459278,20/2/92,NASHIK,27311.42,460.0,2022-04-23 03:58:33,False
1046195,1048564,T1048565,C6412354,18/5/89,HYDERABAD,221757.06,770.0,2022-04-01 15:08:34,False
1046196,1048565,T1048566,C6420483,30/8/78,VISAKHAPATNAM,10117.87,1000.0,2022-04-15 12:50:15,False


# Create a percentage balance transformation for training and an equivalent ondemand feature for serving

In [67]:
# Adds a BalancePercent column: TransactionAmount divided by CustAccountBalance.
# The frame passed in is extended in place and returned.
@local.df_transformation(inputs=[("transactions", ff.get_run())])
def perc_balance(df):
    # A balance of exactly 0.0 is swapped for 1.0 so the division below
    # never divides by zero.
    safe_balance = df["CustAccountBalance"].replace(0.0, 1.0)
    df["BalancePercent"] = df["TransactionAmount"] / safe_balance
    return df

client.dataframe(perc_balance)

Applying Run: webinar
Creating provider local-mode 
Creating source perc_balance  webinar


Unnamed: 0.1,Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionAmount,Timestamp,IsFraud,BalancePercent
0,0,T1,C5841053,10/1/94,JAMSHEDPUR,17819.05,25.0,2022-04-09 11:33:09,False,0.001403
1,1,T2,C2142763,4/4/57,JHAJJAR,2270.69,27999.0,2022-03-27 01:04:21,True,12.330613
2,2,T3,C4417068,26/11/96,MUMBAI,17874.44,459.0,2022-04-07 00:48:14,False,0.025679
3,3,T4,C5342380,14/9/73,MUMBAI,866503.21,2060.0,2022-04-14 07:56:59,True,0.002377
4,4,T5,C9031234,24/3/88,NAVI MUMBAI,6714.43,1762.5,2022-04-13 07:39:19,False,0.262494
...,...,...,...,...,...,...,...,...,...,...
1046193,1048562,T1048563,C8020229,8/4/90,NEW DELHI,7635.19,799.0,2022-04-02 05:12:36,False,0.104647
1046194,1048563,T1048564,C6459278,20/2/92,NASHIK,27311.42,460.0,2022-04-23 03:58:33,False,0.016843
1046195,1048564,T1048565,C6412354,18/5/89,HYDERABAD,221757.06,770.0,2022-04-01 15:08:34,False,0.003472
1046196,1048565,T1048566,C6420483,30/8/78,VISAKHAPATNAM,10117.87,1000.0,2022-04-15 12:50:15,False,0.098835


# Create a rolling window average of transactions

In [68]:
@local.df_transformation(inputs=[("transactions", ff.get_run())])
def window_aggs(df):
    """Per-customer rolling mean and count of TransactionAmount.

    Rows are ordered by Timestamp and then, within each CustomerID, aggregated
    over a rolling window of up to 30 rows (min_periods=1, so a customer's
    first row already produces a value).

    Returns a frame with the columns CustomerID, Timestamp, mean and count.
    """
    # Imported locally so the transformation does not depend on a `pd` name
    # that happens to exist in the kernel; none of the visible cells import pandas.
    import pandas as pd

    # Build a new frame instead of overwriting the Timestamp column of the
    # frame that was passed in.
    by_time = (
        df.assign(Timestamp=pd.to_datetime(df["Timestamp"]))
        .sort_values(by="Timestamp")
        .set_index("Timestamp")
    )
    rolling_amounts = by_time.groupby("CustomerID")["TransactionAmount"].rolling(
        window=30, min_periods=1
    )
    return rolling_amounts.agg(["mean", "count"]).reset_index()


client.dataframe(window_aggs)

Applying Run: webinar
Creating provider local-mode 
Creating source window_aggs  webinar


Unnamed: 0,CustomerID,Timestamp,mean,count
0,C1010011,2022-03-31 12:25:23,4750.0,1.0
1,C1010011,2022-04-20 22:31:25,2553.0,2.0
2,C1010012,2022-04-19 23:53:55,1499.0,1.0
3,C1010014,2022-04-06 19:57:12,250.0,1.0
4,C1010014,2022-04-17 18:49:18,727.5,2.0
...,...,...,...,...
1046193,C9099836,2022-04-22 09:31:33,691.0,1.0
1046194,C9099877,2022-04-24 14:31:31,222.0,1.0
1046195,C9099919,2022-04-13 07:42:36,126.0,1.0
1046196,C9099941,2022-04-17 17:56:18,50.0,1.0


# Defining our features, label, and training set

In [69]:
@ff.entity
class User:
    # Every selection below lists CustomerID first, then the value column,
    # then Timestamp.

    # Label: whether the transaction was fraudulent.
    is_fraud = ff.Label(
        transactions[["CustomerID", "IsFraud", "Timestamp"]],
        type=ff.Bool,
    )
    # Feature: rolling mean of transaction amounts from window_aggs.
    avg_trans = ff.Feature(
        window_aggs[["CustomerID", "mean", "Timestamp"]],
        type=ff.Float32,
    )
    # Feature: raw account balance from the transactions source.
    balance = ff.Feature(
        transactions[["CustomerID", "CustAccountBalance", "Timestamp"]],
        type=ff.Float32,
    )
    # Feature: transaction amount relative to balance from perc_balance.
    perc = ff.Feature(
        perc_balance[["CustomerID", "BalancePercent", "Timestamp"]],
        type=ff.Float32,
    )

# Training set "fraud": the is_fraud label joined with avg_trans and perc,
# both taken at the current run's version.
ts = ff.register_training_set(
    "fraud",
    label=User.is_fraud,
    features=[
        ("avg_trans", ff.get_run()),
        ("perc", ff.get_run()),
    ],
)

# Push everything registered above.
client.apply()

Applying Run: webinar
Creating provider local-mode 
Creating entity user 
Creating feature avg_trans  webinar
Creating feature balance  webinar
Creating feature perc  webinar
Creating label is_fraud  webinar
Creating training-set fraud  webinar
Updating Feature Table: |##################################################| 100% Complete

Updating Feature Table: |##################################################| 100% Complete

Updating Feature Table: |##################################################| 100% Complete



# Training a Model

In [70]:
# Load the "fraud" training set (variant "webinar") as a DataFrame and print it.
ts = (
    client
    .training_set("fraud", "webinar")
    .dataframe()
)
print(ts)

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fixed seed so both the train/test split and the forest are reproducible;
# without it the reported accuracy changes on every execution.
SEED = 42

# Inputs are every column of the training set except 'label'.
X = ts.drop('label', axis=1)
y = ts['label']
# Split the data into training and test sets (20% held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, y_train)
# Score the held-out rows only.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9805247562607532


# Serving for Inference / Prod

In [72]:
@ff.ondemand_feature()
def ondemand_percent(client, params, entities):
    """Serving-time counterpart of the `perc_balance` transformation.

    Divides params["TransactionAmount"] by the first value returned for the
    ("balance", "webinar") feature of the given entities.
    """
    balance = client.features([("balance", "webinar")], entities=entities)[0]
    # perc_balance replaces a 0.0 balance with 1.0 before dividing. Apply the
    # same rule here so training and serving compute the same value and a
    # zero balance cannot cause a division by zero.
    if balance == 0.0:
        balance = 1.0
    return params["TransactionAmount"] / balance

client.apply()

Applying Run: webinar
Creating provider local-mode 
Creating ondemand_feature ondemand_percent  webinar


In [73]:
# Fetch the two model inputs for one customer, in the same order as the
# training-set features (avg_trans, then the balance-percentage feature).
# The transaction amount is passed to the on-demand feature through `params`.
f = client.features(
    [
        ("avg_trans", "webinar"),
        ("ondemand_percent", ff.get_run()),
    ],
    params={"TransactionAmount": 1000.0},
    entities={"user": "C1010011"},
)
# Wrap the single feature vector in a list so predict receives one row.
rf.predict([f])

Updating Feature Table: |##################################################| 100% Complete





array([ True])