In [69]:
from math import sqrt
import numpy as np
from joblib import dump
from feast import FeatureStore
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [70]:
# Getting our FeatureStore
feature_store = FeatureStore(repo_path="Aviation/")

In [71]:
# Retrieving the saved dataset and converting it to a DataFrame
training_df = feature_store.get_saved_dataset(name="aviation_datastore").to_df()



In [72]:
training_df.head()

Unnamed: 0,year,Engine_Type,Purpose_of_flight,Number_of_Engines,Total_Minor_Injuries,Total_Fatal_Injuries,Broad_phase_of_flight,Investigation_Type,Weather_Condition,day,Aircraft_damage,event_timestamp,Total_Uninjured,Total_Serious_Injuries,ratio,Aircraft_Category,month,Event_Id
0,1948,6,20,1.0,0.0,2.0,2,0,1,24,0,1948-10-24 00:00:00+00:00,0.0,0.0,1.0,10,10,20001218X45444
1,1962,6,20,1.0,0.0,4.0,12,0,1,19,0,1962-07-19 00:00:00+00:00,0.0,0.0,1.0,10,7,20001218X45447
2,1974,6,20,1.0,0.360814,3.0,2,0,0,30,0,1974-08-30 00:00:00+00:00,5.311847,0.280921,0.335061,10,8,20061025X01555
3,1977,6,20,1.0,0.0,2.0,2,0,0,19,0,1977-06-19 00:00:00+00:00,0.0,0.0,1.0,10,6,20001218X45448
4,1979,5,20,1.147131,0.360814,1.0,0,0,3,2,0,1979-08-02 00:00:00+00:00,0.0,2.0,0.297547,10,8,20041105X01764


In [73]:
#Records with ratio==NaN are abnormal, i.e. all injury cases are 0, and too many unknown columns. Hence, they should be deleted.
training_df = training_df[training_df['ratio'].notna()]

In [74]:
training_df.shape

(72817, 18)

In [75]:
# independent variables
X = training_df[['Investigation_Type','Aircraft_damage','Aircraft_Category','Number_of_Engines','Engine_Type','Purpose_of_flight',
                 'Total_Serious_Injuries','Total_Minor_Injuries','Total_Uninjured','Weather_Condition','Broad_phase_of_flight',
                 'year','month','day']]

In [76]:
X.head()

Unnamed: 0,Investigation_Type,Aircraft_damage,Aircraft_Category,Number_of_Engines,Engine_Type,Purpose_of_flight,Total_Serious_Injuries,Total_Minor_Injuries,Total_Uninjured,Weather_Condition,Broad_phase_of_flight,year,month,day
0,0,0,10,1.0,6,20,0.0,0.0,0.0,1,2,1948,10,24
1,0,0,10,1.0,6,20,0.0,0.0,0.0,1,12,1962,7,19
2,0,0,10,1.0,6,20,0.280921,0.360814,5.311847,0,2,1974,8,30
3,0,0,10,1.0,6,20,0.0,0.0,0.0,0,2,1977,6,19
4,0,0,10,1.147131,5,20,2.0,0.360814,0.0,3,0,1979,8,2


In [77]:
# dependent variable
y = training_df['ratio']

In [78]:
y.head()

0    1.000000
1    1.000000
2    0.335061
3    1.000000
4    0.297547
Name: ratio, dtype: float64

In [79]:
X.isnull().values.any()

False

In [80]:
X.shape

(72817, 14)

In [81]:
len(y)

72817

In [82]:
y.isnull().sum()

0

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,shuffle=True)

In [84]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((54612, 14), (18205, 14), (54612,), (18205,))

In [85]:
rf = RandomForestRegressor()

# random forest hyperparameters
param_grid = {
            'min_samples_split': [2, 5, 7],
            'max_depth': [5, 10, 15, 20],
            'max_features': ['auto', 'sqrt', 'log2'],
            'min_samples_leaf': [2, 3, 4],
            'n_estimators': [100, 500, 1000, 1500]
        }

# train random forest using random search cross validation
random_forest = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=3, verbose=2, n_jobs=-1)
random_forest.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': [5, 10, 15, 20],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4],
                                        'min_samples_split': [2, 5, 7],
                                        'n_estimators': [100, 500, 1000, 1500]},
                   verbose=2)

In [87]:
rf_hp = random_forest.best_params_

In [88]:
random_forest_ = RandomForestRegressor(n_estimators=rf_hp.get('n_estimators'), bootstrap=True,
                                       max_features=rf_hp.get('max_features'), min_samples_split=rf_hp.get('min_samples_split'),
                                       min_samples_leaf=rf_hp.get('min_samples_leaf'), max_depth=rf_hp.get('max_depth'))

random_forest_.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=7,
                      n_estimators=500)

In [89]:
def evaluate(predicted,actual):
    size = actual.size
    mse = ((predicted - actual) ** 2).sum() / size
    print('MSE =', mse)
    rmse = sqrt(mse)
    print('RMSE =', rmse)
    mae = abs(predicted - actual).sum() / size
    print('MAE =', mae)
    var = ((actual - np.mean(actual)) ** 2).sum() / size
    R2 = 1 - mse / var
    print('R^2 =', R2)

In [90]:
evaluate(random_forest.predict(X_train), y_train)

MSE = 0.005606335039955353
RMSE = 0.07487546353749908
MAE = 0.023365121221609293
R^2 = 0.9589674563243913


In [91]:
evaluate(random_forest.predict(X_test), y_test)

MSE = 0.007127832770205052
RMSE = 0.08442649329567735
MAE = 0.02658460144845935
R^2 = 0.9466281328289996


In [92]:
# Importing dependencies
from feast import FeatureStore
import pandas as pd
from joblib import load

In [93]:
# Getting our FeatureStore
store = FeatureStore(repo_path="C:/Projects/ML_Blocks/test_feast_feature_store_1/breast_cancer")

In [94]:
# Defining our features names
feast_features = [
        "df1_feature_view:mean radius",
        "df1_feature_view:mean texture",
        "df1_feature_view:mean perimeter",
        "df1_feature_view:mean area",
        "df1_feature_view:mean smoothness",
        "df2_feature_view:mean compactness",
        "df2_feature_view:mean concavity",
        "df2_feature_view:mean concave points",
        "df2_feature_view:mean symmetry",
        "df2_feature_view:mean fractal dimension",
        "df3_feature_view:radius error",
        "df3_feature_view:texture error",
        "df3_feature_view:perimeter error",
        "df3_feature_view:area error",
        "df3_feature_view:smoothness error",
        "df3_feature_view:compactness error",
        "df3_feature_view:concavity error",
        "df4_feature_view:concave points error",
        "df4_feature_view:symmetry error",
        "df4_feature_view:fractal dimension error",
        "df4_feature_view:worst radius",
        "df4_feature_view:worst texture",
        "df4_feature_view:worst perimeter",
        "df4_feature_view:worst area",
        "df4_feature_view:worst smoothness",
        "df4_feature_view:worst compactness",
        "df4_feature_view:worst concavity",
        "df4_feature_view:worst concave points",
        "df4_feature_view:worst symmetry",
        "df4_feature_view:worst fractal dimension"
    ]

In [95]:
# Getting the latest features
features = store.get_online_features(
    features=feast_features,    
    entity_rows=[{"patient_id": 568}
                             #,{"patient_id": 567}
                           ]
).to_dict()

In [96]:
features

{'patient_id': [568],
 'mean radius': [7.760000228881836],
 'mean smoothness': [0.052629999816417694],
 'mean texture': [24.540000915527344],
 'mean area': [181.0],
 'mean perimeter': [47.91999816894531],
 'mean compactness': [0.04362000152468681],
 'mean symmetry': [0.15870000422000885],
 'mean fractal dimension': [0.05883999913930893],
 'mean concave points': [0.0],
 'mean concavity': [0.0],
 'perimeter error': [2.5480000972747803],
 'concavity error': [0.0],
 'smoothness error': [0.007189000025391579],
 'compactness error': [0.004660000093281269],
 'area error': [19.149999618530273],
 'texture error': [1.4279999732971191],
 'radius error': [0.385699987411499],
 'worst area': [268.6000061035156],
 'worst texture': [30.3700008392334],
 'worst symmetry': [0.2870999872684479],
 'worst perimeter': [59.15999984741211],
 'worst compactness': [0.06443999707698822],
 'worst fractal dimension': [0.0703900009393692],
 'worst concave points': [0.0],
 'worst concavity': [0.0],
 'fractal dimensio

In [97]:
features_df = pd.DataFrame.from_dict(data=features)

In [98]:
features_df

Unnamed: 0,patient_id,mean radius,mean smoothness,mean texture,mean area,mean perimeter,mean compactness,mean symmetry,mean fractal dimension,mean concave points,...,worst perimeter,worst compactness,worst fractal dimension,worst concave points,worst concavity,fractal dimension error,concave points error,worst radius,worst smoothness,symmetry error
0,568,7.76,0.05263,24.540001,181.0,47.919998,0.04362,0.1587,0.05884,0.0,...,59.16,0.06444,0.07039,0.0,0.0,0.002783,0.0,9.456,0.08996,0.02676


In [99]:
# Loading our model and doing inference
reg = load("C:/Projects/ML_Blocks/test_feast_feature_store_1/model.joblib")
print(features_df[sorted(features_df.drop("patient_id", axis=1))])

   area error  compactness error  concave points error  concavity error  \
0       19.15            0.00466                   0.0              0.0   

   fractal dimension error  mean area  mean compactness  mean concave points  \
0                 0.002783      181.0           0.04362                  0.0   

   mean concavity  mean fractal dimension  ...  worst area  worst compactness  \
0             0.0                 0.05884  ...  268.600006            0.06444   

   worst concave points  worst concavity  worst fractal dimension  \
0                   0.0              0.0                  0.07039   

   worst perimeter  worst radius  worst smoothness  worst symmetry  \
0            59.16         9.456           0.08996          0.2871   

   worst texture  
0      30.370001  

[1 rows x 30 columns]


In [100]:
predictions = reg.predict(features_df[sorted(features_df.drop("patient_id", axis=1))])

In [101]:
predictions

array([1])

In [10]:
from feast import FeatureStore
from datetime import datetime, timedelta

# Getting our FeatureStore
store_m = FeatureStore(repo_path="Aviation/")
# Loading the latest features after a previous materialize call or from the beginning of time
store_m.materialize_incremental(end_date=datetime.now())

In [156]:
# Getting our FeatureStore
fs = FeatureStore(repo_path="Aviation/")

In [11]:
# Defining our features names
feast_features=["aviation_feature_view:Investigation_Type",
                "aviation_feature_view:Aircraft_damage",
                "aviation_feature_view:Aircraft_Category",
                "aviation_feature_view:Number_of_Engines",
                "aviation_feature_view:Engine_Type",
                "aviation_feature_view:Purpose_of_flight",
                "aviation_feature_view:Total_Fatal_Injuries",
                "aviation_feature_view:Total_Serious_Injuries",
                "aviation_feature_view:Total_Minor_Injuries",
                "aviation_feature_view:Total_Uninjured",
                "aviation_feature_view:Weather_Condition",
                "aviation_feature_view:Broad_phase_of_flight",
                "aviation_feature_view:year",
                "aviation_feature_view:month",
                "aviation_feature_view:day"]

In [17]:
# Getting the latest features
#features_ = fs.get_online_features(
features_ = store_m.get_online_features(
    features=feast_features,    
    entity_rows=[{"Event_Id": '20001218X45448'}]
).to_dict()

In [18]:
features_

{'Event_Id': ['20001218X45448'],
 'Engine_Type': [None],
 'Purpose_of_flight': [None],
 'Total_Minor_Injuries': [None],
 'Weather_Condition': [None],
 'Total_Serious_Injuries': [None],
 'Aircraft_Category': [None],
 'month': [None],
 'Aircraft_damage': [None],
 'Broad_phase_of_flight': [None],
 'day': [None],
 'Investigation_Type': [None],
 'Total_Fatal_Injuries': [None],
 'Number_of_Engines': [None],
 'Total_Uninjured': [None],
 'year': [None]}

In [14]:
import pandas as pd
# Converting the features to a DataFrame
features_df = pd.DataFrame.from_dict(data=features_)

In [15]:
features_df

Unnamed: 0,Event_Id,Engine_Type,Purpose_of_flight,Total_Minor_Injuries,Weather_Condition,Total_Serious_Injuries,Aircraft_Category,month,Aircraft_damage,Broad_phase_of_flight,day,Investigation_Type,Total_Fatal_Injuries,Number_of_Engines,Total_Uninjured,year
0,20001218X45448,,,,,,,,,,,,,,,


In [16]:
A = training_df[training_df['Event_Id']=='20001218X45448']
A

NameError: name 'training_df' is not defined

In [163]:
A.columns

Index(['year', 'Engine_Type', 'Purpose_of_flight', 'Number_of_Engines',
       'Total_Minor_Injuries', 'Total_Fatal_Injuries', 'Broad_phase_of_flight',
       'Investigation_Type', 'Weather_Condition', 'day', 'Aircraft_damage',
       'event_timestamp', 'Total_Uninjured', 'Total_Serious_Injuries', 'ratio',
       'Aircraft_Category', 'month', 'Event_Id'],
      dtype='object')

In [164]:
X.columns

Index(['Investigation_Type', 'Aircraft_damage', 'Aircraft_Category',
       'Number_of_Engines', 'Engine_Type', 'Purpose_of_flight',
       'Total_Serious_Injuries', 'Total_Minor_Injuries', 'Total_Uninjured',
       'Weather_Condition', 'Broad_phase_of_flight', 'year', 'month', 'day'],
      dtype='object')

In [165]:
# Loading our model and doing inference
reg = load("model.joblib")
predictions = reg.predict(A[sorted(A.drop(["Event_Id","event_timestamp","ratio","Total_Fatal_Injuries"], axis=1))])

Feature names must be in the same order as they were in fit.



In [166]:
predictions

array([0.14613594])