In [3]:
%load_ext kedro.ipython

In [4]:
import great_expectations as gx
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

In [5]:
# Load the "credits" dataset through the data catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the kedro.ipython extension loaded in the first cell — confirm.
df = catalog.load("credits")

In [6]:
# Open the Great Expectations data context whose root directory is the local "gx" folder.
context = gx.get_context(context_root_dir = "gx")

In [19]:
!tree gx

Folder PATH listing
Volume serial number is 94EB-8D65
C:\USERS\DGGUA\DOWNLOADS\MLOPS\PROJECT\MLOPS\PROJECT-MLOPS\NOTEBOOKS\GX
+---checkpoints
+---expectations
+---plugins
¦   +---custom_data_docs
¦       +---renderers
¦       +---styles
¦       +---views
+---profilers
+---uncommitted
    +---data_docs
    +---validations


In [21]:
datasource_name = "credit_datasource"
# Check for the datasource by name instead of wrapping the creation in a bare
# `except`: a bare except reports *every* failure (typos, configuration errors,
# even Ctrl-C) as "already exists" and hides the real problem.
if datasource_name in context.datasources:
    print("Data Source already exists.")
    datasource = context.datasources[datasource_name]
else:
    datasource = context.sources.add_pandas(name=datasource_name)

Data Source already exists.


In [22]:
# Display the datasources currently registered in the data context.
context.list_datasources()

[1m[[0m[1m{[0m[32m'type'[0m: [32m'pandas'[0m, [32m'name'[0m: [32m'credit_datasource'[0m[1m}[0m[1m][0m

In [23]:
# Print the datasource selected above (the recorded output shows its name and type).
print(datasource)

name: credit_datasource
type: pandas



In [91]:
# Obtain the "credits" expectation suite from the context; as the method name
# indicates, the suite is added if missing or updated if it already exists.
suite_credit = context.add_or_update_expectation_suite(expectation_suite_name="credits")

In [92]:
def build_expectations(type, columns, suite_c):
    """Add one family of expectations to an expectation suite.

    Args:
        type: Which family to add. ``'numerical'`` adds a ``float64`` dtype
            expectation per column; ``'categorical'`` adds an allowed-values
            expectation per column; any other value adds the allowed-values
            expectation for the ``'class'`` target column. The parameter name
            shadows the builtin ``type`` but is kept so existing keyword
            callers continue to work.
        columns: Iterable of column names to cover. For ``'numerical'`` a
            ``None`` or empty value falls back to the default list of numeric
            credit columns. Ignored for the target branch.
        suite_c: Suite object to extend; only its ``add_expectation`` method
            is called.

    Returns:
        The same ``suite_c`` object, after the expectations were added.

    Raises:
        KeyError: If a categorical column has no allowed-value set defined
            here. Nothing is added to the suite in that case.
    """
    # Allowed category labels for each categorical column.
    allowed_values = {
        'checking_status': ['<0', '0<=X<200', 'no checking', '>=200'],
        'credit_history': ['critical/other existing credit', 'existing paid',
                           'delayed previously', 'no credits/all paid', 'all paid'],
        'purpose': ['radio/tv', 'education', 'furniture/equipment', 'new car',
                    'used car', 'business', 'domestic appliance', 'repairs',
                    'other', 'retraining'],
        'savings_status': ['no known savings', '<100', '500<=X<1000', '>=1000',
                           '100<=X<500'],
        'employment': ['>=7', '1<=X<4', '4<=X<7', 'unemployed', '<1'],
        'personal_status': ['male single', 'female div/dep/mar', 'male div/sep',
                            'male mar/wid'],
        'other_parties': ['none', 'guarantor', 'co applicant'],
        'property_magnitude': ['real estate', 'life insurance',
                               'no known property', 'car'],
        'other_payment_plans': ['none', 'bank', 'stores'],
        'housing': ['own', 'for free', 'rent'],
        'job': ['skilled', 'unskilled resident', 'high qualif/self emp/mgmt',
                'unemp/unskilled non res'],
        'own_telephone': ['yes', 'none'],
        'foreign_worker': ['yes', 'no'],
    }
    # Used for the numerical branch when the caller supplies no columns.
    default_numerical = ['duration', 'credit_amount', 'installment_commitment',
                         'residence_since', 'age', 'existing_credits',
                         'num_dependents']
    value_set_expectation = "expect_column_distinct_values_to_be_in_set"

    if type == 'numerical':
        # Honour the caller's columns instead of silently overwriting them.
        selected = list(columns) if columns is not None else []
        if not selected:
            selected = default_numerical
        for column in selected:
            suite_c.add_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_of_type",
                    kwargs={"column": column, "type_": "float64"},
                )
            )
    elif type == 'categorical':
        selected = list(columns) if columns is not None else []
        # Validate up front so a bad column name cannot leave the suite
        # half-populated.
        unknown = [column for column in selected if column not in allowed_values]
        if unknown:
            raise KeyError(
                f"No allowed value set is defined for column(s): {unknown}"
            )
        for column in selected:
            suite_c.add_expectation(
                ExpectationConfiguration(
                    expectation_type=value_set_expectation,
                    kwargs={"column": column, "value_set": allowed_values[column]},
                )
            )
    else:
        # Any other type value is treated as the target column.
        suite_c.add_expectation(
            ExpectationConfiguration(
                expectation_type=value_set_expectation,
                kwargs={"column": 'class', "value_set": ['good', 'bad']},
            )
        )
    return suite_c

In [93]:
# Split the columns by dtype. The target column 'class' is removed from the
# categorical features by name, so the split does not depend on the target
# happening to be the last object-typed column.
categorical_features = df.select_dtypes(include=['object']).columns
numerical_f = df.select_dtypes(include=['float64']).columns
categorical_f = categorical_features.drop(['class'], errors='ignore')
# Build the suite in three passes: numeric dtypes, categorical value sets, then
# the target's value set (passed as a column name rather than a whole Series).
suite_c = build_expectations('numerical', numerical_f, suite_credit)
suite_c = build_expectations('categorical', categorical_f, suite_c)
suite_c = build_expectations('target', ['class'], suite_c)

target


In [94]:
# Persist the populated suite through the data context (the recorded output
# shows the path of the written credits.json under gx/expectations).
context.save_expectation_suite(expectation_suite=suite_c)

[32m'C:\\Users\\dggua\\Downloads\\MLOPS\\project\\MLOPS\\project-mlops\\notebooks\\gx\\expectations/credits.json'[0m

In [95]:
data_asset_name = "credits"
try:
    data_asset = datasource.add_dataframe_asset(name=data_asset_name, dataframe= df)
except Exception:
    # `except Exception` instead of a bare `except`, so KeyboardInterrupt and
    # SystemExit still propagate.
    # NOTE(review): every failure is still treated as "asset already exists";
    # narrow this to the specific exception type once it is confirmed.
    print("The data asset alread exists. The required one will be loaded.")
    data_asset = datasource.get_asset(data_asset_name)

In [96]:
# Build the batch request for the asset, supplying the in-memory dataframe.
batch_request = data_asset.build_batch_request(dataframe= df)


In [97]:
# Configure a checkpoint that validates the batch against the "credits" suite,
# then run it immediately and keep the result for inspection.
checkpoint = gx.checkpoint.SimpleCheckpoint(
    name="checkpoint_correct",
    data_context=context,
    validations=[
        dict(batch_request=batch_request, expectation_suite_name="credits"),
    ],
)
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/17 [00:00<?, ?it/s]

In [98]:
# Display the raw checkpoint result (run id, per-expectation outcomes, ...).
checkpoint_result


[1m{[0m
  [32m"run_id"[0m: [1m{[0m
    [32m"run_name"[0m: null,
    [32m"run_time"[0m: [32m"2024-06-22T14:45:38.878833+01:00"[0m
  [1m}[0m,
  [32m"run_results"[0m: [1m{[0m
    [32m"ValidationResultIdentifier::credits/__none__/20240622T134538.878833Z/credit_datasource-credits"[0m: [1m{[0m
      [32m"validation_result"[0m: [1m{[0m
        [32m"success"[0m: true,
        [32m"results"[0m: [1m[[0m
          [1m{[0m
            [32m"success"[0m: true,
            [32m"expectation_config"[0m: [1m{[0m
              [32m"expectation_type"[0m: [32m"expect_column_values_to_be_of_type"[0m,
              [32m"kwargs"[0m: [1m{[0m
                [32m"column"[0m: [32m"duration"[0m,
                [32m"type_"[0m: [32m"float64"[0m,
                [32m"batch_id"[0m: [32m"credit_datasource-credits"[0m
              [1m}[0m,
              [32m"meta"[0m: [1m{[0m[1m}[0m
            [1m}[0m,
            [32m"result"[0m: [1m{[0m
   

In [64]:

# Obtain a second suite, named "Credit3", from the context (added or updated,
# as the method name indicates).
suite_credit2 = context.add_or_update_expectation_suite(expectation_suite_name="Credit3")

In [70]:
# Add the allowed-values expectation for the target column to the second
# suite and persist it.
suite_credit2.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_distinct_values_to_be_in_set",
        kwargs={"column": 'class', "value_set": ['good','bad']},
    )
)
context.save_expectation_suite(expectation_suite=suite_credit2)
# Register the target column as its own asset. Double brackets keep a
# one-column DataFrame; df['class'] is a Series, which is not what a
# dataframe asset is meant to hold.
# NOTE(review): this rebinds `data_asset`, replacing the "credits" asset
# variable used by the batch-request cell — confirm that is intended.
data_asset = datasource.add_dataframe_asset(name='Credit3', dataframe= df[['class']])

In [101]:
import pandas as pd
def get_validation_results(checkpoint_result):
    """Flatten the first validation run of a checkpoint result into a table.

    Only the first entry of ``checkpoint_result["run_results"]`` is read.
    Each expectation result becomes one row.

    Args:
        checkpoint_result: Mapping-like object with a ``"run_results"`` entry
            whose values expose ``.get('validation_result')``; that object
            must support ``["results"]`` and yield items with ``.get``.

    Returns:
        pandas.DataFrame with the columns "Success", "Expectation Type",
        "Column", "Column Pair", "Max Value", "Min Value", "Element Count",
        "Unexpected Count", "Unexpected Percent", "Value Set",
        "Unexpected Value" and "Observed Value". Fields missing from a
        result are filled with ``''``. The frame is empty when there is no
        run or no validation result.
    """
    column_names = [
        "Success", "Expectation Type", "Column", "Column Pair", "Max Value",
        "Min Value", "Element Count", "Unexpected Count", "Unexpected Percent",
        "Value Set", "Unexpected Value", "Observed Value",
    ]

    first_run = next(iter(checkpoint_result["run_results"].values()), None)
    validation_result = (
        first_run.get('validation_result') if first_run is not None else None
    )
    if validation_result is None:
        return pd.DataFrame(columns=column_names)

    records = []
    for result in validation_result["results"]:
        config = result.get('expectation_config', {})
        kwargs = config.get('kwargs', {})
        outcome = result.get('result', {})

        value_set = kwargs.get('value_set', '')
        observed_value = outcome.get('observed_value', '')

        # Only diff when both sides are real collections: the '' placeholder
        # used for a missing value set must not be treated as a set of
        # allowed values.
        if isinstance(observed_value, list) and isinstance(
            value_set, (list, tuple, set, frozenset)
        ):
            unexpected_value = [
                item for item in observed_value if item not in value_set
            ]
        else:
            unexpected_value = []

        records.append({
            "Success": result.get('success', ''),
            "Expectation Type": config.get('expectation_type', ''),
            "Column": kwargs.get('column', ''),
            "Column Pair": (kwargs.get('column_A', ''), kwargs.get('column_B', '')),
            "Max Value": kwargs.get('max_value', ''),
            "Min Value": kwargs.get('min_value', ''),
            "Element Count": outcome.get('element_count', ''),
            "Unexpected Count": outcome.get('unexpected_count', ''),
            "Unexpected Percent": outcome.get('unexpected_percent', ''),
            "Value Set": value_set,
            "Unexpected Value": unexpected_value,
            "Observed Value": observed_value,
        })

    # Build the frame once; concatenating inside the loop is quadratic.
    return pd.DataFrame(records, columns=column_names)

In [102]:
# Flatten the checkpoint result into a DataFrame with one row per expectation.
df_validation = get_validation_results(checkpoint_result)

In [103]:
# Display the flattened validation table.
df_validation

Unnamed: 0,Success,Expectation Type,Column,Column Pair,Max Value,Min Value,Element Count,Unexpected Count,Unexpected Percent,Value Set,Unexpected Value,Observed Value
0,True,expect_column_values_to_be_of_type,duration,"(, )",,,,,,,[],float64
1,True,expect_column_values_to_be_of_type,credit_amount,"(, )",,,,,,,[],float64
2,True,expect_column_values_to_be_of_type,installment_commitment,"(, )",,,,,,,[],float64
3,True,expect_column_values_to_be_of_type,residence_since,"(, )",,,,,,,[],float64
4,True,expect_column_values_to_be_of_type,age,"(, )",,,,,,,[],float64
5,True,expect_column_values_to_be_of_type,existing_credits,"(, )",,,,,,,[],float64
6,True,expect_column_values_to_be_of_type,num_dependents,"(, )",,,,,,,[],float64
7,True,expect_column_distinct_values_to_be_in_set,checking_status,"(, )",,,,,,"[<0, 0<=X<200, no checking, >=200]",[],"[0<=X<200, <0, >=200, no checking]"
8,True,expect_column_distinct_values_to_be_in_set,credit_history,"(, )",,,,,,"[critical/other existing credit, existing paid...",[],"[all paid, critical/other existing credit, del..."
9,True,expect_column_distinct_values_to_be_in_set,purpose,"(, )",,,,,,"[radio/tv, education, furniture/equipment, new...",[],"[business, domestic appliance, education, furn..."


In [2]:
import mlflow

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

def Model_Logistic(X_train, y_train, tracking_uri="http://127.0.0.1:5000/",
                   experiment_name="mlflow_first_example"):
    """Fit a logistic-regression classifier inside an MLflow run.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        tracking_uri: MLflow tracking server URI. Defaults to the local
            server address this notebook used.
        experiment_name: MLflow experiment the run is recorded under.

    Returns:
        The fitted ``LogisticRegression`` (``solver='lbfgs'``,
        ``random_state=42``).
    """
    mlflow.set_tracking_uri(tracking_uri)
    # Select the experiment *before* starting the run; a run started first is
    # recorded under whatever experiment was active at that moment.
    mlflow.set_experiment(experiment_name)

    model = LogisticRegression(solver='lbfgs', random_state=42)
    # The context manager closes the run on exit, so no explicit end_run()
    # call is needed. The model that is returned is the one trained in the run.
    with mlflow.start_run(run_name="tracking experiment_1", description='checking model'):
        model.fit(X_train, y_train)
    return model


def Model_randomForest(X_train, y_train):
    """Fit a random-forest classifier on the training data.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        The fitted ``RandomForestClassifier`` (100 estimators,
        ``random_state=42``).
    """
    forest = RandomForestClassifier(random_state=42, n_estimators=100)
    forest.fit(X_train, y_train)
    return forest

def Model_decisionTree(X_train, y_train):
    """Fit a decision-tree classifier on the training data.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        The fitted ``DecisionTreeClassifier`` (``random_state=42``).
    """
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)
    return tree