# teradataml preparing the features and training the model.

## Connect to Vantage

In [1]:
#import the teradataml package for Vantage access
from teradataml import *
import getpass
from teradataml import display
#display.print_sqlmr_query=True
from sqlalchemy.sql.expression import select, case as case_when, func
from sqlalchemy import TypeDecorator, Integer, String
import warnings
warnings.filterwarnings('ignore')

In [2]:
Vantage = 'tdap1627t2.labs.teradata.com'
User = 'alice'
Pass = 'alice'

In [3]:
Vantage = '10.25.251.121'
User = 'USER10'
Pass = 'USER10'

In [4]:
#Pass = getpass.getpass(prompt="pwd: ")

In [5]:
print(Vantage,User)

10.25.251.121 USER10


In [25]:
con = create_context(Vantage, User, Pass)

## Prepare the explain text feature

In [27]:
# Data Set Selection
dbqlog = DataFrame.from_table(in_schema("dbc", "dbqlogtbl")).drop("ZoneId", axis = 1)
dbqlexplain = DataFrame.from_table(in_schema("dbc", "dbqlexplaintbl")).drop("ZoneID", axis = 1)
dbqldata = dbqlog.join(other = dbqlexplain, on = ["QueryID"], lsuffix = "t1", rsuffix = "t2") \
    .select(['t1_QueryID','ExplainText','QueryBand','QueryText'])

In [28]:
# Workaround until ELE-2072.
dbqldata.to_sql('prediction_sentiment', if_exists="replace")
dbqldata = DataFrame.from_table('prediction_sentiment')

## Setup training and join condition features

In [29]:
# Feature Extraction
df_select_query_column_projection = [
     dbqldata.t1_QueryID.expression.label("queryid"),
     dbqldata.ExplainText.expression.label("explaintext"),
     dbqldata.QueryBand.expression.label("queryband"),
     func.REGEXP_SUBSTR(dbqldata.QueryBand.expression, 
                        '(collected_statistics|no_statistics)', 1, 1, 'i').label("training"),
     func.REGEXP_SUBSTR(dbqldata.QueryText.expression, 
                        'SELECT', 1, 1, 'i').label("select_info"),
     func.REGEXP_SUBSTR(func.REGEXP_SUBSTR(dbqldata.ExplainText.expression, 
                        '(joined using a *[A-z \-]+ join,)', 1, 1, 'i'), 
                            '[A-z]+', 15, 1, 'i').label("join_condition")]

In [30]:
training_data = DataFrame.from_query(str(select(df_select_query_column_projection)
                                 .where(Column('join_condition') != None)
                                 .where(Column('training') != None)
                                 .compile(compile_kwargs={"literal_binds": True})))

## Filter to provide the training set and testing set

In [31]:
# Establish training data
training_data = DataFrame.from_query(str(select(df_select_query_column_projection)
                                 .compile(compile_kwargs={"literal_binds": True})))
data_filter = (training_data.join_condition != None) & (training_data.training != None)  \
        & (training_data.select_info != None)
data_set = training_data[data_filter]

In [32]:
# Split data set into training and testing sets
training_set = Sampling(data = data_set, sample_fraction = 0.5, seed = 2).result
testing_set = data_set.join(other = training_data, on = ["queryid<>queryid"], 
                            lsuffix = "t1", rsuffix = "t2")

In [33]:
training_set[training_set.training == 'collected_statistics'].count()

   count_queryid  count_explaintext  count_queryband  count_training  count_select_info  count_join_condition
0              0                  0                0               0                  0                     0

In [34]:
#testing = tsample.result.join(other = training_data, on = ["queryid<>queryid"], lsuffix = "t1", rsuffix = "t2")

In [35]:
#training_set = tsample.result

In [None]:
# Custom sentiment dictionary
dictionary = DataFrame.from_table('dbql_sentiment')

In [36]:
# Extract Confidience 
Features = SentimentExtractor(
    #dict_data = dictionary,
    newdata = training_set,
    level = "document",
    text_column = "explaintext",
    accumulate = ['queryid','join_condition','training']
).result

In [37]:
#help(SentimentExtractor)

In [38]:
Features.count()

   count_queryid  count_join_condition  count_training  count_out_polarity  count_out_strength  count_out_sentiment_words
0              0                     0               0                   0                   0                          0

## These are the features for training the model

In [39]:
#features = td_sentiment_extractor_out.result

## Training and saving the model

In [40]:
# Train model
stats_model = NaiveBayes(
    formula="training ~ out_polarity + join_condition", 
    data=Features)

In [41]:
stats_model

############ STDOUT Output ############

Empty DataFrame
Columns: [class_nb, variable_nb, type_nb, category, cnt, sum_nb, sum_sq, total_cnt]
Index: []

In [23]:
td_save_model(model = stats_model, name = "Stats_collection_model_final")

NameError: name 'td_save_model' is not defined

In [42]:
stats_model.result.to_sql("stats_model_final", if_exists="replace")

In [71]:
target_collection = NaiveBayesPredict(newdata=Features,
                                       modeldata = stats_model,
                                       formula="training ~ out_polarity + join_condition", 
                                       id_col = "queryid",
                                       responses = ["collected_statistics","no_statistics"]
                                       )

In [72]:
summary = target_collection.result.join(other = Features, on = ["queryid"], lsuffix = "t1", 
                                        rsuffix = "t2")

In [73]:
ConfusionMatrix(data = summary, prediction = 'prediction', reference = 'training').accuracytable

                measure c_null collected_statistics no_statistics
0        Detection Rate      0               0.0952        0.5238
1           Sensitivity      0                    1             1
2        Pos Pred Value     NA               0.6667        0.6111
3            Prevalence  0.381               0.0952        0.5238
4  Detection Prevalence      0               0.1429        0.8571
5     Balanced Accuracy    0.5               0.9737          0.65
6        Neg Pred Value  0.619                    1             1
7           Specificity      1               0.9474           0.3

In [48]:
ConfusionMatrix(data = summary, prediction = 'prediction', reference = 'join_condition').accuracytable

                measure  c_null product   merge dynamic collected_statistics  nested no_statistics
0        Detection Rate       0       0       0       0                    0       0             0
1           Sensitivity       0       0       0       0                   NA       0            NA
2        Pos Pred Value      NA      NA      NA      NA                   NA      NA            NA
3            Prevalence  0.2857  0.0476  0.1905  0.1905                    0  0.2857             0
4  Detection Prevalence       0       0       0       0               0.0476       0        0.9524
5     Balanced Accuracy     0.5     0.5     0.5     0.5                   NA     0.5            NA
6        Neg Pred Value  0.7143  0.9524  0.8095  0.8095                   NA  0.7143            NA
7           Specificity       1       1       1       1               0.9524       1        0.0476

In [74]:
accuracy.accuracytable

NameError: name 'accuracy' is not defined

In [49]:
help(NaiveBayes)

Help on class NaiveBayes in module teradataml.analytics.mle.NaiveBayes:

class NaiveBayes(builtins.object)
 |  NaiveBayes(formula=None, data=None, data_sequence_column=None, data_order_column=None)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, formula=None, data=None, data_sequence_column=None, data_order_column=None)
 |      DESCRIPTION:
 |          The NaiveBayesMap and NaiveBayesReduce functions generate a model from
 |          training data. A virtual data frame of training data is input to
 |          the NaiveBayesMap function, whose output is the input to
 |          NaiveBayesReduce function, which outputs the model.
 |      
 |      
 |      PARAMETERS:
 |          formula:
 |              Required Argument.
 |              A string consisting of "formula". Specifies the model to be fitted. Only
 |              basic formula of the "col1 ~ col2 + col3 +..." form is supported and
 |              all variables must be from the same virtual data frame object. The
 |    

***