In [None]:
# Last amended: 23rd Sep, 2022
# My folder: /home/ashok/Documents/spark/2.ml/1.demo
##   Objectives:
##           i) Usage of StringIndexer, OneHotEncoder
##              and VectorAssembler
##          ii) Data Transformations
##         iii) Modeling
##



Broad Steps
1. Transform categorical data to integers (indices) using StringIndexer
2. Transform indices to OHE form
3. Transform target separately to integers (indices) using StringIndexer
4. Collect all numeric and OHE features in one place using VectorAssembler
5. Perform modeling

Small steps
1. Transform categorical data to integers (indices) using StringIndexer
> i) Create a list of categorical features<br>
>ii) Create a StringIndexer object<br>
>iii)Fit and transform using this object <br>

2. Transform indices to OHE form<br>
>i) Instantiate an OHE object<br>
>ii) Fit and transform indices created as a result of 1(iii) above<br>

### Transfer files to hadoop
Start hadoop and issue the following three commands

In [None]:
# 0.0 Transfer bplm.csv.gz to hadoop and check, as:


! hdfs dfs -rm hdfs://localhost:9000/user/ashok/bplm.csv.gz
! hdfs dfs -put /cdata/misc_datasets/bank_loan_modeling/bplm.csv.gz  hdfs://localhost:9000/user/ashok
! hdfs dfs -ls hdfs://localhost:9000/user/ashok   

### Call libraries

In [None]:
## 1.0 Call libraries
# 1.1   For transforming categorical data to integer and to dummy
#       And for collecting all features at one place


from pyspark.ml.feature import  StringIndexer, OneHotEncoder ,VectorAssembler

In [None]:
# 1.2 Logistic Regression modeling:


from pyspark.ml.classification import LogisticRegression

In [None]:
# 1.3 For evaluating results:


from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# 1.4 Configure the notebook shell: with ast_node_interactivity set
#     to "all", IPython displays the value of every top-level
#     expression in a cell, not only that of the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Read Data

In [None]:
# 2.0 Read bank data from hdfs into a Spark DataFrame.
#     `spark` is the SparkSession; it is not created in this
#     notebook, so the kernel must already provide it.
#     header=True      : take column names from the first line
#     inferSchema=True : let Spark work out the column data types

df = spark.read.csv(
    "hdfs://localhost:9000/user/ashok/bplm.csv.gz",
    header=True,
    inferSchema=True,
)


In [None]:
# 2.1 Display df data, few columns at a time:
#     first the leading 10 columns, then all remaining ones,
#     5 rows each.

for col_batch in (df.columns[:10], df.columns[10:]):
    df.select(col_batch).show(5)

In [None]:
# 2.1.1 Get data types of df columns.
#       Use this output to decide which columns go into
#       cat_cols and which into num_cols in step 2.2.

df.dtypes

In [None]:
# 2.2 Column-name lists used by all later steps:
#       cat_cols   : categorical columns to be string-indexed
#       index_cols : names for the indexed columns; same order
#                    as cat_cols (cat_cols[i] => index_cols[i])
#       ohe_cols   : names for the one-hot-encoded columns; same
#                    order as index_cols
#       num_cols   : numeric columns, used as they are
#
#     NOTE(review): 'mortgage' is treated as categorical here;
#     check against df.dtypes that this is intended -- TODO confirm.

cat_cols = [
    'education',
    'mortgage',
    'personal_loan',
    'securitiesaccount',
]

index_cols = [
    'e11',
    'm11',
    'p11',
    's11',
]

ohe_cols = [
    'e11vec',
    'm11vec',
    'p11vec',
    's11vec',
]

num_cols = [
    'age',
    'experience',
    'income',
    'family',
    'cc_avg',
]

### StringIndex cat columns
Index the categorical columns listed in `cat_cols` with StringIndexer

In [None]:
# 2.3   Integer-index the categorical columns.
# 2.3.1 Instantiate the StringIndexer class: one indexer handles
#       all columns, reading cat_cols and writing index_cols.

si = StringIndexer(inputCols=cat_cols, outputCols=index_cols)

In [None]:
# 2.3.2 Train/fit StringIndexer object on df.
#       The fitted object is kept in `model` and is used
#       in the next step to transform df.

model = si.fit(df)

In [None]:
# 2.3.3 Transform data with the fitted StringIndexer.
#       df is reassigned to the transformed DataFrame.
#       Nothing is displayed here; use df.show() to observe.
#       NOTE(review): because df is overwritten, re-running this
#       cell presumably needs df to be re-read first (step 2.0)
#       -- TODO confirm.

df = model.transform(df)

### OneHotEncode

In [None]:
# 3.0 One hot encoding of string indexed columns:
#     Instantiate OneHotEncoder class.
#     Input columns are the indexed columns (index_cols); output
#     names come from ohe_cols, the same list that is later given
#     to VectorAssembler. Using the variable rather than repeating
#     the literal names keeps both steps in agreement.

ohe = OneHotEncoder(
                    inputCols = index_cols,
                    outputCols = ohe_cols
                   )


In [None]:
# 3.1 Fit the OneHotEncoder on df; the fitted
#     object is kept in `model_ohe`:

model_ohe = ohe.fit(df)



In [None]:
# 3.2 Transform the data using model_ohe.
#     df is reassigned to the transformed DataFrame:

df = model_ohe.transform(df)


### StringIndex target

In [None]:
# 4.0 Index the target separately from the predictors.
#     The target column is 'creditcard'; its indexed form is
#     written to a column named 'label', as is customary.
#     Instantiate StringIndexer:

si_t = StringIndexer(inputCol='creditcard', outputCol='label')

In [None]:
# 4.1 Fit StringIndexer object on target;
#     the fitted object is kept in `model_label`:

model_label = si_t.fit(df)



In [None]:
# 4.2 Transform target using model_label.
#     df is reassigned to the transformed DataFrame:

df = model_label.transform(df)

### VectorAssembling

In [None]:
# 5.0 Vector-assemble the predictors: the one-hot-encoded
#     columns followed by the numeric columns are collected
#     into a single column named 'features'.

vc = VectorAssembler(
    inputCols=[*ohe_cols, *num_cols],
    outputCol='features',
)

In [None]:
# 5.1 Transform df using the VectorAssembler object.
#     transform() is called on vc directly; no fit() step
#     is used for it in this notebook.

df = vc.transform(df)

In [None]:
# 5.2 Show first 3 rows of the transformed df (all columns):

df.show(3)

### Modeling
Develop a LogisticRegression model

In [None]:
# 6.0 Instantiate Logistic Regression class.
#     The column names below are the class defaults; they are
#     spelled out to show which columns of df the model reads:
#     'features' from the VectorAssembler step and 'label' from
#     the target-indexing step.

lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)


In [None]:
# 6.1 Train/fit the LogisticRegression object on df.
#     The whole of df is used for training; there is no
#     train/test split in this notebook.

model_lr = lr.fit(df)

### Predictions and evaluations

In [None]:
# 7.0 Make predictions on df itself (the same data the
#     model was trained on) using the transform() method.
#     (Predictions for a DataFrame are obtained through
#     transform(), not through a predict() call.)
#     df is reassigned to the DataFrame with predictions.

df = model_lr.transform(df)


In [None]:
# 7.1 See the prediction dataframe: first 3 rows, all columns,
#     including those added by model_lr.transform()

df.show(3)



In [None]:
# 8.  Evaluate the model with the BinaryClassificationEvaluator class.
#     The evaluator reads two columns of df, (rawPrediction, label),
#     and computes the metric named by 'metricName'.
#     The values given below are the class defaults, written out
#     so that the dependence on df's column names is visible.

# 8.1 Instantiate evaluator class

bc = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

# 8.2 Evaluate to return AUC (computed on df, i.e. on the
#     same data that the model was trained on)

bc.evaluate(df)

