# Load Brunel library and others

In [2]:
%AddJar -magic file:///u/wmlscors/iml-library/brunel/spark-kernel-brunel-all-2.3.jar -f

Starting download from file:///u/wmlscors/iml-library/brunel/spark-kernel-brunel-all-2.3.jar
Finished download of spark-kernel-brunel-all-2.3.jar


In [3]:
import org.apache.spark.ml.feature.{StringIndexer,IndexToString, VectorIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.classification.LogisticRegression
import com.ibm.analytics.ngp.ingest.Sampling
import com.ibm.analytics.ngp.pipeline._
import org.apache.spark.ml.PipelineStage
import com.ibm.analytics.ngp.util._
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary

# Load DB2 Data into DataFrame

In [5]:
// Connection settings are read from the environment so that no credentials
// are written into the notebook itself. sys.env(...) throws if a variable is unset.
val jdbcUser = sys.env("SFJDBC_USER")
val jdbcPass = sys.env("SFJDBC_PASS")
val jdbcHost = sys.env("SFJDBC_HOST")
val jdbcUrl = s"jdbc:db2://${jdbcHost}:448/RDBNDW00"

// Read the MLZ.CRDATA table through the DB2 JDBC driver into a DataFrame.
val df = spark.read.format("jdbc").
  option("driver", "com.ibm.db2.jcc.DB2Driver").
  option("url", jdbcUrl).
  option("user", jdbcUser).
  option("password", jdbcPass).
  option("dbtable", "MLZ.CRDATA").
  load()

In [6]:
// Preview the first five rows to check that the JDBC load returned the expected columns.
df.show(numRows = 5)

+------+---+------+-----+-----------------+--------+----------------+
|RATING|AGE|INCOME|CARDS|        EDUCATION|CARLOANS|CREDITREIMBURSED|
+------+---+------+-----+-----------------+--------+----------------+
|    50| 31| 10018|    5|Elementary school|       2|               1|
|    50| 28| 10053|    3|Elementary school|       2|               0|
|   820| 27| 10179|    3|Elementary school|       2|               0|
|    50| 44| 10204|    5|Elementary school|       2|               0|
|    50| 26| 10267|    3|Elementary school|       2|               0|
+------+---+------+-----+-----------------+--------+----------------+
only showing top 5 rows



In [4]:
// Sanity-check the JDBC environment without echoing the secret itself:
// cell output is saved with the notebook, so a printed password leaks to every reader.
// Missing variables are reported instead of aborting the cell with an exception.
println(s"SFJDBC_PASS is ${if (sys.env.contains("SFJDBC_PASS")) "set" else "NOT set"}")
println(s"SFJDBC_USER=${sys.env.getOrElse("SFJDBC_USER", "<not set>")}")
println(s"SFJDBC_HOST=${sys.env.getOrElse("SFJDBC_HOST", "<not set>")}")


[output redacted: this cell printed database credentials]

# Analyse Data

In [7]:
%%brunel 
data('df') bar x(Age) y(CreditReimbursed) mean(CreditReimbursed) stack axes(x:'Age' ,y:'Percent of reimbursed credit') tooltip(#all)

In [6]:
%%brunel 
data('df') x(Age) y(income) color(CreditReimbursed) tooltip(#all)

In [6]:
%%brunel 
data('df') bar color (CreditReimbursed) x(Age) y(#count) stack tooltip(#all) axes(x:'Age' ,y:'# Credit')

In [8]:
%%brunel 
data('df') bar color (CreditReimbursed) x(cards) y(#count) stack tooltip(#all) axes(x:'Number Credit Card' ,y:'Number of Credit')

In [9]:
%%brunel 
data('df') bar  x(cards) y(CreditReimbursed) mean(CreditReimbursed) stack axes(x:'#Number of credit cards' ,y:'Percent of reimbursed credit')

In [10]:
%%brunel 
data('df') bar color (CreditReimbursed) x(CarLoans) y(#count) stack axes(x:'Number of loans' ,y:'Number of credit')

In [11]:
%%brunel 
data('df') bar  x(CarLoans) y(CreditReimbursed) mean(CreditReimbursed) stack axes(x:'Number of loans' ,y:'Percent of reimbursed credit')

# Prepare Training & Testing Data

# Construct the feature vector

In [7]:
// Index the string-valued EDUCATION column into the numeric column "Education_Index".
val educationIndexer = new StringIndexer().setInputCol("EDUCATION").setOutputCol("Education_Index")

// Input columns for the feature vector; the indexed education column stands in for the raw string.
val featureColumns = Array("AGE", "INCOME", "CARDS", "CARLOANS", "Education_Index")

// Combine the input columns into the single vector column "features".
val assembler = new VectorAssembler().
  setInputCols(featureColumns).
  setOutputCol("features")

# IBMSparkPipeline

In [8]:
// Split sizes handed to Sampling.trainingSplit
// (presumably percentages of df — confirm against the IBM ML API).
val train = 80
val test = 20

// trainingSplit returns a pair: first element is the training set, second the test set.
val splits = Sampling.trainingSplit(df, train, test)
val trainDF = splits._1
val testDF = splits._2

// Cache the training split: it is read by the preview below and again by later cells.
trainDF.cache()

// show() prints the rows itself and returns Unit, so wrapping it in println
// only appended a stray "()" line to the output.
trainDF.show(5)

+------+---+------+-----+-------------------+--------+----------------+
|RATING|AGE|INCOME|CARDS|          EDUCATION|CARLOANS|CREDITREIMBURSED|
+------+---+------+-----+-------------------+--------+----------------+
|     0| 69| 41539|    1|      Master degree|       1|               0|
|     1| 65| 20039|    2|    Bachelor degree|       1|               0|
|     2| 60| 95750|    2|      Master degree|       1|               1|
|     3| 61| 89598|    2|      Master degree|       1|               1|
|     5| 62| 55821|    2|Professional degree|       1|               0|
+------+---+------+-----+-------------------+--------+----------------+
only showing top 5 rows

()


In [9]:
import org.apache.spark.ml.feature.SQLTransformer

// Select the model input columns and expose CREDITREIMBURSED, cast to DOUBLE, as "label".
// RATING is not selected and therefore does not reach the training frame.
val sqlTrans = new SQLTransformer().
  setStatement("SELECT AGE,INCOME,CARDS,EDUCATION,CARLOANS, CAST(CREDITREIMBURSED AS DOUBLE) AS label FROM __THIS__")

// Apply the statement to the training split and cache the result for fitting.
val IBMtrainDF = sqlTrans.
  transform(trainDF).
  cache()

In [10]:
// Logistic regression estimator: at most 500 iterations, regularization parameter 0.0001.
val lr = new LogisticRegression().
  setMaxIter(500).
  setRegParam(0.0001)

// Stages in order: index EDUCATION, assemble the feature vector, fit the classifier.
val pipeline = new IBMSparkPipeline().
  setStages(Array(educationIndexer, assembler, lr))

// Fit the whole pipeline on the labelled training frame.
val model = pipeline.fit(IBMtrainDF)


# Save Model

In [17]:
// Host handed to Connections.setMetaServiceHost before saving.
// NOTE(review): hard-coded address, presumably environment-specific — confirm before reuse.
val metaServiceHost = "http://10.3.58.47:12501"

// Path the fitted model is saved under.
// NOTE(review): "TSOUSRxx" looks like a placeholder user id — confirm.
val modelPath = "TSOUSRxx/CreditScoringLogisticReg"

Connections.setMetaServiceHost(metaServiceHost)
model.save(modelPath)
println("model saved successfully")

model saved successfully
