### Install pyspark
- java JDK 8 or 10 is required

In [None]:
!pip install pyspark

In [94]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .getOrCreate()

import hvplot.pandas

In [95]:
df = spark.read.format("csv").option("header", "true").load("cc_default.csv")

In [96]:
df.show()

+---+------------------+---+---------+--------+---+------------------+
| ID|  ln_balance_limit|sex|education|marriage|age|default_next_month|
+---+------------------+---+---------+--------+---+------------------+
|  1| 9.903487552536127|  1|        2|       0| 24|                 1|
|  2|11.695247021764184|  1|        2|       1| 26|                 1|
|  3|11.407564949312402|  1|        2|       1| 34|                 0|
|  4|10.819778284410283|  1|        2|       0| 37|                 0|
|  5|10.819778284410283|  0|        2|       0| 57|                 0|
|  6|10.819778284410283|  0|        1|       1| 37|                 0|
|  7|13.122363377404328|  0|        1|       1| 29|                 0|
|  8|11.512925464970229|  1|        2|       1| 23|                 0|
|  9| 11.84939770159144|  1|        3|       0| 28|                 0|
| 10| 9.903487552536127|  0|        3|       1| 35|                 0|
| 11|12.206072645530174|  1|        3|       1| 34|                 0|
| 12|1

In [97]:
df.createOrReplaceTempView("df")
spark.sql("SELECT * from df WHERE age > 40").show()

+---+------------------+---+---------+--------+---+------------------+
| ID|  ln_balance_limit|sex|education|marriage|age|default_next_month|
+---+------------------+---+---------+--------+---+------------------+
|  5|10.819778284410283|  0|        2|       0| 57|                 0|
| 12|12.468436909997665|  1|        1|       1| 51|                 0|
| 13|13.353475098367715|  1|        2|       1| 41|                 0|
| 18|12.676076274775909|  0|        1|       0| 49|                 0|
| 19|12.793859310432293|  1|        1|       0| 49|                 0|
| 29|10.819778284410283|  1|        3|       0| 47|                 0|
| 34|13.122363377404328|  1|        2|       0| 54|                 0|
| 35|13.122363377404328|  0|        1|       0| 58|                 0|
| 48|11.918390573078392|  1|        5|       1| 46|                 1|
| 51|11.156250521031495|  0|        3|       1| 42|                 1|
| 52|11.512925464970229|  1|        3|       0| 43|                 0|
| 53|1

In [98]:
df

DataFrame[ID: string, ln_balance_limit: string, sex: string, education: string, marriage: string, age: string, default_next_month: string]

In [99]:
for col in df.columns:
    df = df.withColumn(col, df[col].cast(IntegerType()))

In [100]:
df

DataFrame[ID: int, ln_balance_limit: int, sex: int, education: int, marriage: int, age: int, default_next_month: int]

In [101]:
df.write.parquet("output/cc_default.parquet")

In [102]:
df

DataFrame[ID: int, ln_balance_limit: int, sex: int, education: int, marriage: int, age: int, default_next_month: int]

In [103]:
df=spark.read.load('output/cc_default.parquet')
df.createOrReplaceTempView("df")
spark.sql("SELECT * from df WHERE age > (SELECT AVG(age) from df) ").show()

+---+----------------+---+---------+--------+---+------------------+
| ID|ln_balance_limit|sex|education|marriage|age|default_next_month|
+---+----------------+---+---------+--------+---+------------------+
|  4|              10|  1|        2|       0| 37|                 0|
|  5|              10|  0|        2|       0| 57|                 0|
|  6|              10|  0|        1|       1| 37|                 0|
| 12|              12|  1|        1|       1| 51|                 0|
| 13|              13|  1|        2|       1| 41|                 0|
| 18|              12|  0|        1|       0| 49|                 0|
| 19|              12|  1|        1|       0| 49|                 0|
| 21|              11|  1|        3|       1| 39|                 0|
| 22|              11|  1|        2|       0| 39|                 1|
| 24|              13|  1|        1|       0| 40|                 1|
| 29|              10|  1|        3|       0| 47|                 0|
| 34|              13|  1|        

In [104]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=["ln_balance_limit","sex","education", "marriage", "age"],
                                  outputCol="features")

### Whats the best K?

In [105]:
from pyspark.ml.clustering import KMeans

clust = KMeans(k=2, seed=1)

In [106]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vectorAssembler, clust])

model = pipeline.fit(df)

In [107]:
prediction = model.transform(df)
prediction.show()

+---+----------------+---+---------+--------+---+------------------+--------------------+----------+
| ID|ln_balance_limit|sex|education|marriage|age|default_next_month|            features|prediction|
+---+----------------+---+---------+--------+---+------------------+--------------------+----------+
|  1|               9|  1|        2|       0| 24|                 1|[9.0,1.0,2.0,0.0,...|         1|
|  2|              11|  1|        2|       1| 26|                 1|[11.0,1.0,2.0,1.0...|         1|
|  3|              11|  1|        2|       1| 34|                 0|[11.0,1.0,2.0,1.0...|         1|
|  4|              10|  1|        2|       0| 37|                 0|[10.0,1.0,2.0,0.0...|         1|
|  5|              10|  0|        2|       0| 57|                 0|[10.0,0.0,2.0,0.0...|         0|
|  6|              10|  0|        1|       1| 37|                 0|[10.0,0.0,1.0,1.0...|         1|
|  7|              13|  0|        1|       1| 29|                 0|[13.0,0.0,1.0,1.0...|  

In [108]:
prediction.toPandas().hvplot.scatter(x="sex", y="education", by="prediction")

In [109]:
from pyspark.ml.classification import GBTClassifier

In [156]:
classifier = GBTClassifier(maxIter=10, maxDepth=5,featuresCol='features', labelCol="default_next_month",featureSubsetStrategy="auto", seed=42)

In [157]:
pipeline = Pipeline(stages=[vectorAssembler, classifier])

In [158]:
model = pipeline.fit(df)

In [159]:
prediction = model.transform(df).toPandas()

In [160]:
prediction

Unnamed: 0,ID,ln_balance_limit,sex,education,marriage,age,default_next_month,features,rawPrediction,probability,prediction
0,1,9,1,2,0,24,1,"[9.0, 1.0, 2.0, 0.0, 24.0]","[0.1901458729262531, -0.1901458729262531]","[0.5939434665587429, 0.40605653344125714]",0.0
1,2,11,1,2,1,26,1,"[11.0, 1.0, 2.0, 1.0, 26.0]","[0.6133988793981734, -0.6133988793981734]","[0.773257615775933, 0.226742384224067]",0.0
2,3,11,1,2,1,34,0,"[11.0, 1.0, 2.0, 1.0, 34.0]","[0.6409151552003373, -0.6409151552003373]","[0.7827611746760004, 0.21723882532399963]",0.0
3,4,10,1,2,0,37,0,"[10.0, 1.0, 2.0, 0.0, 37.0]","[0.37840157729706764, -0.37840157729706764]","[0.6806592620157812, 0.3193407379842188]",0.0
4,5,10,0,2,0,57,0,"[10.0, 0.0, 2.0, 0.0, 57.0]","[0.38195688840350694, -0.38195688840350694]","[0.6822028533233047, 0.31779714667669534]",0.0
...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,12,0,3,0,39,0,"[12.0, 0.0, 3.0, 0.0, 39.0]","[0.7097360623336488, -0.7097360623336488]","[0.8052556488713468, 0.19474435112865318]",0.0
29996,29997,11,0,3,1,43,0,"[11.0, 0.0, 3.0, 1.0, 43.0]","[0.5739043067479568, -0.5739043067479568]","[0.7591104236588045, 0.24088957634119545]",0.0
29997,29998,10,0,2,1,37,1,"[10.0, 0.0, 2.0, 1.0, 37.0]","[0.4404313807370718, -0.4404313807370718]","[0.7070009745605348, 0.2929990254394652]",0.0
29998,29999,11,0,3,0,41,1,"[11.0, 0.0, 3.0, 0.0, 41.0]","[0.5066932581242688, -0.5066932581242688]","[0.7336823727101561, 0.2663176272898439]",0.0


In [161]:
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(prediction["default_next_month"], prediction["prediction"]))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      1.00      0.00      0.88      0.03      0.00     23364
          1       1.00      0.00      1.00      0.00      0.03      0.00      6636

avg / total       0.83      0.78      0.22      0.68      0.03      0.00     30000

