
## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is the Databricks File System, which lets you store data for querying inside Databricks. This notebook assumes that you already have a file in DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/DataSet_Hitelbiralat_joados.csv"
file_type = "csv"

# Read through the generic reader so that `file_type` actually selects the
# format (it used to be defined but ignored). The first line is treated as the
# header and column types are inferred from the data.
df = (
    spark.read.format(file_type)
    .option("header", True)
    .option("inferSchema", True)
    .load(file_location)
)

display(df)

id_client,id_shop,sex,marital_status,age,quant_dependants,education,flag_residencial_phone,area_code_residencial_phone,payment_day,shop_rank,residence_type,months_in_residence,flag_mothers_name,flag_fathers_name,flag_residence_town_working_town,flag_residence_state_working_state,months_in_the_job,profession_code,mate_income,flag_residencial_address_postal_address,flag_other_card,quant_banking_accounts,personal_reference_1,personal_reference_2,flag_mobile_phone,flag_contact_phone,personal_net_income,cod_application_booth,quant_additional_cards_in_the_application,flag_card_insurance_option,target
1,22,F,O,44,0,,N,31,12,0,P,12,Y,Y,N,Y,48,731,0.0,Y,N,0,VERA,LUCIA,N,N,300.0,0,0,N,1
2,15,F,S,18,0,,Y,31,20,0,P,216,Y,Y,Y,Y,12,853,0.0,Y,N,0,SARA,FELIPE,N,N,300.0,0,0,N,1
3,24,F,C,22,0,,Y,31,8,0,P,48,Y,N,Y,Y,12,40,0.0,Y,N,0,HELENA,DOMINGOS SOGRA,N,N,229.0,0,0,N,1
4,12,F,C,47,0,,N,31,25,0,P,180,Y,Y,N,Y,24,35,0.0,Y,N,0,JACI,VALERIA ALEXANDRA TRAJANO,N,N,304.0,0,0,N,1
5,16,F,S,28,0,,Y,31,25,0,O,12,Y,Y,Y,Y,12,24,0.0,Y,N,0,MARCIA CRISTINA ZANELLA,SANDRO L P MARTINS,N,N,250.0,0,0,N,1
6,24,M,S,26,0,,N,31,28,0,P,180,Y,Y,N,Y,0,999,0.0,Y,N,0,MARCIO,ANA,N,N,800.0,0,0,N,1
7,55,F,S,22,0,,Y,31,12,0,A,0,Y,Y,Y,Y,48,999,0.0,Y,N,0,FABIO (NOIVO),EDU (AVO),N,N,410.0,0,0,N,1
8,6,F,C,21,0,,Y,23,28,0,A,24,Y,Y,Y,Y,12,40,800.0,Y,N,0,OLIONA MARIA CAMPOS,ELIZETE CAMPS COELHO,N,N,248.0,0,0,N,1
9,3,F,S,27,0,,Y,31,20,0,A,0,Y,Y,Y,Y,0,950,0.0,Y,N,0,SUELI,REGINA,N,N,1000.0,0,0,N,0
10,23,F,C,57,0,,Y,31,12,0,P,24,Y,Y,N,Y,96,13,0.0,Y,N,0,MARIA DE LOURDES,ZILDA,N,N,856.0,0,0,N,1


In [0]:
from pyspark.sql import functions as F

dummy_cols_list = ['sex', 'marital_status', 'residence_type']
dummy_names = []

# Build 0/1 indicator columns for each categorical column listed above.
for col in dummy_cols_list:
    # Distinct non-null values of the column, in sorted order.
    distinct_rows = df.select(col).distinct().collect()
    levels = sorted(row[col] for row in distinct_rows if row[col] is not None)

    # The first sorted level gets no indicator column (it acts as the
    # reference category); every other level gets one.
    for val in levels[1:]:
        dummy_col_name = f"{col}_{val}"
        dummy_names.append(dummy_col_name)
        indicator = F.when(F.col(col) == val, 1).otherwise(0)
        df = df.withColumn(dummy_col_name, indicator)


In [0]:
# Raw columns fed to the model as-is, followed by the indicator columns whose
# names were collected in `dummy_names`.
raw_feature_cols = [
    'age',
    'months_in_residence',
    'months_in_the_job',
    'personal_net_income',
    'mate_income',
    'payment_day',
    'shop_rank',
    'profession_code',
    'quant_banking_accounts',
    'quant_additional_cards_in_the_application',
    'id_shop',
    'area_code_residencial_phone',
]
X_cols = [*raw_feature_cols, *dummy_names]

# Label column.
y_col = 'target'

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [0]:
# Combine the selected feature columns into a single vector column.
assembler = VectorAssembler(inputCols=X_cols, outputCol="features")

# Drop any output column left over from a previous run of this cell first:
# VectorAssembler refuses to transform a DataFrame that already contains its
# output column, so without this the cell fails when re-run. drop() is a no-op
# when the column is absent, so the first run is unaffected.
df = assembler.transform(df.drop(assembler.getOutputCol()))

In [0]:
# Split the rows into train/test sets with weights 0.8 / 0.2 and a fixed seed.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Logistic regression estimator reading the assembled vector and the label column.
lr = LogisticRegression(
    labelCol=y_col,
    featuresCol="features",
)

In [0]:
# Fit the estimator on the training split.
lr_model = lr.fit(dataset=train_df)

In [0]:
# Apply the fitted model to the held-out split.
predictions = lr_model.transform(dataset=test_df)

In [0]:
# Show the label next to the model inputs and outputs for a sample of rows.
predictions.select([y_col, "features", "prediction", "probability"]).show()

+------+--------------------+----------+--------------------+
|target|            features|prediction|         probability|
+------+--------------------+----------+--------------------+
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.18748842996958...|
|     1|(20,[0,2,3,5,7,10...|       1.0|[0.26422958114557...|
|     0|(20,[0,3,5,7,10,1...|       1.0|[0.29043400823445...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.15052745658140...|
|     0|(20,[0,1,2,3,5,7,...|       1.0|[0.37162782553359...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.12258568769092...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.14648152022203...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.23958900842570...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.05486869449470...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.30826448069528...|
|     1|(20,[0,1,3,4,5,7,...|       1.0|[0.09342478314502...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.10055191322670...|
|     1|(20,[0,1,2,3,5,7,...|       1.0|[0.33149263083537...|
|     1|

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve on the test predictions, computed from the
# model's rawPrediction column against the label column.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol=y_col,
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(dataset=predictions)
print(f"ROC-AUC: {roc_auc}")

ROC-AUC: 0.6573728794361311
