In [None]:
!pip install pyspark

In [None]:
# Install the headless OpenJDK 8 package with apt-get; -qq keeps apt quiet and
# standard output is discarded. JAVA_HOME is pointed at this JDK in a later cell.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz

In [None]:
# Unpack the Spark tarball into the current working directory; the extracted
# folder name is what SPARK_HOME refers to in the next cell.
!tar xf spark-3.3.0-bin-hadoop2.tgz

In [None]:
import os

# Location of the JDK installed by the apt-get cell above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve the extracted Spark folder to an absolute path so that SPARK_HOME
# stays valid even if the notebook's working directory changes later on.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.3.0-bin-hadoop2")

In [None]:
!pip install petastorm

In [None]:
!pip install findspark

In [None]:
!pip install tensorflow_decision_forests

In [None]:
import findspark
# Called without arguments; SPARK_HOME was exported to the environment in an
# earlier cell. NOTE(review): presumably findspark reads SPARK_HOME to make the
# extracted Spark's Python bindings importable — confirm against findspark docs.
findspark.init()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the CSV loaded below is
# read from a path underneath this mount point.
drive.mount('/content/drive')

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): "spark.csci316.pandas.randomforest" does not look like a
# built-in Spark setting; nothing visible in this notebook reads it — confirm
# whether it is needed.
spark = (
    SparkSession.builder
    .config("spark.csci316.pandas.randomforest", "false")
    .getOrCreate()
)
spark

In [None]:
# Load the cleaned CSV from the mounted Drive into a Spark DataFrame: the first
# row supplies the column names and Spark infers each column's type.
data = spark.read.csv(
    "/content/drive/MyDrive/CSCI316/cleanWithHeader.csv",
    header=True,
    inferSchema=True,
)

In [None]:
from petastorm.spark import SparkDatasetConverter, make_spark_converter
import tensorflow_decision_forests as tfdf
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
import math

In [None]:
# Preview the first 10 rows of the loaded Spark DataFrame.
data.show(10)

In [None]:
# Print the column names and the types inferred while reading the CSV.
data.printSchema()

In [None]:
# 70/30 train/validation split with a fixed seed; each resulting split is then
# redistributed across two partitions.
df_train, df_val = (
    split.repartition(2)
    for split in data.randomSplit([0.7, 0.3], seed=135)
)

In [None]:
# Display the leading rows of the training split (show() with no arguments
# uses Spark's default row limit).
df_train.show()

In [None]:
# Report how many rows ended up in each split.
print(f"train: {df_train.count()}, val: {df_val.count()}")

**Convert from PySpark to TensorFlow**

In [None]:
# Row counts of both splits, printed right before they are pulled into pandas.
# NOTE(review): this is the same statement as the earlier count cell and
# recomputes both counts — consider keeping only one of the two.
print(f"train: {df_train.count()}, val: {df_val.count()}")

In [None]:
# Materialise the training split as a pandas DataFrame (toPandas() brings all
# of its rows into local memory).
train_dataset = df_train.toPandas()

In [None]:
# Materialise the validation split as a pandas DataFrame; from here on it is
# referred to as the test set.
test_dataset = df_val.toPandas()

In [None]:
# Feature slice: the first 42 columns by position.
# NOTE(review): assumes the label is not among these 42 columns — confirm
# against the schema printed by data.printSchema().
x_train = train_dataset.iloc[:, 0:42]
# Select the label by name rather than by position so it is guaranteed to be
# the same "default_ind" column the model is trained on, whatever the column
# order of the CSV.
y_train = train_dataset["default_ind"]

In [None]:
# Class balance of the "default_ind" label in the training set.
train_dataset["default_ind"].value_counts()

In [None]:
# Feature slice: the first 42 columns by position.
# NOTE(review): assumes the label is not among these 42 columns — confirm
# against the schema printed by data.printSchema().
x_test = test_dataset.iloc[:, 0:42]
# Select the label by name rather than by position: y_test is later compared
# with predictions from a model trained on "default_ind", so it must be that
# exact column regardless of the CSV's column order.
y_test = test_dataset["default_ind"]

In [None]:
# Peek at the first rows of the positional feature slice.
x_train.head()

In [None]:
# Inspect the last few training label values.
y_train.tail()

In [None]:
# Number of distinct label values present in the training set.
y_train.nunique()

In [None]:
# Row identifiers must not be used as predictors: they describe the record,
# not the borrower, and let the trees memorise artefacts of how rows were
# numbered. Drop them before building the TensorFlow datasets.
# errors="ignore" keeps the cell working if a column is already absent.
ID_COLUMNS = ["id", "member_id"]

# Convert the pandas frames into TensorFlow datasets, with "default_ind" as
# the label and every remaining column as a feature.
tf_train = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_dataset.drop(columns=ID_COLUMNS, errors="ignore"),
    label="default_ind",
)
tf_test = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_dataset.drop(columns=ID_COLUMNS, errors="ignore"),
    label="default_ind",
)

In [None]:

# Print the result of taking one element from the training dataset.
# NOTE(review): take(1) yields another dataset object, so this presumably
# prints its description rather than actual batch values — confirm intent.
print(tf_train.take(1))

In [None]:
# Show the concrete Python type of the converted training dataset.
type(tf_train)

In [None]:
# Gradient-boosted trees classifier from TensorFlow Decision Forests, created
# with all default hyper-parameters.
model_1 = tfdf.keras.GradientBoostedTreesModel()

# Train the model on the TensorFlow training dataset.
model_1.fit(tf_train)

Use /tmp/tmpxpb6jnoi as temporary training directory
Reading training dataset...
Training dataset read in 0:00:18.506094. Found 598745 examples.
Training model...


In [None]:
# Attach the accuracy metric so the evaluate() call in the next cell reports it.
model_1.compile(metrics=["accuracy"])

In [None]:
# Score the trained model on the held-out dataset; the result is printed in
# the next cell as a two-element list (loss value, then accuracy).
evaluation = model_1.evaluate(tf_test)



In [None]:
# Show the values returned by evaluate().
print(evaluation)

[0.0, 0.9980233311653137]


In [None]:
# Evaluation record stored inside the trained model, read via its inspector.
# NOTE(review): presumably computed during training on data held out by the
# learner itself, not on tf_test — confirm against the TF-DF documentation.
model_1.make_inspector().evaluation()

Evaluation(num_examples=None, accuracy=0.9982679486274719, loss=0.01549600064754486, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)

In [None]:
# Per-feature importance scores reported by the model inspector, grouped by
# importance measure (e.g. SUM_SCORE).
model_1.make_inspector().variable_importances()

{'SUM_SCORE': [("recoveries" (1; #27), 59427.339578363746),
  ("funded_amnt" (1; #8), 22917.928952304675),
  ("total_rec_prncp" (1; #39), 20991.41198645875),
  ("last_pymnt_amnt_imputed" (1; #17), 2531.328070260977),
  ("id" (1; #12), 2413.482875532965),
  ("out_prncp_inv" (1; #22), 2337.1828587343534),
  ("out_prncp" (1; #21), 1944.550178086155),
  ("total_rec_int" (1; #37), 192.88649803151094),
  ("total_rec_late_fee" (1; #38), 161.58042130123394),
  ("funded_amnt_inv" (1; #9), 116.87479117118316),
  ("installment" (1; #15), 78.67384391905624),
  ("member_id" (1; #19), 57.34374533033689),
  ("total_pymnt" (1; #35), 47.76893257227704),
  ("loan_amnt" (1; #18), 30.29074888687392),
  ("total_pymnt_inv" (1; #36), 28.964883640822517),
  ("int_rate" (1; #16), 13.154074830777331),
  ("term_index" (1; #31), 10.483681754370306),
  ("total_rev_hi_lim_imputed" (1; #40), 10.278072235044933),
  ("tot_cur_bal_imputed" (1; #33), 9.852878550430665),
  ("annual_inc" (1; #1), 9.406792353179931),
  ("d

In [None]:
# Model outputs for every example of the test dataset.
probs = model_1.predict(tf_test)



In [None]:
# Turn the model outputs into hard class labels by rounding to 0 decimals.
y_pred = np.round(probs, 0)

In [None]:
# Number of predictions produced (one per test example).
len(y_pred)

256998

In [None]:
# Distinct predicted label values after rounding.
np.unique(y_pred)

array([0., 1.], dtype=float32)

In [None]:
# Per-class precision, recall and F1 of the rounded predictions against the
# true test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    243145
           1       1.00      0.97      0.98     13853

    accuracy                           1.00    256998
   macro avg       1.00      0.98      0.99    256998
weighted avg       1.00      1.00      1.00    256998

