## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html), the Databricks File System, lets you store data for querying inside Databricks. This notebook assumes that you already have a file in DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [2]:
# Where the uploaded file lives in DBFS, and its format
file_location = "/FileStore/tables/Q4_2018.csv"
file_type = "csv"

# CSV reader settings (passed to the reader as string option values)
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

# These options apply to CSV files. For other file types, they will be ignored.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

display(df)

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardshi
p_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,109xx,NY,18.24,0,Apr-2001,1,,45.0,9,1,4341,10.3,34,w,2386.02,2386.02,167.02,167.02,113.98,53.04,0.0,0.0,0.0,Feb-2019,84.92,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,16901,2,2,1,2,2.0,12560,69.0,2,7,2137,28,42000,1,11,2,9,1878,34360.0,5.9,0,0,140.0,212,1,1,0,1.0,,2.0,,0,2,5,3,3,16,7,18,5,9,0.0,0,0,3,100.0,0.0,1,0,60124,16901,36500,18124,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,713xx,LA,26.52,0,Jun-1987,0,71.0,75.0,13,1,12315,24.2,44,w,29387.75,29387.75,1507.11,1507.11,612.25,894.86,0.0,0.0,0.0,Feb-2019,777.23,Mar-2019,Feb-2019,0,,1,Individual,,,,0,1208,321915,4,4,2,3,3.0,87153,88.0,4,5,998,57,50800,2,15,2,10,24763,13761.0,8.3,0,0,163.0,378,4,3,3,4.0,,4.0,,0,2,4,4,9,27,8,14,4,13,0.0,0,0,6,95.0,0.0,1,0,372872,99468,15000,94072,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,490xx,MI,10.51,0,Apr-2011,0,,,8,0,4599,19.1,13,w,4787.21,4787.21,353.89,353.89,212.79,141.1,0.0,0.0,0.0,Feb-2019,180.69,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,110299,0,1,0,2,14.0,7150,72.0,0,2,0,35,24100,1,5,0,4,18383,13800.0,0.0,0,0,87.0,92,15,14,2,77.0,,14.0,,0,0,3,3,3,4,6,7,3,8,0.0,0,0,0,100.0,0.0,0,0,136927,11749,13800,10000,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,IT Supervisor,10+ years,MORTGAGE,92000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,985xx,WA,16.74,0,Feb-2006,0,,,10,0,5468,78.1,13,w,3831.93,3831.93,286.71,286.71,168.07,118.64,0.0,0.0,0.0,Feb-2019,146.51,Mar-2019,Feb-2019,0,,1,Individual,,,,0,686,305049,1,5,3,5,5.0,30683,68.0,0,0,3761,70,7000,2,4,3,5,30505,1239.0,75.2,0,0,62.0,154,64,5,3,64.0,,5.0,,0,1,2,1,2,7,2,3,2,10,0.0,0,0,3,100.0,100.0,0,0,385183,36151,5000,44984,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,Mechanic,10+ years,MORTGAGE,57250.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,212xx,MD,26.35,0,Dec-2000,0,,,12,0,829,3.6,26,w,29339.02,29339.02,1423.21,1423.21,660.98,762.23,0.0,0.0,0.0,Feb-2019,731.78,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,116007,3,5,3,5,4.0,28845,89.0,2,4,516,54,23100,1,0,0,9,9667,8471.0,8.9,0,0,53.0,216,2,2,2,2.0,,13.0,,0,2,2,3,8,9,6,15,2,12,0.0,0,0,5,92.3,0.0,0,0,157548,29674,9300,32332,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,5550,5550,5550.0,36 months,15.02,192.45,C,C3,Director COE,10+ years,MORTGAGE,152500.0,Not Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,461xx,IN,37.94,0,Sep-2002,3,,,18,0,53854,48.1,44,w,5302.5,5302.5,377.95,377.95,247.5,130.45,0.0,0.0,0.0,Feb-2019,192.45,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,685749,1,7,2,3,4.0,131524,72.0,1,4,17584,58,111900,2,4,6,8,40338,23746.0,64.0,0,0,195.0,176,10,4,6,20.0,,3.0,,0,4,6,6,10,23,9,15,7,18,0.0,0,0,4,100.0,60.0,0,0,831687,185378,65900,203159,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,2000,2000,2000.0,36 months,17.97,72.28,D,D1,Account Manager,4 years,RENT,51000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,606xx,IL,2.4,0,Nov-2004,1,,,1,0,0,,9,w,1914.71,1914.71,141.56,141.56,85.29,56.27,0.0,0.0,0.0,Feb-2019,72.28,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,854,0,0,2,3,7.0,0,,0,1,0,100,0,0,0,1,4,854,,,0,0,169.0,40,23,7,0,,,1.0,,0,0,0,0,3,5,0,3,0,1,0.0,0,0,2,100.0,,0,0,854,854,0,0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,6000,6000,6000.0,36 months,13.56,203.79,C,C1,Assistant Director,10+ years,RENT,65000.0,Source Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,460xx,IN,30.1,0,Nov-1997,0,,,19,0,38476,69.3,37,w,5864.01,5864.01,201.53,201.53,135.99,65.54,0.0,0.0,0.0,Feb-2019,208.31,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,91535,0,5,0,1,23.0,53059,87.0,0,2,9413,74,55500,1,2,0,3,5085,3034.0,90.8,0,0,169.0,253,13,13,1,14.0,,13.0,,0,7,12,8,10,15,14,20,12,19,0.0,0,0,0,100.0,85.7,0,0,117242,91535,33100,61742,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,
,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,Legal Assistant III,10+ years,MORTGAGE,53580.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,327xx,FL,21.16,0,Aug-1998,1,32.0,,8,0,8018,35.2,38,w,4786.79,4786.79,353.89,353.89,213.21,140.68,0.0,0.0,0.0,Feb-2019,180.69,Mar-2019,Feb-2019,0,45.0,1,Individual,,,,0,0,41882,5,2,5,5,3.0,33864,98.0,1,6,3132,73,22800,2,1,4,12,5235,13786.0,35.9,0,0,145.0,244,6,3,3,6.0,33.0,2.0,32.0,2,4,5,5,10,20,6,15,5,8,0.0,0,0,6,78.9,60.0,0,0,57426,41882,21500,34626,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,6000,6000,6000.0,36 months,14.47,206.44,C,C2,,< 1 year,OWN,300000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,068xx,CT,17.43,1,Apr-2002,1,17.0,,38,0,65950,49.8,58,w,5730.2,5730.2,405.64,405.64,269.8,135.84,0.0,0.0,0.0,Feb-2019,206.44,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,349502,1,4,1,3,7.0,39961,45.0,1,12,15926,48,132500,2,2,2,15,9197,38683.0,60.6,0,0,166.0,200,4,4,1,4.0,,4.0,17.0,0,16,20,19,26,9,33,48,20,38,0.0,0,0,2,100.0,26.3,0,0,477390,105911,98300,89600,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [3]:
# Register the loaded DataFrame as a temporary view so it can be queried by name
# from SQL cells (the following %sql cell selects from this view).

temp_table_name = "Sample_data_leanding_club_csv"

df.createOrReplaceTempView(temp_table_name)

In [4]:
%sql

/* Query the created temp table in a SQL cell */

SELECT *
FROM `Sample_data_leanding_club_csv`

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardshi
p_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,109xx,NY,18.24,0,Apr-2001,1,,45.0,9,1,4341,10.3,34,w,2386.02,2386.02,167.02,167.02,113.98,53.04,0.0,0.0,0.0,Feb-2019,84.92,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,16901,2,2,1,2,2.0,12560,69.0,2,7,2137,28,42000,1,11,2,9,1878,34360.0,5.9,0,0,140.0,212,1,1,0,1.0,,2.0,,0,2,5,3,3,16,7,18,5,9,0.0,0,0,3,100.0,0.0,1,0,60124,16901,36500,18124,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,713xx,LA,26.52,0,Jun-1987,0,71.0,75.0,13,1,12315,24.2,44,w,29387.75,29387.75,1507.11,1507.11,612.25,894.86,0.0,0.0,0.0,Feb-2019,777.23,Mar-2019,Feb-2019,0,,1,Individual,,,,0,1208,321915,4,4,2,3,3.0,87153,88.0,4,5,998,57,50800,2,15,2,10,24763,13761.0,8.3,0,0,163.0,378,4,3,3,4.0,,4.0,,0,2,4,4,9,27,8,14,4,13,0.0,0,0,6,95.0,0.0,1,0,372872,99468,15000,94072,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,490xx,MI,10.51,0,Apr-2011,0,,,8,0,4599,19.1,13,w,4787.21,4787.21,353.89,353.89,212.79,141.1,0.0,0.0,0.0,Feb-2019,180.69,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,110299,0,1,0,2,14.0,7150,72.0,0,2,0,35,24100,1,5,0,4,18383,13800.0,0.0,0,0,87.0,92,15,14,2,77.0,,14.0,,0,0,3,3,3,4,6,7,3,8,0.0,0,0,0,100.0,0.0,0,0,136927,11749,13800,10000,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,IT Supervisor,10+ years,MORTGAGE,92000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,985xx,WA,16.74,0,Feb-2006,0,,,10,0,5468,78.1,13,w,3831.93,3831.93,286.71,286.71,168.07,118.64,0.0,0.0,0.0,Feb-2019,146.51,Mar-2019,Feb-2019,0,,1,Individual,,,,0,686,305049,1,5,3,5,5.0,30683,68.0,0,0,3761,70,7000,2,4,3,5,30505,1239.0,75.2,0,0,62.0,154,64,5,3,64.0,,5.0,,0,1,2,1,2,7,2,3,2,10,0.0,0,0,3,100.0,100.0,0,0,385183,36151,5000,44984,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,Mechanic,10+ years,MORTGAGE,57250.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,212xx,MD,26.35,0,Dec-2000,0,,,12,0,829,3.6,26,w,29339.02,29339.02,1423.21,1423.21,660.98,762.23,0.0,0.0,0.0,Feb-2019,731.78,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,116007,3,5,3,5,4.0,28845,89.0,2,4,516,54,23100,1,0,0,9,9667,8471.0,8.9,0,0,53.0,216,2,2,2,2.0,,13.0,,0,2,2,3,8,9,6,15,2,12,0.0,0,0,5,92.3,0.0,0,0,157548,29674,9300,32332,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,5550,5550,5550.0,36 months,15.02,192.45,C,C3,Director COE,10+ years,MORTGAGE,152500.0,Not Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,461xx,IN,37.94,0,Sep-2002,3,,,18,0,53854,48.1,44,w,5302.5,5302.5,377.95,377.95,247.5,130.45,0.0,0.0,0.0,Feb-2019,192.45,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,685749,1,7,2,3,4.0,131524,72.0,1,4,17584,58,111900,2,4,6,8,40338,23746.0,64.0,0,0,195.0,176,10,4,6,20.0,,3.0,,0,4,6,6,10,23,9,15,7,18,0.0,0,0,4,100.0,60.0,0,0,831687,185378,65900,203159,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,2000,2000,2000.0,36 months,17.97,72.28,D,D1,Account Manager,4 years,RENT,51000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,606xx,IL,2.4,0,Nov-2004,1,,,1,0,0,,9,w,1914.71,1914.71,141.56,141.56,85.29,56.27,0.0,0.0,0.0,Feb-2019,72.28,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,854,0,0,2,3,7.0,0,,0,1,0,100,0,0,0,1,4,854,,,0,0,169.0,40,23,7,0,,,1.0,,0,0,0,0,3,5,0,3,0,1,0.0,0,0,2,100.0,,0,0,854,854,0,0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,6000,6000,6000.0,36 months,13.56,203.79,C,C1,Assistant Director,10+ years,RENT,65000.0,Source Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,460xx,IN,30.1,0,Nov-1997,0,,,19,0,38476,69.3,37,w,5864.01,5864.01,201.53,201.53,135.99,65.54,0.0,0.0,0.0,Feb-2019,208.31,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,91535,0,5,0,1,23.0,53059,87.0,0,2,9413,74,55500,1,2,0,3,5085,3034.0,90.8,0,0,169.0,253,13,13,1,14.0,,13.0,,0,7,12,8,10,15,14,20,12,19,0.0,0,0,0,100.0,85.7,0,0,117242,91535,33100,61742,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,
,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,Legal Assistant III,10+ years,MORTGAGE,53580.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,327xx,FL,21.16,0,Aug-1998,1,32.0,,8,0,8018,35.2,38,w,4786.79,4786.79,353.89,353.89,213.21,140.68,0.0,0.0,0.0,Feb-2019,180.69,Mar-2019,Feb-2019,0,45.0,1,Individual,,,,0,0,41882,5,2,5,5,3.0,33864,98.0,1,6,3132,73,22800,2,1,4,12,5235,13786.0,35.9,0,0,145.0,244,6,3,3,6.0,33.0,2.0,32.0,2,4,5,5,10,20,6,15,5,8,0.0,0,0,6,78.9,60.0,0,0,57426,41882,21500,34626,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
,,6000,6000,6000.0,36 months,14.47,206.44,C,C2,,< 1 year,OWN,300000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,068xx,CT,17.43,1,Apr-2002,1,17.0,,38,0,65950,49.8,58,w,5730.2,5730.2,405.64,405.64,269.8,135.84,0.0,0.0,0.0,Feb-2019,206.44,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,349502,1,4,1,3,7.0,39961,45.0,1,12,15926,48,132500,2,2,2,15,9197,38683.0,60.6,0,0,166.0,200,4,4,1,4.0,,4.0,17.0,0,16,20,19,26,9,33,48,20,38,0.0,0,0,2,100.0,26.3,0,0,477390,105911,98300,89600,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [5]:
# Keep only the columns used in the rest of the analysis
df_s = df.select(
    'loan_amnt',
    'funded_amnt',
    'term',
    'int_rate',
    'addr_state',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'emp_length',
    'loan_status',
    'annual_inc',
    'purpose',
    'delinq_2yrs',
    'revol_util',
    'dti',
    'dti_joint',
    'total_acc',
    'num_tl_90g_dpd_24m',
)

In [6]:
# Print summary statistics for the selected columns
df_s.describe().show()

In [7]:
from pyspark.sql.functions import regexp_replace,regexp_extract,trim
from pyspark.sql.functions import col
# Strip the "months" unit from term and trim the whitespace that is left behind
# (e.g. "36 months" -> "36" rather than "36 "), so the value is a clean number string.
df_s=df_s.withColumn("term_cleaned",trim(regexp_replace(col("term"),"months",'')))

In [8]:
# Extract the first run of digits from emp_length (e.g. "10+ years" -> "10").
# NOTE(review): "< 1 year" and "1 year" both yield "1" with this pattern — confirm
# that treating them as the same value is acceptable.
df_s=df_s.withColumn("emplean_cleaned",regexp_extract(col("emp_length"),"\\d+",0))

In [9]:
# Compare the raw columns with their cleaned counterparts side by side
(
    df_s
    .select('term', 'term_cleaned', 'emp_length', 'emplean_cleaned')
    .show()
)

In [10]:
# Inspect the schema, including the two cleaned columns added above
df_s.printSchema()

In [11]:
# Create a temporary view that includes the newly added cleaned columns

table_name = 'loanstatus_sel'

df_s.createOrReplaceTempView(table_name)

In [12]:
# Covariance and correlation: start with the covariance of loan amount and annual income
df_s.stat.cov(col1='loan_amnt', col2='annual_inc')


In [13]:
# Correlation between loan amount and annual income
df_s.stat.corr(col1='loan_amnt', col2='annual_inc')

In [14]:
%sql
SELECT corr(loan_amnt, annual_inc)
FROM loanstatus_sel

"corr(CAST(loan_amnt AS DOUBLE), annual_inc)"
0.2010322533791464


In [15]:
# Contingency table of loan status against grade
df_s.stat.crosstab(col1='loan_status', col2='grade').show()

In [16]:
# Frequent items in the purpose and grade columns, using a minimum support of 0.3
freq = df_s.stat.freqItems(cols=['purpose', 'grade'], support=0.3)

In [17]:
# Collect the frequent-items result so the cell displays it
freq.collect()

In [18]:
%sql
SELECT purpose,
       count(*) AS count
FROM loanstatus_sel
GROUP BY purpose
ORDER BY count DESC

purpose,count
debt_consolidation,70603
credit_card,34961
home_improvement,7512
other,7094
major_purchase,2303
medical,1499
small_business,1051
car,1037
house,823
vacation,802


In [19]:
from pyspark.sql.functions import count,min,max,mean,stddev_pop,avg

In [20]:
# approxQuantile computes approximate quantiles; relError is the relative error
# passed to it. Start with the loosest setting tried in this notebook (0.10).
quantileProbs = [0.25, 0.50, 0.75, 0.90]
relError = 0.10
df_s.stat.approxQuantile(
    col='loan_amnt',
    probabilities=quantileProbs,
    relativeError=relError,
)

In [21]:
# Same quantiles with a tighter relative error (0.05)
quantileProbs = [0.25, 0.50, 0.75, 0.90]
relError = 0.05
df_s.stat.approxQuantile(
    col='loan_amnt',
    probabilities=quantileProbs,
    relativeError=relError,
)

In [22]:
# Same quantiles with a relative error of 0 (per the Spark documentation this
# requests exact quantiles, which can be expensive on large data)
quantileProbs = [0.25, 0.50, 0.75, 0.90]
relError = 0.00
df_s.stat.approxQuantile(
    col='loan_amnt',
    probabilities=quantileProbs,
    relativeError=relError,
)

In [23]:
from pyspark.sql.functions import isnan,when,count,col

# For every column, count the values that are NaN or NULL
df_s.select(
    [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in df_s.columns
    ]
).show()

In [24]:
# Drop rows whose loan_status is NULL, if any are present
df_s = df_s.dropna(how='all', subset=['loan_status'])

In [25]:
# Summary statistics for revol_util only
df_s.describe('revol_util').show()

In [26]:
# Helper used for imputation
def fill_avg(df,colname):
  """Return the average of one column as an aggregated DataFrame.

  Args:
    df: DataFrame containing the column.
    colname: Name of the column to average.

  Returns:
    The DataFrame produced by aggregating ``avg(colname)`` over ``df``;
    callers extract the value from it (e.g. with ``.first()[0]``).
  """
  column_only = df.select(colname)
  return column_only.agg(avg(colname))

In [27]:
# fill_avg returns an aggregated DataFrame here, not a number; the next cell rebinds
# rev_avg to the scalar value extracted with .first()[0].
rev_avg=fill_avg(df_s,'revol_util')


In [28]:
from pyspark.sql.functions import lit

# Pull the average out of the aggregated DataFrame as a plain value, then attach
# it to every row as a literal column so it can be used for imputation.
rev_avg = fill_avg(df_s, 'revol_util').head()[0]
df_s = df_s.withColumn('rev_avg', lit(rev_avg))


In [29]:
# Re-check the NaN/NULL counts per column now that rev_avg has been added
df_s.select(
    [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in df_s.columns
    ]
).show()

In [30]:
# Replace NULL revol_util values with the column average held in rev_avg
from pyspark.sql.functions import coalesce

df_s = df_s.withColumn(
    'revol_util',
    coalesce(df_s['revol_util'], df_s['rev_avg']),
)

In [31]:
# Use dti when present, otherwise fall back to dti_joint
df_s = df_s.withColumn(
    'dti_cleaned',
    coalesce(df_s['dti'], df_s['dti_joint']),
)

In [32]:
# Number of loans in each loan_status
df_s.groupBy('loan_status').count().show()

In [33]:
# Show the loans whose status is one of the delinquent / charged-off statuses
df_s.where(
    df_s.loan_status.isin(
        [
            'In Grace Period',
            'Charged Off',
            'Late (16-30 days)',
            'Late (31-120 days)',
        ]
    )
).show()

In [34]:
# Flag a loan as bad ('Yes') when its status is one of the listed statuses, else 'No'
df_s = df_s.withColumn(
    'bad_loan',
    when(
        df_s.loan_status.isin(
            [
                'In Grace Period',
                'Charged Off',
                'Late (16-30 days)',
                'Late (31-120 days)',
            ]
        ),
        'Yes',
    ).otherwise('No'),
)


In [35]:
# Number of loans in each bad_loan class
df_s.groupBy('bad_loan').count().show()

In [36]:
# Filter: show only the rows flagged as bad loans
df_s.filter(df_s['bad_loan'] == 'Yes').show()

In [37]:
# Inspect the schema before building the final DataFrame
df_s.printSchema()

In [38]:
# Build the final frame: drop the raw dti / dti_joint columns (superseded by
# dti_cleaned) and the constant rev_avg helper column, which was only needed to
# impute revol_util. revol_util is kept because it now holds the imputed values.
df_s_final=df_s.drop('rev_avg','dti','dti_joint')

In [39]:
# Inspect the schema of the final DataFrame
df_s_final.printSchema()

In [40]:
# Persist the final DataFrame as a Parquet-backed table

permanent_table_name = "lc_loan_data1"

# Write df_s_final (the frame built for this purpose) rather than the intermediate
# df_s, and overwrite any existing table so that re-running the notebook does not
# fail because the table already exists.
df_s_final.write.format('parquet').mode('overwrite').saveAsTable(permanent_table_name)