In [0]:
from pyspark.sql import SparkSession

# Start the Spark session for the loan analysis (getOrCreate hands back an
# existing session when one is already running)
spark = (
    SparkSession.builder
    .appName("Loan Analysis")
    .getOrCreate()
)

# Read the loan CSV: first row is the header, column types are inferred
file_path = '/FileStore/tables/loan.csv'
loan_df = spark.read.options(header=True, inferSchema=True).csv(file_path)

In [0]:
# 1. How many loans fall into each "Loan Category"
# NOTE(review): the recorded output lists both RESTAURANTS and RESTAURANT as
# separate categories — confirm whether they should be merged before counting
loans_by_category = loan_df.groupBy("Loan Category")
loans_by_category.count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
# 3. Count the people whose income is strictly above 60,000
income_above_60k = loan_df["Income"] > 60000
loan_df.where(income_above_60k).count()

Out[41]: 198

In [0]:
# 4. Count the people with at least 2 returned cheques AND income below 50,000
# (the " Returned Cheque" column name is written with a leading space)
has_two_plus_returns = loan_df[" Returned Cheque"] >= 2
income_below_50k = loan_df["Income"] < 50000
loan_df.where(has_two_plus_returns & income_below_50k).count()

Out[46]: 137

In [0]:
# 6. Count the people whose expenditure is strictly above 50,000
expenditure_above_50k = loan_df["Expenditure"] > 50000
loan_df.where(expenditure_above_50k).count()

Out[57]: 6

In [0]:
# 7. Count the members treated as eligible for a credit card:
#    income above 40,000 together with expenditure of at least 30,000
income_above_40k = loan_df["Income"] > 40000
expenditure_at_least_30k = loan_df["Expenditure"] >= 30000
loan_df.where(income_above_40k & expenditure_at_least_30k).count()

Out[59]: 140

In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for the credit card analysis; this rebinds `spark`
spark = (
    SparkSession.builder
    .appName("Credit Card Analysis")
    .getOrCreate()
)

# Read the credit card CSV: first row is the header, column types are inferred
file_path = '/FileStore/tables/credit_card.csv'  # Replace with your file path
credit_card_df = spark.read.options(header=True, inferSchema=True).csv(file_path)

In [0]:
# 1. Count the credit card users whose "Geography" value is exactly "Spain"
located_in_spain = credit_card_df["Geography"] == "Spain"
credit_card_df.where(located_in_spain).count()

Out[70]: 2477

In [0]:

# 2. Count the members who are eligible (credit score above 750) and active
credit_score_above_750 = credit_card_df["CreditScore"] > 750
# NOTE(review): the column is compared against Python True — confirm the
# inferred type of "IsActiveMember" (boolean vs 0/1 integer) is what is intended
is_active = credit_card_df["IsActiveMember"] == True
credit_card_df.where(credit_score_above_750 & is_active).count()

Out[100]: 832

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, sum, count

# Obtain the Spark session for the transaction analysis; this rebinds `spark`
spark = (
    SparkSession.builder
    .appName("Transaction Analysis")
    .getOrCreate()
)

# Read the transactions CSV: first row is the header, column types are inferred
file_path = '/FileStore/tables/txn.csv'  # Replace with your file path
txn_df = spark.read.options(header=True, inferSchema=True).csv(file_path)




In [0]:
# 1. Largest withdrawal amount across all transactions
# `max` here must be the one imported from pyspark.sql.functions (it shadows the
# Python builtin); the column name is written with surrounding spaces
max_withdrawal = max(" WITHDRAWAL AMT ").alias("MaxWithdrawal")
txn_df.agg(max_withdrawal).show()


+-------------+
|MaxWithdrawal|
+-------------+
|4.594475464E8|
+-------------+



In [0]:
# 2. Smallest withdrawal amount for each account number
withdrawals_by_account = txn_df.groupBy("Account No")
withdrawals_by_account.min(" WITHDRAWAL AMT ").show()

+-------------+---------------------+
|   Account No|min( WITHDRAWAL AMT )|
+-------------+---------------------+
|409000438611'|                  0.2|
|     1196711'|                 0.25|
|     1196428'|                 0.25|
|409000493210'|                 0.01|
|409000611074'|                120.0|
|409000425051'|                 1.25|
|409000405747'|                 21.0|
|409000493201'|                  2.1|
|409000438620'|                 0.34|
|409000362497'|                 0.97|
+-------------+---------------------+



In [0]:
# 3. Largest deposit amount for each account number
deposits_by_account = txn_df.groupBy("Account No")
deposits_by_account.max(" DEPOSIT AMT ").show()

+-------------+------------------+
|   Account No|max( DEPOSIT AMT )|
+-------------+------------------+
|409000438611'|          1.7025E8|
|     1196711'|             5.0E8|
|     1196428'|     2.119594422E8|
|409000493210'|             1.5E7|
|409000611074'|         3000000.0|
|409000425051'|             1.5E7|
|409000405747'|           2.021E8|
|409000493201'|         1000000.0|
|409000438620'|           5.448E8|
|409000362497'|             2.0E8|
+-------------+------------------+



In [0]:
# 4. Smallest deposit amount for each account number
deposits_by_account = txn_df.groupBy("Account No")
deposits_by_account.min(" DEPOSIT AMT ").show()

+-------------+------------------+
|   Account No|min( DEPOSIT AMT )|
+-------------+------------------+
|409000438611'|              0.03|
|     1196711'|              1.01|
|     1196428'|               1.0|
|409000493210'|              0.01|
|409000611074'|            1320.0|
|409000425051'|               1.0|
|409000405747'|             500.0|
|409000493201'|               0.9|
|409000438620'|              0.07|
|409000362497'|              0.03|
+-------------+------------------+



In [0]:
# 5. Total of the "BALANCE AMT" column for each account number
balances_by_account = txn_df.groupBy("Account No")
balances_by_account.sum("BALANCE AMT").show()

+-------------+--------------------+
|   Account No|    sum(BALANCE AMT)|
+-------------+--------------------+
|409000438611'|-2.49486577068339...|
|     1196711'|-1.60476498101275E13|
|     1196428'| -8.1418498130721E13|
|409000493210'|-3.27584952132095...|
|409000611074'|       1.615533622E9|
|409000425051'|-3.77211841164998...|
|409000405747'|-2.43108047067000...|
|409000493201'|1.0420831829499985E9|
|409000438620'|-7.12291867951358...|
|409000362497'| -5.2860004792808E13|
+-------------+--------------------+



In [0]:
# 6. How many transactions share each "VALUE DATE" (show() prints the first 20 rows)
# `count` is the aggregate imported from pyspark.sql.functions
transaction_count = count("*").alias("TransactionCount")
txn_df.groupBy("VALUE DATE").agg(transaction_count).show()

+----------+----------------+
|VALUE DATE|TransactionCount|
+----------+----------------+
| 23-Dec-16|             143|
|  7-Feb-19|              98|
| 21-Jul-15|              80|
|  9-Sep-15|              91|
| 17-Jan-15|              16|
| 18-Nov-17|              53|
| 21-Feb-18|              77|
| 20-Mar-18|              71|
| 19-Apr-18|              71|
| 21-Jun-16|              97|
| 17-Oct-17|             101|
|  3-Jan-18|              70|
|  8-Jun-18|             223|
| 15-Dec-18|              62|
|  8-Aug-16|              97|
| 17-Dec-16|              74|
|  3-Sep-15|              83|
| 21-Jan-16|              76|
|  4-May-18|              92|
|  7-Sep-17|              94|
+----------+----------------+
only showing top 20 rows

