<a href="https://colab.research.google.com/github/git-shashank-hp/Structured-ML-Credit-Card-Fraud-Detection-Project/blob/main/code_in_pyspark_ML_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import feature
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder \
    .appName("Spark ML Example") \
    .getOrCreate()


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
file_path = '/content/gdrive/MyDrive/Colab Notebooks/CCDP/fraudTrain.csv'

In [4]:
# Reading CSV data into a Spark DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show first few rows of data
df.show(5)


+---+---------------------+----------------+--------------------+-------------+------+---------+-------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-----------+--------+
|_c0|trans_date_trans_time|          cc_num|            merchant|     category|   amt|    first|   last|gender|              street|          city|state|  zip|    lat|     long|city_pop|                 job|       dob|           trans_num| unix_time|         merch_lat| merch_long|is_fraud|
+---+---------------------+----------------+--------------------+-------------+------+---------+-------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-----------+--------+
|  0|  2019-01-01 00:00:18|2703186189652095|fraud_Rippin, Kub...|     misc_net|  4.97| Jennifer|  Banks|     F|      561 Perry 

In [5]:
# Print schema to see column names and types
df.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)



In [6]:
row_count = df.count()
print(f"Number of rows: {row_count}")


Number of rows: 1296675


In [7]:
len(df.columns)

23

In [8]:
import math
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Initialize Spark session
spark = SparkSession.builder.appName("HaversineExample").getOrCreate()

# Haversine formula function to calculate distance between two latitudes and longitudes (in kilometers)
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Difference in coordinates
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    distance = R * c
    return distance

# Register the Haversine function as a UDF (User Defined Function)
haversine_udf = udf(haversine, DoubleType())

# Assuming you have already loaded the dataset `df`
# If you have more columns, let's make sure that we keep them all

# Example to load your dataset into DataFrame (replace 'file_path' with the actual path)
# Reading CSV data into a Spark DataFrame
df1 = spark.read.csv(file_path, header=True, inferSchema=True)

# Apply the Haversine UDF to calculate the distance and keep all original columns
df_with_distance = df1.withColumn("distance_from_merchant",
                                 haversine_udf("lat", "long", "merch_lat", "merch_long"))

# Show the resulting DataFrame with the calculated distance, along with all original columns
df_with_distance.show(truncate=False)


+---+---------------------+-------------------+----------------------------------------+-------------+------+-----------+---------+------+------------------------------+------------------------+-----+-----+-------+------------------+--------+---------------------------------------------+----------+--------------------------------+----------+------------------+------------------+--------+----------------------+
|_c0|trans_date_trans_time|cc_num             |merchant                                |category     |amt   |first      |last     |gender|street                        |city                    |state|zip  |lat    |long              |city_pop|job                                          |dob       |trans_num                       |unix_time |merch_lat         |merch_long        |is_fraud|distance_from_merchant|
+---+---------------------+-------------------+----------------------------------------+-------------+------+-----------+---------+------+------------------------------+---

In [9]:
# Print schema to see column names and types
df_with_distance.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- distance_from_merchant: double (nullable = true)



In [10]:
# Drop unwanted columns from df_with_distance
df_cleaned = df_with_distance.drop('_c0', 'merchant', 'cc_num', 'first',
                                   'state', 'last', 'trans_num', 'unix_time',
                                   'street', 'city', 'lat', 'long', 'merch_lat',
                                   'merch_long', 'zip')

# Show the cleaned DataFrame with the remaining columns
df_cleaned.show(truncate=False)


+---------------------+-------------+------+------+--------+---------------------------------------------+----------+--------+----------------------+
|trans_date_trans_time|category     |amt   |gender|city_pop|job                                          |dob       |is_fraud|distance_from_merchant|
+---------------------+-------------+------+------+--------+---------------------------------------------+----------+--------+----------------------+
|2019-01-01 00:00:18  |misc_net     |4.97  |F     |3495    |Psychologist, counselling                    |1988-03-09|0       |78.59756848823062     |
|2019-01-01 00:00:44  |grocery_pos  |107.23|F     |149     |Special educational needs teacher            |1978-06-21|0       |30.212175719210443    |
|2019-01-01 00:00:51  |entertainment|220.11|M     |4154    |Nature conservation officer                  |1962-01-19|0       |108.20608258720067    |
|2019-01-01 00:01:16  |gas_transport|45.0  |M     |1939    |Patent attorney                         

In [11]:
from pyspark.sql.functions import to_timestamp

# Convert 'trans_date_trans_time' to timestamp
df_cleaned = df_cleaned.withColumn('trans_date_trans_time', to_timestamp('trans_date_trans_time'))

# Convert 'dob' to timestamp
df_cleaned = df_cleaned.withColumn('dob', to_timestamp('dob'))

# Show the updated DataFrame schema to verify the changes
df_cleaned.printSchema()

# Show the DataFrame to verify the conversions
df_cleaned.show(truncate=False)


root
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: timestamp (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- distance_from_merchant: double (nullable = true)

+---------------------+-------------+------+------+--------+---------------------------------------------+-------------------+--------+----------------------+
|trans_date_trans_time|category     |amt   |gender|city_pop|job                                          |dob                |is_fraud|distance_from_merchant|
+---------------------+-------------+------+------+--------+---------------------------------------------+-------------------+--------+----------------------+
|2019-01-01 00:00:18  |misc_net     |4.97  |F     |3495    |Psychologist, counselling                    |1988-03-09 00:00:00|0       |78.5

In [12]:
from pyspark.sql import functions as F
from pyspark.sql.functions import year, month, dayofmonth, current_date

# Extract individual date-time components from 'trans_date_trans_time'
df_cleaned = df_cleaned.withColumn('year', year('trans_date_trans_time'))
df_cleaned = df_cleaned.withColumn('Trans_month', month('trans_date_trans_time'))

# Extract individual date-time components from 'dob'
df_cleaned = df_cleaned.withColumn('birth_year', year('dob'))

# Calculate age by subtracting birth year from the current year
current_year = current_date().substr(1, 4).cast('int')  # Extract current year as integer
df_cleaned = df_cleaned.withColumn('age', current_year - df_cleaned['birth_year'])

# Show the updated DataFrame with the new columns
df_cleaned.show(truncate=False)


+---------------------+-------------+------+------+--------+---------------------------------------------+-------------------+--------+----------------------+----+-----------+----------+---+
|trans_date_trans_time|category     |amt   |gender|city_pop|job                                          |dob                |is_fraud|distance_from_merchant|year|Trans_month|birth_year|age|
+---------------------+-------------+------+------+--------+---------------------------------------------+-------------------+--------+----------------------+----+-----------+----------+---+
|2019-01-01 00:00:18  |misc_net     |4.97  |F     |3495    |Psychologist, counselling                    |1988-03-09 00:00:00|0       |78.59756848823062     |2019|1          |1988      |36 |
|2019-01-01 00:00:44  |grocery_pos  |107.23|F     |149     |Special educational needs teacher            |1978-06-21 00:00:00|0       |30.212175719210443    |2019|1          |1978      |46 |
|2019-01-01 00:00:51  |entertainment|220.11|M

In [13]:
from pyspark.sql.functions import col

# Convert 'year' and 'Trans_month' columns to string (PySpark equivalent of 'object' in Pandas)
df_cleaned = df_cleaned.withColumn('year', col('year').cast('string'))
df_cleaned = df_cleaned.withColumn('Trans_month', col('Trans_month').cast('string'))

# Show the updated DataFrame with the new column types
df_cleaned.printSchema()
df_cleaned.show(truncate=False)


root
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: timestamp (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- distance_from_merchant: double (nullable = true)
 |-- year: string (nullable = true)
 |-- Trans_month: string (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- age: integer (nullable = true)

+---------------------+-------------+------+------+--------+---------------------------------------------+-------------------+--------+----------------------+----+-----------+----------+---+
|trans_date_trans_time|category     |amt   |gender|city_pop|job                                          |dob                |is_fraud|distance_from_merchant|year|Trans_month|birth_year|age|
+---------------------+-------------+------+------+--------+----------------

In [14]:
# Drop specified columns from df_cleaned
df_cleaned = df_cleaned.drop('trans_date_trans_time', 'birth_year', 'dob', 'job')

# Show the cleaned DataFrame to verify the columns have been removed
df_cleaned.show(truncate=False)


+-------------+------+------+--------+--------+----------------------+----+-----------+---+
|category     |amt   |gender|city_pop|is_fraud|distance_from_merchant|year|Trans_month|age|
+-------------+------+------+--------+--------+----------------------+----+-----------+---+
|misc_net     |4.97  |F     |3495    |0       |78.59756848823062     |2019|1          |36 |
|grocery_pos  |107.23|F     |149     |0       |30.212175719210443    |2019|1          |46 |
|entertainment|220.11|M     |4154    |0       |108.20608258720067    |2019|1          |62 |
|gas_transport|45.0  |M     |1939    |0       |95.67323113819748     |2019|1          |57 |
|misc_pos     |41.96 |M     |99      |0       |77.5567436258178      |2019|1          |38 |
|gas_transport|94.63 |F     |2158    |0       |85.92264266264023     |2019|1          |63 |
|grocery_net  |44.54 |F     |2691    |0       |118.11977555909641    |2019|1          |31 |
|gas_transport|71.65 |M     |6018    |0       |12.766922541959126    |2019|1    

In [15]:
from pyspark.sql.functions import max

# Find the maximum value of 'city_pop' column
max_city_pop = df_cleaned.agg(max('city_pop')).collect()[0][0]

# Print the maximum value
print("Maximum value of 'city_pop':", max_city_pop)



Maximum value of 'city_pop': 2906700


In [16]:
from pyspark.sql import functions as F

# Define the population thresholds
rural_threshold = 1000000  # Population < 1 million
metropolitan_threshold = 5000000  # Population > 5 million

# Create a new 'city_type' column based on population
df_cleaned = df_cleaned.withColumn(
    'city_type',
    F.when(df_cleaned['city_pop'] < rural_threshold, '1')  # Cities with < 1M population
     .when((df_cleaned['city_pop'] >= rural_threshold) & (df_cleaned['city_pop'] < metropolitan_threshold), '2')  # Cities between 1M and 5M population
     .otherwise('3')  # Cities with > 5M population
)

'''
1 - rural
2 - subarban
3 - metropolitan
'''

# Show the updated DataFrame with the new 'city_type' column
df_cleaned.select('city_pop', 'city_type').show(truncate=False)


+--------+---------+
|city_pop|city_type|
+--------+---------+
|3495    |1        |
|149     |1        |
|4154    |1        |
|1939    |1        |
|99      |1        |
|2158    |1        |
|2691    |1        |
|6018    |1        |
|1472    |1        |
|151785  |1        |
|7297    |1        |
|1925    |1        |
|341043  |1        |
|589     |1        |
|899     |1        |
|4664    |1        |
|1078    |1        |
|4081    |1        |
|2518    |1        |
|124967  |1        |
+--------+---------+
only showing top 20 rows



In [17]:
# Drop specified columns from df_cleaned
df_cleaned = df_cleaned.drop('city_pop')

# Show the cleaned DataFrame to verify the columns have been removed
df_cleaned.show(truncate=False)

+-------------+------+------+--------+----------------------+----+-----------+---+---------+
|category     |amt   |gender|is_fraud|distance_from_merchant|year|Trans_month|age|city_type|
+-------------+------+------+--------+----------------------+----+-----------+---+---------+
|misc_net     |4.97  |F     |0       |78.59756848823062     |2019|1          |36 |1        |
|grocery_pos  |107.23|F     |0       |30.212175719210443    |2019|1          |46 |1        |
|entertainment|220.11|M     |0       |108.20608258720067    |2019|1          |62 |1        |
|gas_transport|45.0  |M     |0       |95.67323113819748     |2019|1          |57 |1        |
|misc_pos     |41.96 |M     |0       |77.5567436258178      |2019|1          |38 |1        |
|gas_transport|94.63 |F     |0       |85.92264266264023     |2019|1          |63 |1        |
|grocery_net  |44.54 |F     |0       |118.11977555909641    |2019|1          |31 |1        |
|gas_transport|71.65 |M     |0       |12.766922541959126    |2019|1   

In [18]:
from pyspark.sql.functions import col

# Convert 'amt' and 'distance_from_merchant' to integer (whole numbers)
df_cleaned = df_cleaned.withColumn('amt', col('amt').cast('int'))
df_cleaned = df_cleaned.withColumn('distance_from_merchant', col('distance_from_merchant').cast('int'))

# Show the updated DataFrame with the converted columns
df_cleaned.select('amt', 'distance_from_merchant').show(truncate=False)


+---+----------------------+
|amt|distance_from_merchant|
+---+----------------------+
|4  |78                    |
|107|30                    |
|220|108                   |
|45 |95                    |
|41 |77                    |
|94 |85                    |
|44 |118                   |
|71 |12                    |
|4  |25                    |
|198|74                    |
|24 |97                    |
|7  |106                   |
|71 |44                    |
|96 |25                    |
|7  |66                    |
|3  |97                    |
|327|87                    |
|341|87                    |
|63 |90                    |
|44 |84                    |
+---+----------------------+
only showing top 20 rows



In [19]:
# Show the cleaned DataFrame to verify the columns have been removed
df_cleaned.show(truncate=False)

+-------------+---+------+--------+----------------------+----+-----------+---+---------+
|category     |amt|gender|is_fraud|distance_from_merchant|year|Trans_month|age|city_type|
+-------------+---+------+--------+----------------------+----+-----------+---+---------+
|misc_net     |4  |F     |0       |78                    |2019|1          |36 |1        |
|grocery_pos  |107|F     |0       |30                    |2019|1          |46 |1        |
|entertainment|220|M     |0       |108                   |2019|1          |62 |1        |
|gas_transport|45 |M     |0       |95                    |2019|1          |57 |1        |
|misc_pos     |41 |M     |0       |77                    |2019|1          |38 |1        |
|gas_transport|94 |F     |0       |85                    |2019|1          |63 |1        |
|grocery_net  |44 |F     |0       |118                   |2019|1          |31 |1        |
|gas_transport|71 |M     |0       |12                    |2019|1          |77 |1        |
|misc_pos 

In [20]:
from pyspark.sql import functions as F

# Map 'M' to 1 and 'F' to 0 in the 'gender' column
df_cleaned = df_cleaned.withColumn(
    'gender',
    F.when(df_cleaned['gender'] == 'M', 1)
     .when(df_cleaned['gender'] == 'F', 0)
     .otherwise(df_cleaned['gender'])  # Keeps other values if there are any (optional)
)

# Show the updated DataFrame with the mapped 'gender' column
df_cleaned.select('gender').show(truncate=False)


+------+
|gender|
+------+
|0     |
|0     |
|1     |
|1     |
|1     |
|0     |
|0     |
|1     |
|0     |
|0     |
|1     |
|0     |
|1     |
|1     |
|1     |
|1     |
|0     |
|1     |
|1     |
|1     |
+------+
only showing top 20 rows



In [21]:
# Show data types of all columns
for col_name, dtype in df_cleaned.dtypes:
    print(f"Column: {col_name}, Type: {dtype}")


Column: category, Type: string
Column: amt, Type: int
Column: gender, Type: string
Column: is_fraud, Type: int
Column: distance_from_merchant, Type: int
Column: year, Type: string
Column: Trans_month, Type: string
Column: age, Type: int
Column: city_type, Type: string


In [22]:
from pyspark.sql.functions import col

# Convert 'gender' column to string
df_cleaned = df_cleaned.withColumn('gender', col('gender').cast('string'))

# Show the updated DataFrame with the 'gender' column as string
df_cleaned.select('gender').show(truncate=False)


+------+
|gender|
+------+
|0     |
|0     |
|1     |
|1     |
|1     |
|0     |
|0     |
|1     |
|0     |
|0     |
|1     |
|0     |
|1     |
|1     |
|1     |
|1     |
|0     |
|1     |
|1     |
|1     |
+------+
only showing top 20 rows



In [23]:
from pyspark.sql import functions as F

# Convert 'is_fraud' column to 'Fraud' and 'Not Fraud' labels
df_cleaned = df_cleaned.withColumn(
    'is_fraud',
    F.when(df_cleaned['is_fraud'] == '1', '1')
     .when(df_cleaned['is_fraud'] == '0', '0')
     .otherwise(df_cleaned['is_fraud'])  # Keep other values (if any)
)

# Show the updated DataFrame with the 'is_fraud' column as string
df_cleaned.select('is_fraud').show(truncate=False)


+--------+
|is_fraud|
+--------+
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
|0       |
+--------+
only showing top 20 rows



In [24]:
df_cleaned.printSchema()

root
 |-- category: string (nullable = true)
 |-- amt: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- distance_from_merchant: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- Trans_month: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city_type: string (nullable = false)



In [25]:
from pyspark.sql.types import StringType

# Extract categorical columns (columns with string type)
categorical_cols = [field.name for field in df_cleaned.schema.fields if isinstance(field.dataType, StringType)]

# Show the categorical columns
print(categorical_cols)


['category', 'gender', 'is_fraud', 'year', 'Trans_month', 'city_type']


In [69]:
from pyspark.ml.feature import StringIndexer

# List of columns to encode - Updated to 'Trans_month'
cols = ['Trans_month']

# Apply StringIndexer to each column in the list
for col in cols:
    indexer = StringIndexer(inputCol=col, outputCol=col + "_index")
    df_cleaned = indexer.fit(df_cleaned).transform(df_cleaned)

# Show the resulting DataFrame with encoded columns
df_cleaned.show()

IllegalArgumentException: requirement failed: Output column Trans_month_index already exists.

In [70]:
# Show the resulting DataFrame with encoded columns
df_cleaned.show()

+-------------+---+------+--------+----------------------+-----------+---+---------+-----------------+
|     category|amt|gender|is_fraud|distance_from_merchant|Trans_month|age|city_type|Trans_month_index|
+-------------+---+------+--------+----------------------+-----------+---+---------+-----------------+
|     misc_net|  4|     0|       0|                    78|          1| 36|    Rural|              5.0|
|  grocery_pos|107|     0|       0|                    30|          1| 46|    Rural|              5.0|
|entertainment|220|     1|       0|                   108|          1| 62|    Rural|              5.0|
|gas_transport| 45|     1|       0|                    95|          1| 57|    Rural|              5.0|
|     misc_pos| 41|     1|       0|                    77|          1| 38|    Rural|              5.0|
|gas_transport| 94|     0|       0|                    85|          1| 63|    Rural|              5.0|
|  grocery_net| 44|     0|       0|                   118|          1| 31

In [72]:
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# Assuming df_cleaned is your DataFrame

# 1. Identify non-ordinal categorical columns (e.g., 'category')
non_ordinal_cols = ['category']

# 2. Apply One-Hot Encoding to non-ordinal categorical columns
indexers = []
encoders = []

for col in non_ordinal_cols:
    # StringIndexer to convert the categorical column to a numeric index
    indexer = StringIndexer(inputCol=col, outputCol=col + '_index')

    # OneHotEncoder to convert the indexed column to one-hot encoding
    encoder = OneHotEncoder(inputCol=col + '_index', outputCol=col + '_onehot')

    indexers.append(indexer)
    encoders.append(encoder)

# 3. Build a pipeline to apply both StringIndexer and OneHotEncoder
pipeline = Pipeline(stages=indexers + encoders)

# Fit and transform the pipeline on the data
df_transformed = pipeline.fit(df_cleaned).transform(df_cleaned)

# 4. Convert boolean columns to integers (True/False to 1/0)
# Assuming boolean columns are in boolean format, cast them to integers
boolean_cols = [col for col in df_cleaned.columns if isinstance(df_cleaned.select(col).head()[0], bool)]

for col in boolean_cols:
    df_transformed = df_transformed.withColumn(col, F.col(col).cast('integer'))

# Show the schema to confirm the changes
df_transformed.printSchema()

# Optionally, show the first few rows of the transformed DataFrame
df_transformed.show(5)


root
 |-- category: string (nullable = true)
 |-- amt: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- distance_from_merchant: integer (nullable = true)
 |-- Trans_month: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city_type: string (nullable = false)
 |-- Trans_month_index: double (nullable = false)
 |-- category_index: double (nullable = false)
 |-- category_onehot: vector (nullable = true)

+-------------+---+------+--------+----------------------+-----------+---+---------+-----------------+--------------+---------------+
|     category|amt|gender|is_fraud|distance_from_merchant|Trans_month|age|city_type|Trans_month_index|category_index|category_onehot|
+-------------+---+------+--------+----------------------+-----------+---+---------+-----------------+--------------+---------------+
|     misc_net|  4|     0|       0|                    78|          1| 36|    Rural|              5.0|          11.0|(

In [73]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("FraudDetection").getOrCreate()

# Assuming 'df_cleaned' is already a PySpark DataFrame
# If df_cleaned is a pandas DataFrame, you can convert it using:
# df_cleaned = spark.createDataFrame(df_cleaned)

# Step 1: Assemble features into a single vector column (excluding 'is_fraud')
feature_columns = [col for col in df_cleaned.columns if col not in ['is_fraud', 'gender', 'city_type', 'category', 'category_onehot']]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_assembled = assembler.transform(df_cleaned)

# Step 2: Split the data into train and test sets (80% train, 20% test)
train_df, test_df = df_assembled.randomSplit([0.8, 0.2], seed=42)

# Step 3: Standardize the features using StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
scaler_model = scaler.fit(train_df)
train_df_scaled = scaler_model.transform(train_df)
test_df_scaled = scaler_model.transform(test_df)

# Step 4: Prepare the data for training
train_data = train_df_scaled.select('scaled_features', 'is_fraud').withColumnRenamed("scaled_features", "features")
test_data = test_df_scaled.select('scaled_features', 'is_fraud').withColumnRenamed("scaled_features", "features")

# Check the shapes of the training and test datasets
print("Training features shape : ", train_data.count(), len(feature_columns))
print("Test features shape : ", test_data.count(), len(feature_columns))
print("Training target shape : ", train_data.count())
print("Test target shape : ", test_data.count())


IllegalArgumentException: Data type string of column Trans_month is not supported.