In [None]:
# Inspect the raw CSV ahead of importing it into a BigQuery table.

import pandas as pd

# Load the source file from Cloud Storage into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='gs://labs.roitraining.com/data-to-ai-workshop/fraud_data_raw.csv',
)

# Show the dtype pandas inferred for each column; these guide the choice of
# column types for the BigQuery table schema.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:

# Create the raw BigQuery table and load the CSV from Cloud Storage into it,
# using the BigQuery Python client library.
from google.cloud import bigquery
client = bigquery.Client()

# Set dataset reference vars.
project_id = "qwiklabs-gcp-02-949c0486d822"
dataset_id = "dani_data_to_ai_workshop"
table_id = "fraud_data_raw"

# Create the dataset if it doesn't exist.
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
dataset = bigquery.Dataset(dataset_ref)
client.create_dataset(dataset, exists_ok=True)

# Define the schema. Date columns are kept as STRING here; they are parsed
# later in pandas where needed.
schema = [
    bigquery.SchemaField("Applicant_ID", "INTEGER"),
    bigquery.SchemaField("Age", "INTEGER"),
    bigquery.SchemaField("Employment_Status", "STRING"),
    bigquery.SchemaField("Income", "INTEGER"),
    bigquery.SchemaField("Number_of_Dependents", "INTEGER"),
    bigquery.SchemaField("Amount_Requested", "INTEGER"),
    bigquery.SchemaField("Previous_Assistance_Received", "BOOLEAN"),
    bigquery.SchemaField("Previous_Assistance_Date", "STRING"),
    bigquery.SchemaField("Supporting_Doc_Verified", "BOOLEAN"),
    bigquery.SchemaField("Application_Frequency_Last_Year", "INTEGER"),
    bigquery.SchemaField("IP_Address", "STRING"),
    bigquery.SchemaField("Device_Type", "STRING"),
    bigquery.SchemaField("Application_Date", "STRING"),
    bigquery.SchemaField("Fraudulent", "INTEGER"),
]

# Create the table if it doesn't exist.
table_ref = bigquery.TableReference(dataset_ref, table_id)
table = bigquery.Table(table_ref, schema=schema)
client.create_table(table, exists_ok=True)

# Load the data into the table.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # Skip the header row
    # Load jobs append by default, so re-running this cell would duplicate
    # every row. Replace the table contents instead to keep the cell idempotent.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

job = client.load_table_from_uri(
    'gs://labs.roitraining.com/data-to-ai-workshop/fraud_data_raw.csv',
    table,
    location='US',
    job_config=job_config
)

try:
    job.result()  # Wait for the job to complete
    table = client.get_table(table_ref)
    print(f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}")
except Exception as e:
    print(f"Error loading data: {e}")
    # Re-raise so a "Run All" stops here instead of running later cells
    # against a table that was never loaded.
    raise




In [None]:
# Feature engineering: derive model-ready columns from the raw CSV.
# Produces `df_transformed`; the raw frame `df` is left untouched.

import numpy as np
import pandas as pd


df = pd.read_csv('gs://labs.roitraining.com/data-to-ai-workshop/fraud_data_raw.csv')
df_transformed = df.copy()


# One-hot encode Employment_Status and Device_Type
df_transformed['Employment_Status'] = df_transformed['Employment_Status'].astype('category')
df_transformed['Device_Type'] = df_transformed['Device_Type'].astype('category')

df_transformed = pd.get_dummies(df_transformed, columns=['Employment_Status', 'Device_Type'])

# Break age fields into bins.
# The last edge is open-ended so the '74+' bin really covers every age above
# 74 (a cap of 100 would leave older applicants without any bin), and
# include_lowest=True puts age 0 in the first bin instead of leaving it unbinned.
age_bin_edges = [0, 18, 24, 34, 44, 54, 64, 74, np.inf]
age_bin_labels = ['0-18', '18-24', '24-34', '34-44', '44-54', '54-64', '64-74', '74+']
df_transformed['Age_Bin'] = pd.cut(
    df_transformed['Age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)
# One-hot encode Age_Bin
df_transformed = pd.get_dummies(df_transformed, columns=['Age_Bin'])


# Create Income-to-Amount requested ratio.
# A zero Amount_Requested yields +/-inf; store those as missing values rather
# than infinities so downstream consumers see NaN/NULL.
income_to_amount = df_transformed['Income'] / df_transformed['Amount_Requested']
df_transformed['Income_to_Amount_Requested_Ratio'] = income_to_amount.replace([np.inf, -np.inf], np.nan)


# Create a "Time_Since_Previous_Assistance" field against the current date.
# Rows without a Previous_Assistance_Date parse to NaT and end up as NaN days.
# Note: this value depends on when the cell is run.
reference_time = pd.to_datetime('now')
time_diff = reference_time - pd.to_datetime(df_transformed['Previous_Assistance_Date'])
# Convert timedelta to days as a float
df_transformed['Time_Since_Previous_Assistance_Days'] = time_diff.dt.total_seconds() / (24 * 60 * 60)

# Change all boolean fields to 0/1 (covers the original boolean columns and
# any boolean-typed indicator columns produced by get_dummies).
boolean_fields = df_transformed.select_dtypes(include=['bool']).columns
df_transformed[boolean_fields] = df_transformed[boolean_fields].astype(int)

# Destination table for the engineered features.
table_id = "fraud_training_data"

# Build the BigQuery schema for the transformed data from the raw-table schema:
#   * Employment_Status and Device_Type are dropped (replaced by indicator columns),
#   * the two boolean columns are re-declared as INTEGER (0/1) and placed last,
#   * the engineered columns are inserted between the two groups.
# NOTE(review): several column names contain '-' or '+'; confirm the target
# dataset accepts these names.
one_hot_sources = ('Employment_Status', 'Device_Type')
boolean_columns = ('Previous_Assistance_Received', 'Supporting_Doc_Verified')

# Raw columns that carry over unchanged, in their original order.
carried_over_fields = [
    field
    for field in schema
    if field.name not in boolean_columns and field.name not in one_hot_sources
]

# 0/1 indicator columns for the age bins and the one-hot encoded categoricals.
indicator_column_names = [
    "Age_Bin_0-18",
    "Age_Bin_18-24",
    "Age_Bin_24-34",
    "Age_Bin_34-44",
    "Age_Bin_44-54",
    "Age_Bin_54-64",
    "Age_Bin_64-74",
    "Age_Bin_74+",
    "Employment_Status_Employed",
    "Employment_Status_Self-Employed",
    "Employment_Status_Unemployed",
    "Device_Type_Desktop",
    "Device_Type_Mobile",
    "Device_Type_Tablet",
]
indicator_fields = [bigquery.SchemaField(name, "INTEGER") for name in indicator_column_names]

# Continuous engineered features.
float_fields = [
    bigquery.SchemaField("Income_to_Amount_Requested_Ratio", "FLOAT"),
    bigquery.SchemaField("Time_Since_Previous_Assistance_Days", "FLOAT"),
]

# Former booleans, stored as INTEGER to support the 0/1 transformation.
boolean_as_int_fields = [bigquery.SchemaField(name, "INTEGER") for name in boolean_columns]

transformed_schema = carried_over_fields + indicator_fields + float_fields + boolean_as_int_fields

# Write the engineered DataFrame to the BigQuery table named by `table_id`.
client = bigquery.Client()


dataset_ref = bigquery.DatasetReference(project_id, dataset_id)

# Create the dataset if it doesn't exist.
dataset = bigquery.Dataset(dataset_ref)
client.create_dataset(dataset, exists_ok=True)
# Create the table if it doesn't exist.
table_ref = bigquery.TableReference(dataset_ref, table_id)
table = bigquery.Table(table_ref, schema=transformed_schema)
client.create_table(table, exists_ok=True)

# Make sure the DataFrame and the declared schema describe the same columns
# before loading. The indicator columns depend on the categories present in
# the data, so a mismatch should fail loudly here rather than inside the job.
schema_columns = [field.name for field in transformed_schema]
missing_columns = [name for name in schema_columns if name not in df_transformed.columns]
unexpected_columns = [name for name in df_transformed.columns if name not in schema_columns]
if missing_columns or unexpected_columns:
    raise ValueError(
        "df_transformed does not match transformed_schema: "
        f"missing={missing_columns}, unexpected={unexpected_columns}"
    )

# Load the data into the table from pandas dataframe.
# No CSV source_format / skip_leading_rows here: the client serialises the
# DataFrame itself, so there is no header row to skip (skipping one would
# drop the first data row) and CSV columns would be matched by position.
job_config = bigquery.LoadJobConfig(
    schema=transformed_schema,
    # Replace existing rows so re-running the cell does not duplicate the data.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

job = client.load_table_from_dataframe(
    df_transformed[schema_columns],  # columns in the same order as the schema
    table,
    location='US',
    job_config=job_config
)

try:
    job.result()  # Wait for the job to complete
    # Validate the table was created and populated.
    table = client.get_table(table_ref)
except Exception as e:
    print(f"Error loading data: {e}")
    # Re-raise so a failed load is not followed by a misleading success message.
    raise

print(f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}")


