In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime

# Initialize Faker for generating fake data
fake = Faker()

# Set random seed for reproducibility
np.random.seed(42)

# Define the number of rows
num_rows = 20000

# Generate synthetic credit card fraud dataset
data = {
    "TransactionID": np.arange(1, num_rows + 1),
    "UserID": np.random.randint(1000, 5000, size=num_rows),
    "TransactionAmount": np.round(np.random.uniform(5, 5000, size=num_rows), 2),
    "TransactionDate": pd.date_range(start="2024-01-01", periods=num_rows, freq="T"),
    "TransactionType": np.random.choice(["Online", "POS", "ATM Withdrawal"], size=num_rows),
    "Merchant": [fake.company() for _ in range(num_rows)],
    "Location": [fake.city() for _ in range(num_rows)],
    "CardType": np.random.choice(["Visa", "MasterCard", "Amex", "Discover"], size=num_rows),
    "IsFraud": np.random.choice([0, 1], size=num_rows, p=[0.98, 0.02]),  # 2% fraud cases
}

df = pd.DataFrame(data)

# Introduce missing values randomly
for col in ["TransactionAmount", "TransactionType", "Merchant"]:
    df.loc[df.sample(frac=0.02).index, col] = np.nan

# Introduce duplicate rows
df = pd.concat([df, df.sample(frac=0.02)], ignore_index=True)

# Introduce inconsistent formats in the "Location" column
df.loc[df.sample(frac=0.01).index, "Location"] = df["Location"].apply(lambda x: x.lower())

current_date = datetime.now().strftime("%Y_%m_%d")

filename = f"credit_card_fraud_{current_date}.csv"

# Save to CSV
file_path = f"inputs/finance/{filename}"
df.to_csv(file_path, index=False)

# file_path


ModuleNotFoundError: No module named 'faker'

In [28]:
import boto3
from io import StringIO
import pandas as pd

In [24]:
aws_access_key="AKIAWCHVIBUCJRMK5VHK",
aws_secret_access_key="Ih00zkWcPcC4VHx5I2gfNsJycs27KFpw8hEW90kj"

In [33]:
s3 = boto3.client(
    "s3",
    aws_access_key_id="AKIAWCHVIBUCJRMK5VHK",
    aws_secret_access_key="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"
)

In [78]:
s3.put_object(Bucket="creditcardfraudcontainer", Key="bronze/credit_card_fraud.csv", Body="../inputs/finance/credit_card_fraud.csv")

{'ResponseMetadata': {'RequestId': '07K4F0DZY6MF0KC4',
  'HostId': 'I6BQabVfC6/og63m8QARjYzcN4Xep3hR/1gBcWj9Q3XAS8q8u3bU3Vaa1sZbUOIf+Dy1X1T+UhE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'I6BQabVfC6/og63m8QARjYzcN4Xep3hR/1gBcWj9Q3XAS8q8u3bU3Vaa1sZbUOIf+Dy1X1T+UhE=',
   'x-amz-request-id': '07K4F0DZY6MF0KC4',
   'date': 'Tue, 18 Feb 2025 16:33:25 GMT',
   'x-amz-version-id': 'qn5aEiMv8nDzd8CVs4jRpU7uixgWbt1t',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"bbfcd37eac985cef6bfd1b414e169ab3"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"bbfcd37eac985cef6bfd1b414e169ab3"',
 'ServerSideEncryption': 'AES256',
 'VersionId': 'qn5aEiMv8nDzd8CVs4jRpU7uixgWbt1t'}

In [19]:
for bucket in s3.list_buckets()["Buckets"]:
    print(bucket)

{'Name': 'cicd-tutorial-prod', 'CreationDate': datetime.datetime(2024, 11, 4, 17, 54, 37, tzinfo=tzutc())}
{'Name': 'cicd-tutorial-qa', 'CreationDate': datetime.datetime(2024, 11, 4, 17, 37, 14, tzinfo=tzutc())}
{'Name': 'cicd-tutorial-staging', 'CreationDate': datetime.datetime(2024, 11, 4, 17, 51, 27, tzinfo=tzutc())}
{'Name': 'creditcardfraudcontainer', 'CreationDate': datetime.datetime(2025, 2, 18, 10, 59, 34, tzinfo=tzutc())}


In [79]:
response = s3.list_objects_v2(Bucket="creditcardfraudcontainer")
for obj in response.get("Contents", []):
    print(obj["Key"])

bronze/
bronze/credit_card_fraud.csv
gold/
silver/


In [20]:
import logging
logger = logging.getLogger(__name__)

In [80]:
import boto3
import logging
import pandas as pd
from io import StringIO

# Logger setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AWS_S3:
    def __init__(self, bucketname: str, access_key: str, secret_key: str) -> None:
        self.bucketname = bucketname
        self.access_key = access_key
        self.secret_key = secret_key

        if not self.access_key or not self.secret_key:
            logger.error("Please provide access and secret keys to continue")
            raise ValueError("Missing AWS credentials")

        if not self.bucketname:
            logger.error("Please provide a valid bucket name")
            raise ValueError("Bucket name is required")

    def _connect_to_aws(self):
        """Create and return an S3 client"""
        try:
            s3 = boto3.client(
                "s3",
                aws_access_key_id=self.access_key,
                aws_secret_access_key=self.secret_key
            )
            logger.info("Successfully connected to AWS S3")
            return s3  # ✅ RETURN the S3 client
        except Exception as e:
            logger.error(f"Connection failed: {e}")
            raise e
    
    def upload_file_to_s3(self, s3_key: str, local_file_path: str) -> None:
        """Upload a file to S3"""
        s3 = self._connect_to_aws()  # ✅ Fix: Get S3 client
        
        try:
            with open(local_file_path, "rb") as file:
                s3.put_object(Bucket=self.bucketname, Key=s3_key, Body=file)
            logger.info(f"File {local_file_path} uploaded successfully to S3 as {s3_key}")
        except Exception as e:
            logger.error("File Upload Failed. Check credentials and permissions")
            raise e
    
    def read_small_data_from_s3(self, s3_key: str):
        """Read a small CSV file from S3"""
        s3 = self._connect_to_aws()  # ✅ Fix: Get S3 client
        
        try:
            obj = s3.get_object(Bucket=self.bucketname, Key=s3_key)  # ✅ Fix: Use `Key=`
            df = pd.read_csv(StringIO(obj["Body"].read().decode("utf-8")))
            logger.info(f"Successfully read data from S3: {s3_key}")
            return df
        except Exception as e:
            logger.error(f"Error occurred while reading data from AWS: {e}")
            raise e


In [81]:
obj = AWS_S3(
    bucketname="creditcardfraudcontainer",
    access_key=aws_access_key,
    secret_key=aws_secret_access_key
    )

In [None]:
obj.upload_file_to_s3(
    s3_key="bronze/credit_card_fraud.csv", 
    local_file_path='../inputs/finance/credit_card_fraud.csv'
    )

INFO:__main__:Successfully connected to AWS S3
ERROR:__main__:File Upload Failed. Check credentials and permissions


TypeError: sequence item 0: expected str instance, tuple found

In [66]:
import os
local_path = os.path.abspath("../inputs/finance/credit_card_fraud.csv")

In [67]:
local_path

'/Users/harendrakumar/Documents/kpmg/inputs/finance/credit_card_fraud.csv'