In [33]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
from pathlib import Path
import shutil
import glob
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

spark = SparkSession.builder.appName("faker").getOrCreate()

In [38]:
# Initialize Faker for generating fake data
fake = Faker()

# Set random seed for reproducibility
np.random.seed(42)

# Define the number of rows
num_rows = 14300

# Generate synthetic credit card fraud dataset
data = {
    "TransactionID": np.arange(1, num_rows + 1),
    "UserID": np.random.randint(1000, 5000, size=num_rows),
    "TransactionAmount": np.round(np.random.uniform(5, 5000, size=num_rows), 2),
    "TransactionDate": pd.date_range(start="2024-01-01", periods=num_rows, freq="T"),
    "TransactionType": np.random.choice(["Online", "POS", "ATM Withdrawal"], size=num_rows),
    "Merchant": [fake.company() for _ in range(num_rows)],
    "Location": [fake.city() for _ in range(num_rows)],
    "CardType": np.random.choice(["Visa", "MasterCard", "Amex", "Discover"], size=num_rows),
    "IsFraud": np.random.choice([0, 1], size=num_rows, p=[0.98, 0.02]),  # 2% fraud cases
}

df = pd.DataFrame(data)

# Introduce missing values randomly
for col in ["TransactionAmount", "TransactionType", "Merchant", "TransactionDate", "CardType"]:
    df.loc[df.sample(frac=0.03).index, col] = np.nan

# Introduce missing values randomly
for col in [ "Merchant", "TransactionDate", "CardType"]:
    df.loc[df.sample(frac=0.02).index, col] = np.nan

# Introduce duplicate rows
df = pd.concat([df, df.sample(frac=0.04)], ignore_index=True)

# Introduce inconsistent formats in the "Location" column
df.loc[df.sample(frac=0.01).index, "Location"] = df["Location"].apply(lambda x: x.lower())

current_date = datetime.now().strftime("%Y%m%d")

filename = f"{current_date}.csv"
print(f"file: {filename}")

# Save to CSV
file_path = f"../inputs/finance/{filename}"
df.to_csv(file_path, index=False)

# file_path


file: 20250222.csv


In [None]:
file_records = {}
for file in glob.glob(pathname="../archive/*.csv"):
    filename = file.split("/")[2].split(".")[0]
    count = spark.read.csv(file).count()
    file_records[filename] = count

file_records
    

In [None]:
yesterday = datetime.today() - timedelta(days=1)
yesterday = yesterday.strftime("%Y%m%d")
print(yesterday)

if Path(f"../inputs/finance/{yesterday}.csv").exists():
    print(f"{yesterday} exists")
    


In [None]:
folder_path = Path(f"../inputs/finance/{current_date}")

if not folder_path.exists():
    folder_path.mkdir(parents=True, exist_ok=True)
    print(f"Folder {folder_path} created")
else:
    print(f"Folder {folder_path} already exists")

In [None]:
list_of_files = glob.glob("../inputs/finance/*")
latest_file = max(list_of_files, key=os.path.getctime)

for file in list_of_files:
    if file != latest_file:
        shutil.move(src=file, dst="archive/")

In [None]:
def move_old_file_to_archive():
    archive_path = Path(f"../archive/")
    input_container = Path("../inputs/finance")

    if not archive_path.exists():
        archive_path.mkdir(parents=True, exist_ok=True)
    else:
        print(f"folder {archive_path} exists")
    
    list_of_files = glob.glob(f"{input_container}/*")
    latest_file = max(list_of_files, key=os.path.getctime)

    for file in list_of_files:
        if file != latest_file:
            shutil.move(src=file, dst=archive_path)
            print(f"{file} moved successfully to {archive_path}")
        else:
            print("Path does not exists")
    


In [None]:
move_old_file_to_archive()

In [31]:
def get_past_records():
    file_records = {}
    for file in glob.glob(pathname="../archive/*.*"):
        filename = file.split("/")[2].split(".")[0]
        print(filename)
        count = spark.read.csv(file).count()
        file_records[filename] = count
    
    schema = StructType(fields=
                            [
                            StructField("Date", StringType(), False),
                            StructField("Value", IntegerType(), False)
                            ]
                            )

    df = spark.createDataFrame([(k, v) for k, v in file_records.items()], schema)
    df = df.withColumn("Date", to_date(col("Date"), format="yyyyMMdd"))
    df = df.orderBy(col("Date").desc())
    pd_df = df.toPandas()
    # pd_df.to_csv("../artifacts/data_records.csv")
    return pd_df

In [32]:
new_df = get_past_records()
# new_df["Date"] = pd.to_datetime(new_df["Date"], format="%Y-%m-%d")
new_df.head()

20250219
20250218
20250220
20250221
20250216
20250217
20250215
20250214


Unnamed: 0,Date,Value
0,2025-02-21,29641
1,2025-02-20,20401
2,2025-02-19,15601
3,2025-02-18,18721
4,2025-02-17,19761


In [13]:
new_df.head()

Row(Date='archive', Value=5201)

In [None]:
# check for archive/20250220 path exist
archive_path = Path(f"../archive/{yesterday}/")

old_file_path = Path(f"../inputs/finance/{yesterday}.csv")

if not archive_path.exists():
    archive_path.mkdir(parents=True, exist_ok=True)
else:
    print(f"folder {archive_path} exists")

if old_file_path.exists():
    shutil.move(src=old_file_path, dst=archive_path)
    print("file moved successfully")
else:
    print("Path does not exists")
    

In [None]:
if Path(f"../inputs/finance/{yesterday}.csv").exists():
    print(f"{yesterday} exists")

In [None]:
df2 = pd.read_csv("../inputs/finance/20250221.csv")
df2.head()

In [None]:
source_path = Path(f"../inputs/finance/{yesterday}.csv")
destination_path = archive_path / f"{yesterday}.csv"

if source_path.exists():
    source_path.rename(destination_path)
    print(f"File moved to {destination_path}")
else:
    print(f"Source file {source_path} does not exist")

In [None]:
datetime.today().strftime("%Y%m%d")

In [None]:
import boto3
from io import StringIO
import pandas as pd

In [None]:
aws_access_key="",
aws_secret_access_key=""

In [None]:
s3 = boto3.client(
    "s3",
    aws_access_key_id="",
    aws_secret_access_key=""
)

In [None]:
s3.put_object(Bucket="creditcardfraudcontainer", Key="bronze/credit_card_fraud.csv", Body="../inputs/finance/credit_card_fraud.csv")

In [None]:
for bucket in s3.list_buckets()["Buckets"]:
    print(bucket)

In [None]:
response = s3.list_objects_v2(Bucket="creditcardfraudcontainer")
for obj in response.get("Contents", []):
    print(obj["Key"])

In [None]:
import logging
logger = logging.getLogger(__name__)

In [None]:
import boto3
import logging
import pandas as pd
from io import StringIO

# Logger setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AWS_S3:
    def __init__(self, bucketname: str, access_key: str, secret_key: str) -> None:
        self.bucketname = bucketname
        self.access_key = access_key
        self.secret_key = secret_key

        if not self.access_key or not self.secret_key:
            logger.error("Please provide access and secret keys to continue")
            raise ValueError("Missing AWS credentials")

        if not self.bucketname:
            logger.error("Please provide a valid bucket name")
            raise ValueError("Bucket name is required")

    def _connect_to_aws(self):
        """Create and return an S3 client"""
        try:
            s3 = boto3.client(
                "s3",
                aws_access_key_id=self.access_key,
                aws_secret_access_key=self.secret_key
            )
            logger.info("Successfully connected to AWS S3")
            return s3  # ✅ RETURN the S3 client
        except Exception as e:
            logger.error(f"Connection failed: {e}")
            raise e
    
    def upload_file_to_s3(self, s3_key: str, local_file_path: str) -> None:
        """Upload a file to S3"""
        s3 = self._connect_to_aws()  # ✅ Fix: Get S3 client
        
        try:
            with open(local_file_path, "rb") as file:
                s3.put_object(Bucket=self.bucketname, Key=s3_key, Body=file)
            logger.info(f"File {local_file_path} uploaded successfully to S3 as {s3_key}")
        except Exception as e:
            logger.error("File Upload Failed. Check credentials and permissions")
            raise e
    
    def read_small_data_from_s3(self, s3_key: str):
        """Read a small CSV file from S3"""
        s3 = self._connect_to_aws()  # ✅ Fix: Get S3 client
        
        try:
            obj = s3.get_object(Bucket=self.bucketname, Key=s3_key)  # ✅ Fix: Use `Key=`
            df = pd.read_csv(StringIO(obj["Body"].read().decode("utf-8")))
            logger.info(f"Successfully read data from S3: {s3_key}")
            return df
        except Exception as e:
            logger.error(f"Error occurred while reading data from AWS: {e}")
            raise e


In [None]:
obj = AWS_S3(
    bucketname="creditcardfraudcontainer",
    access_key=aws_access_key,
    secret_key=aws_secret_access_key
    )

In [None]:
obj.upload_file_to_s3(
    s3_key="bronze/credit_card_fraud.csv", 
    local_file_path='../inputs/finance/credit_card_fraud.csv'
    )

In [None]:
import os
local_path = os.path.abspath("../inputs/finance/credit_card_fraud.csv")

In [None]:
local_path