In [1]:
pip install pandas jupyter

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing data
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
# Simulate 60 days of data
customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
data = []
start_date = datetime(2025, 4, 1)
for i in range(1, 61):
    date = start_date + timedelta(days=i)
    for _ in range(random.randint(3, 6)):  # 3–6 sales per day
        data.append({
            'id': random.randint(1000, 9999),
            'customer': random.choice(customers),
            'date': date.date().isoformat(),
            'amount': random.randint(100, 2000),
            'last_updated': (date + timedelta(hours=random.randint(0, 23),
                                              minutes=random.randint(0, 59))).isoformat()
        })
df = pd.DataFrame(data)
df.to_csv('sales_data_large.csv', index=False)
df.head()

Unnamed: 0,id,customer,date,amount,last_updated
0,2226,Target,2025-04-02,1205,2025-04-02T18:32:00
1,2653,eBay,2025-04-02,1691,2025-04-02T15:50:00
2,3858,Amazon,2025-04-02,456,2025-04-02T03:02:00
3,1952,BestBuy,2025-04-02,671,2025-04-02T03:10:00
4,1728,BestBuy,2025-04-02,582,2025-04-02T04:18:00


In [3]:
# Full extraction
df_full = pd.read_csv("sales_data_large.csv", parse_dates=["last_updated"])
print(f"Pulled {len(df_full)} rows via full extraction.")
df_full.head()


Pulled 282 rows via full extraction.


Unnamed: 0,id,customer,date,amount,last_updated
0,2226,Target,2025-04-02,1205,2025-04-02 18:32:00
1,2653,eBay,2025-04-02,1691,2025-04-02 15:50:00
2,3858,Amazon,2025-04-02,456,2025-04-02 03:02:00
3,1952,BestBuy,2025-04-02,671,2025-04-02 03:10:00
4,1728,BestBuy,2025-04-02,582,2025-04-02 04:18:00


In [4]:
# Creating the tracking file
with open("last_extraction.txt", "w") as f:
    f.write("2025-04-20 12:00:00") 

In [5]:
# Incremental exctraction
with open("last_extraction.txt", "r") as f:
    last_extraction = f.read().strip()
df = pd.read_csv("sales_data_large.csv", parse_dates=["last_updated"])
last_extraction_time = pd.to_datetime(last_extraction)
df_incremental = df[df['last_updated'] > last_extraction_time]
print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()


Pulled 190 new/updated rows since 2025-04-20 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
90,3363,eBay,2025-04-20,1302,2025-04-20 13:33:00
92,5539,eBay,2025-04-20,1077,2025-04-20 21:35:00
93,5859,BestBuy,2025-04-20,1416,2025-04-20 16:37:00
95,9661,BestBuy,2025-04-21,1826,2025-04-21 14:41:00
96,4453,eBay,2025-04-21,1950,2025-04-21 06:50:00


In [6]:
# Updating the last_extraction.txt file
new_checkpoint = df['last_updated'].max()
with open("last_extraction.txt", "w") as f:
    f.write(new_checkpoint.isoformat())
print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-05-31 23:49:00


In [8]:
# Section 4: Transform Full Data
import pandas as pd

df_full = pd.read_csv('sales_data_large.csv')

# 1. Cleaning
df_full = df_full.drop_duplicates()

# 2. Enrichment
if 'quantity' in df_full.columns and 'unit_price' in df_full.columns:
    df_full['total_price'] = df_full['quantity'] * df_full['unit_price']

# 3. Structural
if 'date' in df_full.columns:
    df_full['date'] = pd.to_datetime(df_full['date']).dt.strftime('%Y-%m-%d')

df_full.to_csv('transformed_full.csv', index=False)
print("Transformed full data saved as transformed_full.csv")

Transformed full data saved as transformed_full.csv


In [10]:
# Section 5: Transform Incremental Data

# 1. Cleaning
df_incremental = df_incremental.fillna(0)

# 2. Enrichment
if 'quantity' in df_incremental.columns and 'unit_price' in df_incremental.columns:
    df_incremental['total_price'] = df_incremental['quantity'] * df_incremental['unit_price']

# 3. Structural
if 'date' in df_incremental.columns:
    df_incremental['date'] = pd.to_datetime(df_incremental['date']).dt.strftime('%Y-%m-%d')

df_incremental.to_csv('transformed_incremental.csv', index=False)
print("Transformed incremental data saved as transformed_incremental.csv")

Transformed incremental data saved as transformed_incremental.csv
