In [None]:
pip install pandas jupyter

Collecting pandas
  Using cached pandas-2.3.0-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting jupyter
  Using cached jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting notebook (from jupyter)
  Using cached notebook-7.4.3-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Using cached jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting ipywidgets (from jupyter)
  Using cached ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting jupyterlab (from jupyter)
  Using cached jupyterlab-4.4.3-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Using cached async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting httpx>=0.25.0 (from jupyterlab->jupyter)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jinja2>=3.0.3 (from jupyterlab->jupyter)
  Using cache

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\Users\\Wangui\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\pandas\\tests\\frame\\methods\\test_replace.py'
Consider using the `--user` option or check the permissions.



In [None]:
# Section 1: Importing data
# Builds a synthetic sales dataset, writes it to sales_data_large.csv and
# previews the first rows.
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed the generator so that re-running this cell recreates the same dataset;
# without it every run rewrites sales_data_large.csv with different rows.
SEED = 42
random.seed(SEED)

# Simulate 60 days of data
customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
data = []
start_date = datetime(2025, 4, 1)
for i in range(1, 61):
    # Offsets run 1..60, so the first simulated day is 2025-04-02 and the last 2025-05-31.
    date = start_date + timedelta(days=i)
    for _ in range(random.randint(3, 6)):  # 3–6 sales per day
        data.append({
            # ids are drawn at random per row, so they are not guaranteed to be unique
            'id': random.randint(1000, 9999),
            'customer': random.choice(customers),
            'date': date.date().isoformat(),
            'amount': random.randint(100, 2000),
            # a random time of day on the sale date, stored as an ISO-8601 string
            'last_updated': (date + timedelta(hours=random.randint(0, 23),
                                              minutes=random.randint(0, 59))).isoformat()
        })

# Build the frame once from the collected records, persist it, and show a preview.
df = pd.DataFrame(data)
df.to_csv('sales_data_large.csv', index=False)
df.head()

Unnamed: 0,id,customer,date,amount,last_updated
0,8435,BestBuy,2025-04-02,1292,2025-04-02T06:41:00
1,3971,Target,2025-04-02,298,2025-04-02T20:41:00
2,5303,eBay,2025-04-02,227,2025-04-02T13:04:00
3,3967,Target,2025-04-02,1115,2025-04-02T16:47:00
4,4883,Costco,2025-04-03,1726,2025-04-03T06:18:00


In [None]:
# Section 2: Full extraction
# Load every row of the source file; last_updated is parsed to datetime on read.
df_full = pd.read_csv("sales_data_large.csv", parse_dates=["last_updated"])
full_row_count = len(df_full)
print(f"Pulled {full_row_count} rows via full extraction.")
df_full.head()


Pulled 266 rows via full extraction.


Unnamed: 0,id,customer,date,amount,last_updated
0,7387,Costco,2025-04-02,1272,2025-04-02 11:11:00
1,1931,BestBuy,2025-04-02,1311,2025-04-02 07:38:00
2,5989,BestBuy,2025-04-02,1227,2025-04-02 14:58:00
3,7793,Target,2025-04-03,163,2025-04-03 04:15:00
4,9650,Target,2025-04-03,1013,2025-04-03 01:39:00


In [8]:
# Creating the tracking file
# Seed last_extraction.txt with the timestamp the first incremental run compares against.
initial_checkpoint = "2025-04-20 12:00:00"
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint)

In [None]:
# Section 3: Incremental extraction
# Read the checkpoint text stored in the tracking file and convert it to a timestamp.
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()
last_extraction_time = pd.to_datetime(last_extraction)

# Reload the source and keep only rows updated strictly after the checkpoint.
df = pd.read_csv("sales_data_large.csv", parse_dates=["last_updated"])
updated_after_checkpoint = df['last_updated'] > last_extraction_time
df_incremental = df[updated_after_checkpoint]
print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()


Pulled 181 new/updated rows since 2025-04-20 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
83,3382,Walmart,2025-04-20,164,2025-04-20 17:02:00
85,9025,Amazon,2025-04-21,1747,2025-04-21 20:04:00
86,5175,Amazon,2025-04-21,343,2025-04-21 20:50:00
87,8179,Costco,2025-04-21,1669,2025-04-21 13:03:00
88,7935,Amazon,2025-04-22,279,2025-04-22 13:09:00


In [10]:
# Updating the last_extraction.txt file
# Advance the checkpoint to the newest last_updated value in the frame read by the
# incremental extraction cell (df).
new_checkpoint = df['last_updated'].max()
if pd.isna(new_checkpoint):
    # An empty frame (or one with no valid timestamps) yields NaT. Writing the text
    # "NaT" would make every later `last_updated > checkpoint` comparison False,
    # so the existing checkpoint is kept instead.
    print("No valid last_updated values found; last_extraction.txt left unchanged.")
else:
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-05-31 17:19:00


In [1]:
# Section 4: Transform Full Data
import pandas as pd

# 1. Cleaning: load the raw extract and discard exact duplicate rows
df_full = pd.read_csv('sales_data_large.csv').drop_duplicates()

# 2. Enrichment: derive total_price only when both inputs are present
if {'quantity', 'unit_price'}.issubset(df_full.columns):
    df_full['total_price'] = df_full['quantity'] * df_full['unit_price']

# 3. Structural: normalise the date column to YYYY-MM-DD text
if 'date' in df_full.columns:
    parsed_dates = pd.to_datetime(df_full['date'])
    df_full['date'] = parsed_dates.dt.strftime('%Y-%m-%d')

# Persist the transformed frame without the index column
df_full.to_csv('transformed_full.csv', index=False)
print("Transformed full data saved as transformed_full.csv")

Transformed full data saved as transformed_full.csv


In [3]:
# Section 5: Transform Incremental Data
# Transform only the rows selected by the incremental extraction (df_incremental
# from Section 3). Re-reading sales_data_large.csv here would put every row into
# transformed_incremental.csv, i.e. repeat the full load.
# Section 3 must have run in this kernel: the slice cannot be rebuilt from
# last_extraction.txt at this point because the checkpoint cell has already
# advanced it to the newest timestamp.

# 1. Cleaning: fillna returns a new frame, so the column writes below do not
# touch the frame that Section 3 sliced from.
df_incremental = df_incremental.fillna(0)

# 2. Enrichment: derive total_price only when both inputs are present
if 'quantity' in df_incremental.columns and 'unit_price' in df_incremental.columns:
    df_incremental['total_price'] = df_incremental['quantity'] * df_incremental['unit_price']

# 3. Structural: normalise the date column to YYYY-MM-DD text
if 'date' in df_incremental.columns:
    df_incremental['date'] = pd.to_datetime(df_incremental['date']).dt.strftime('%Y-%m-%d')

# Persist the transformed incremental rows without the index column
df_incremental.to_csv('transformed_incremental.csv', index=False)
print("Transformed incremental data saved as transformed_incremental.csv")

Transformed incremental data saved as transformed_incremental.csv
