## Case 1 - Create dataset with specific criteria

"create a dataframe in python with random sales for a period of 1 month. I want the Columns Name Surname, Age, Quantity, Budget, Date of Purchase and Store Name. Purchase date cannot be greater than today and men should be 70% of the dataset. Total rows 100."

In [3]:
%pip install --upgrade pip setuptools wheel
%pip install pandas numpy


Note: you may need to restart the kernel to use updated packages.
Collecting pandas
  Using cached pandas-2.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.0-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]2m3

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Pools to sample from: first names per gender, surnames, and store labels
male_names = ['John', 'Michael', 'David', 'James', 'Robert', 'William', 'Richard']
female_names = ['Mary', 'Patricia', 'Linda', 'Barbara', 'Elizabeth', 'Jennifer', 'Maria']
surnames = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller']
stores = ['Store A', 'Store B', 'Store C', 'Store D']

# Dataset size and gender split: 70% men, the remainder women
num_rows = 100
num_men = int(num_rows * 0.7)
num_women = num_rows - num_men

# One gender label per row, shuffled so men and women are interleaved.
# The list stays positionally aligned with the DataFrame rows built below.
genders = ['M'] * num_men + ['F'] * num_women
random.shuffle(genders)

# First name drawn from the pool matching each row's gender
name_pools = {'M': male_names, 'F': female_names}
names = [random.choice(name_pools[g]) for g in genders]
surnames_list = [random.choice(surnames) for _ in range(num_rows)]

# Numeric columns (numpy's randint upper bound is exclusive):
# ages 18..64, quantities 1..9, budgets in [10, 500) rounded to cents
ages = np.random.randint(18, 65, size=num_rows)
quantities = np.random.randint(1, 10, size=num_rows)
budgets = np.round(np.random.uniform(10, 500, size=num_rows), 2)

# Purchase dates fall in the 30 days up to and including today,
# so no date can lie in the future
today = datetime.today()
start_date = today - timedelta(days=30)
window_days = (today - start_date).days
day_offsets = [random.randint(0, window_days) for _ in range(num_rows)]
dates = [start_date + timedelta(days=offset) for offset in day_offsets]

store_names = [random.choice(stores) for _ in range(num_rows)]

# Assemble the DataFrame; only the calendar date of each purchase is kept
columns = {
    'Name': names,
    'Surname': surnames_list,
    'Age': ages,
    'Quantity': quantities,
    'Budget': budgets,
    'Date of Purchase': [moment.date() for moment in dates],
    'Store Name': store_names,
}
df = pd.DataFrame(columns)

df

Unnamed: 0,Name,Surname,Age,Quantity,Budget,Date of Purchase,Store Name
0,Robert,Williams,42,4,277.36,2025-06-04,Store C
1,Barbara,Garcia,59,3,93.29,2025-06-09,Store B
2,James,Williams,50,2,388.44,2025-06-12,Store C
3,Richard,Johnson,38,6,478.46,2025-06-04,Store D
4,David,Williams,22,6,286.99,2025-07-01,Store A
...,...,...,...,...,...,...,...
95,Michael,Johnson,63,4,146.10,2025-06-16,Store B
96,Patricia,Smith,23,6,14.06,2025-06-29,Store C
97,Michael,Jones,38,9,81.85,2025-06-16,Store C
98,Linda,Williams,45,4,447.93,2025-06-08,Store C


In [8]:
from collections import Counter

# Validate the rules for the generated dataset
# NOTE: this is the first validation attempt, kept as a record. It stops with a
# TypeError at check 4 (see the note there), so `validation_results` is only
# partially filled when the cell fails.


validation_results = {}

# 1. Check total rows
validation_results['total_rows'] = len(df) == 100

# 2. Check columns
expected_columns = {'Name', 'Surname', 'Age', 'Quantity', 'Budget', 'Date of Purchase', 'Store Name'}
validation_results['columns'] = set(df.columns) == expected_columns

# 3. Check men are 70% of dataset
# The DataFrame has no gender column, so the share is computed from the
# `genders` list that was used to generate the rows.
gender_count = Counter(genders)
validation_results['men_70_percent'] = abs(gender_count['M'] / len(genders) - 0.7) < 0.01

# 4. Check purchase date is not greater than today
df_dates = pd.to_datetime(df['Date of Purchase'])
# NOTE: the next line raises "TypeError: Invalid comparison between
# dtype=datetime64[ns] and date": `df_dates` holds pandas timestamps while
# `today.date()` is a plain `datetime.date`.
validation_results['dates_not_future'] = all(df_dates <= today.date())

# 5. Check date range is within 1 month
# Measures the span between the earliest and latest purchase date.
validation_results['dates_within_1_month'] = (df_dates.max() - df_dates.min()).days <= 31

validation_results

TypeError: Invalid comparison between dtype=datetime64[ns] and date

In [9]:
from collections import Counter

# Validate the generated dataset against the rules from the prompt.

# Columns the DataFrame must contain (compared as a set, so order is ignored)
expected_columns = {'Name', 'Surname', 'Age', 'Quantity', 'Budget', 'Date of Purchase', 'Store Name'}

# The DataFrame has no gender column, so the share of men is measured on the
# `genders` list that drove the row generation.
gender_count = Counter(genders)
male_share = gender_count['M'] / len(genders)

# Compare timestamps with a Timestamp: comparing the datetime64 column with a
# plain `datetime.date` raises TypeError in pandas.
df_dates = pd.to_datetime(df['Date of Purchase'])
today_ts = pd.Timestamp(datetime.now())  # or: today_ts = pd.Timestamp.today()

# Span in days between the earliest and the latest purchase
date_span_days = (df_dates.max() - df_dates.min()).days

validation_results = {
    # 1. Check total rows
    'total_rows': len(df) == 100,
    # 2. Check columns
    'columns': set(df.columns) == expected_columns,
    # 3. Check men are 70% of dataset
    'men_70_percent': abs(male_share - 0.7) < 0.01,
    # 4. Check purchase date is not greater than today
    'dates_not_future': (df_dates <= today_ts).all(),
    # 5. Check date range is within 1 month
    'dates_within_1_month': date_span_days <= 31,
}

validation_results

{'total_rows': True,
 'columns': True,
 'men_70_percent': True,
 'dates_not_future': np.True_,
 'dates_within_1_month': True}

### notes/findings:
- A `pip install` was required before the code could run.
- It understood the context and created the dataset.
- When asked for a validation script, it made an error (comparing a `datetime64[ns]` column with a plain `date`), which raised a `TypeError` and required a manual adjustment.


## Case 2 - Data Manipulation


Now I want to add +50 in the quantities of the woman and - 50 from the quantities of the men. Also create a new column which shows the rate quantity/budget. Finally add the country in a new column after the store name. Note that The same name cannot be in 2 different countries  

In [10]:
# Keep an untouched snapshot so the manipulation can be validated afterwards
df_orig = df.copy()

# Adjust quantities: +50 for women, -50 for men.
# The DataFrame has no gender column; `genders` is the per-row list the names
# were generated from, so it lines up with the rows by position.
quantity_offsets = [50 if g == 'F' else -50 for g in genders]
df['Quantity'] = df['Quantity'] + quantity_offsets

# Create rate column: Quantity / Budget (computed from the adjusted quantities)
df['Rate'] = df['Quantity'] / df['Budget']

# Assign countries ensuring the same name is not in 2 different countries:
# draw exactly one country per distinct first name and reuse it for every row
# with that name. `dict.fromkeys` yields the distinct names in first-seen
# order, so countries are drawn in the order names first appear in the data.
countries = ['USA', 'Canada', 'UK', 'Germany']
name_to_country = {
    name: random.choice(countries)
    for name in dict.fromkeys(df['Name'])
}
assigned_countries = [name_to_country[name] for name in df['Name']]

# Insert country column after 'Store Name'
df.insert(df.columns.get_loc('Store Name') + 1, 'Country', assigned_countries)

df.head()

Unnamed: 0,Name,Surname,Age,Quantity,Budget,Date of Purchase,Store Name,Country,Rate
0,Robert,Williams,42,-46,277.36,2025-06-04,Store C,Germany,-0.165849
1,Barbara,Garcia,59,53,93.29,2025-06-09,Store B,UK,0.568121
2,James,Williams,50,-48,388.44,2025-06-12,Store C,UK,-0.123571
3,Richard,Johnson,38,-44,478.46,2025-06-04,Store D,USA,-0.091962
4,David,Williams,22,-44,286.99,2025-07-01,Store A,Canada,-0.153315


In [11]:
# Validate the manipulation by comparing df against the df_orig snapshot

# 1. Quantity must have moved by +50 for women and by -50 for men.
#    `genders` supplies the per-row gender, since df has no gender column.
expected_diff = np.array([-50 if g != 'F' else 50 for g in genders])
quantity_diff = df['Quantity'].values - df_orig['Quantity'].values

# 2. Rate must equal Quantity / Budget (compared with floating-point tolerance)
recomputed_rate = df['Quantity'] / df['Budget']
rate_correct = np.allclose(df['Rate'], recomputed_rate)

# 3. Country column must exist, and every first name must map to one country
country_column_exists = 'Country' in df.columns
countries_per_name = df.groupby('Name')['Country'].nunique()
name_country_unique = countries_per_name.max() == 1

validation_manipulation = {
    'quantity_adjustment': np.array_equal(quantity_diff, expected_diff),
    'rate_column': rate_correct,
    'country_column_exists': country_column_exists,
    'name_country_unique': name_country_unique,
}

validation_manipulation

{'quantity_adjustment': True,
 'rate_column': True,
 'country_column_exists': True,
 'name_country_unique': np.True_}

### notes/findings
- Perfect implementation, totally correct.