In [86]:
import pandas as pd
import numpy as np

# Section 1: Data Preparation
# Task: Create a synthetic dataset with a mix of numerical, categorical, and datetime data.
data = {
    'ID': range(1, 101),
    'Category': np.random.choice(['A', 'B', 'C'], size=100),
    'Value': np.random.uniform(10, 100, size=100),
    'Date': pd.date_range(start='2023-01-01', periods=100)
}

df = pd.DataFrame(data)
print("Sample Data:")
print(df.head())

# Section 2: Data Transformation
# Task: Perform the following transformations:
# 1. Add a new column that categorizes 'Value' into bins: Low (<30), Medium (30-70), High (>70).
# 2. Create a pivot table showing the average 'Value' for each 'Category' and 'Date'.

# Adding the 'Value_Category' column
def categorize_value(value):
    if value < 30:
        return 'Low'
    elif value <= 70:
        return 'Medium'
    else:
        return 'High'

df['Value_Category'] = df['Value'].apply(categorize_value)

# Creating the pivot table
pivot_table = df.pivot_table(
    values='Value', 
    index='Category', 
    columns=df['Date'].dt.month, 
    aggfunc='mean', 
    fill_value=0
)

print("\nPivot Table:")
print(pivot_table)

# Section 3: Advanced Data Analysis
# Task: Perform advanced analysis to:
# 1. Identify the top 3 dates with the highest average 'Value' for each category.
# 2. Calculate the cumulative sum of 'Value' for each category over time.

# Top 3 dates with highest average 'Value' for each category
top_dates = df.groupby(['Category', 'Date'])['Value'].mean().reset_index()
top_dates = top_dates.sort_values(['Category', 'Value'], ascending=[True, False])
top_3_dates = top_dates.groupby('Category').head(3)

print("\nTop 3 Dates with Highest Average Value for Each Category:")
print(top_3_dates)

# Cumulative sum of 'Value' for each category
df['Cumulative_Value'] = df.groupby('Category')['Value'].cumsum()

print("\nData with Cumulative Sum:")
print(df[['ID', 'Category', 'Value', 'Cumulative_Value']].head(10))

Sample Data:
   ID Category      Value       Date
0   1        A  69.151448 2023-01-01
1   2        C  85.727726 2023-01-02
2   3        A  40.749970 2023-01-03
3   4        C  72.481293 2023-01-04
4   5        B  97.692348 2023-01-05

Pivot Table:
Date              1          2          3          4
Category                                            
A         53.077450  48.999103  58.874453  54.152560
B         60.937466  67.424459  53.267355  13.062362
C         63.673092  52.781301  51.000085  50.374148

Top 3 Dates with Highest Average Value for Each Category:
   Category       Date      Value
9         A 2023-02-14  96.310907
27        A 2023-03-27  92.960253
2         A 2023-01-13  90.298900
33        B 2023-01-06  98.377581
32        B 2023-01-05  97.692348
48        B 2023-02-21  88.081276
93        C 2023-03-28  96.053085
64        C 2023-01-15  93.901245
91        C 2023-03-21  93.473617

Data with Cumulative Sum:
   ID Category      Value  Cumulative_Value
0   1        A  

**Section 1: Data Preparation**  
**Task: Create a synthetic dataset with a mix of numerical, categorical, and datetime data.**

In [96]:
import random
from datetime import datetime, timedelta
import pandas as pd

n = 100
id = []
gender = []
gender_option = ['M', 'F', 'U']
date = []
s_date = datetime(2024, 1, 1)  
e_date = datetime(2024, 12, 25) 

def random_date(start, end):
    delta = end - start
    random_days = random.randint(0, delta.days)  
    return start + timedelta(days=random_days)

for i in range (n):
    id.append(i+1)
    gender.append(random.choice(gender_option))
    date.append(random_date(s_date, e_date).date())

df = pd.DataFrame({
    'id': id,
    'gender': gender,
    'date': [d.strftime('%Y/%m/%d') for d in date]
})

df.head(5)

Unnamed: 0,id,gender,date
0,1,U,2024/06/23
1,2,U,2024/10/25
2,3,M,2024/02/09
3,4,M,2024/01/22
4,5,U,2024/07/14


**Section 2: Data Transformation**  
**Task: Perform the following transformations:**  
**1. Add a new column that categorizes 'Value' into bins: Low (<30), Medium (30-70), High (>70).**  
**2. Create a pivot table showing the average 'Value' for each 'Category' and 'Date'.**  

**Adding the 'Value_Category' column**

In [97]:
df['Values'] = [random.randint(1, 100) for _ in range(n)]
df['Classification'] = pd.cut(df['Values'], bins=[0, 30, 70, 100], labels=['Low', 'Medium', 'High'])

In [98]:
df.head(5)

Unnamed: 0,id,gender,date,Values,Classification
0,1,U,2024/06/23,99,High
1,2,U,2024/10/25,16,Low
2,3,M,2024/02/09,100,High
3,4,M,2024/01/22,40,Medium
4,5,U,2024/07/14,47,Medium


In [100]:
pivot_table = pd.pivot_table(
    df,
    values="Values",
    index="date",
    columns="gender",
    aggfunc="mean"
)

pivot_table = pivot_table.fillna(0)

pivot_table

gender,F,M,U
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024/01/10,0.0,92.0,18.0
2024/01/11,0.0,0.0,20.0
2024/01/13,0.0,0.0,91.0
2024/01/14,24.0,0.0,0.0
2024/01/15,0.0,52.0,0.0
...,...,...,...
2024/12/07,0.0,0.0,30.0
2024/12/09,0.0,0.0,4.0
2024/12/15,0.0,81.0,0.0
2024/12/18,0.0,32.0,0.0


**Section 3: Advanced Data Analysis**  
**Task: Perform advanced analysis to:**  
**1. Identify the top 3 dates with the highest average 'Value' for each category.**  
**2. Calculate the cumulative sum of 'Value' for each category over time.**  

In [101]:
top_3_dates = {
    gender: pivot_table[gender].nlargest(3)
    for gender in pivot_table.columns
}

top_3_dates

{'F': date
 2024/11/08    98.0
 2024/09/11    97.0
 2024/08/31    91.0
 Name: F, dtype: float64,
 'M': date
 2024/02/09    100.0
 2024/06/18    100.0
 2024/11/13     99.0
 Name: M, dtype: float64,
 'U': date
 2024/06/23    99.0
 2024/07/20    97.0
 2024/06/29    95.0
 Name: U, dtype: float64}

In [102]:
cumulative_sums = pivot_table.cumsum()
cumulative_sums

gender,F,M,U
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024/01/10,0.0,92.0,18.0
2024/01/11,0.0,92.0,38.0
2024/01/13,0.0,92.0,129.0
2024/01/14,24.0,92.0,129.0
2024/01/15,24.0,144.0,129.0
...,...,...,...
2024/12/07,1507.0,1759.5,1623.0
2024/12/09,1507.0,1759.5,1627.0
2024/12/15,1507.0,1840.5,1627.0
2024/12/18,1507.0,1872.5,1627.0
