In [1]:
import pandas as pd


You are a Product Analyst on the **Instagram** Stories team investigating story creation patterns. The team wants to understand the distribution of stories created by users daily. You will analyze user storytelling behavior to optimize engagement strategies.

In [2]:
# Load the transaction data
stories_data = pd.read_csv('stories_data.csv')

# Display the DataFrame
print(stories_data)


      user_id    story_date  story_count
0    user_001    2024-07-03          3.0
1    user_001    2024-07-03          3.0
2    user_001    2024-08-15          5.0
3    user_001    2024-09-10          0.0
4    user_001    2024-10-05         20.0
5    user_001    07/15/2024          2.0
6    user_002    2024-07-03          4.0
7    user_002    2024-07-04          3.0
8    user_002           NaN          6.0
9    user_002    2024-12-25          1.0
10   user_002    2025-01-15          7.0
11   user_002    2025-06-29         10.0
12   user_003    2024-07-10          2.0
13   user_003    2024-08-20          8.0
14   user_003    2024-08-20          8.0
15   user_003    2025-03-11          5.0
16        NaN    2025-03-12          3.0
17   USER_003    2025-04-01          4.0
18   user_004    2024-07-15          6.0
19   user_004    2024-09-30          7.0
20   user_004    2024/10/10          4.0
21   user_004    2024-11-11          3.0
22   user_004    2025-02-28         12.0
23   user_004   

### Question 1 of 3

Take a look at the data in the `story_date` column. Correct any data type inconsistencies in that column.

In [3]:
# Clean and standardize the `story_date` column in-place (leave other columns unchanged)

dates = stories_data['story_date']

# Parse mixed date formats; coerce invalid strings to NaT
cleaned = pd.to_datetime(dates, errors='coerce')

stories_data['story_date'] = cleaned

# Display the cleaned DataFrame
print(stories_data)


      user_id story_date  story_count
0    user_001 2024-07-03          3.0
1    user_001 2024-07-03          3.0
2    user_001 2024-08-15          5.0
3    user_001 2024-09-10          0.0
4    user_001 2024-10-05         20.0
5    user_001        NaT          2.0
6    user_002 2024-07-03          4.0
7    user_002 2024-07-04          3.0
8    user_002        NaT          6.0
9    user_002 2024-12-25          1.0
10   user_002 2025-01-15          7.0
11   user_002 2025-06-29         10.0
12   user_003 2024-07-10          2.0
13   user_003 2024-08-20          8.0
14   user_003 2024-08-20          8.0
15   user_003 2025-03-11          5.0
16        NaN 2025-03-12          3.0
17   USER_003 2025-04-01          4.0
18   user_004 2024-07-15          6.0
19   user_004 2024-09-30          7.0
20   user_004        NaT          4.0
21   user_004 2024-11-11          3.0
22   user_004 2025-02-28         12.0
23   user_004 2025-03-01          0.0
24   user_005 2024-08-01          1.0
25   user_00

### Question 2 of 3

Calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day.

In [4]:
# Lowercase user_id and drop duplicates, then compute per-user-per-day percentiles
tmp = stories_data[['user_id', 'story_date', 'story_count']].copy()
tmp['user_id'] = tmp['user_id'].astype('string').str.strip().str.lower()

# Remove exact duplicate rows
tmp = tmp.drop_duplicates(subset=['user_id', 'story_date', 'story_count'])

# Aggregate to user-day counts
daily = tmp.groupby(['user_id', 'story_date'], as_index=False)['story_count'].sum()

# Percentiles
percentiles = daily['story_count'].quantile([0.25, 0.50, 0.75])
percentiles.index = ['p25', 'p50', 'p75']
percentiles = percentiles.to_dict()

# Display the percentiles
print(percentiles)


{'p25': 3.0, 'p50': 5.0, 'p75': 9.25}


### Question 3 of 3

What percentage of users have had at least one day where they posted more than 10 stories on that day?

Think about how you might identify those users and calculate the percentage relative to the total user base.

In [5]:
# Percentage of users with at least one day > 10 stories
tmp = stories_data[['user_id', 'story_date', 'story_count']].copy()

# Normalize user_id casing and whitespace
tmp['user_id'] = tmp['user_id'].astype('string').str.strip().str.lower()

# Coerce counts to numeric and drop invalids
tmp['story_count'] = pd.to_numeric(tmp['story_count'], errors='coerce')
tmp = tmp.dropna(subset=['user_id', 'story_date', 'story_count'])

# Remove exact duplicate rows
tmp = tmp.drop_duplicates(subset=['user_id', 'story_date', 'story_count'])

# Aggregate per user-day (calendar day)
tmp['day'] = tmp['story_date'].dt.normalize()
daily = tmp.groupby(['user_id', 'day'], as_index=False)['story_count'].sum()

# Users with any day > 10 stories
users_with_gt10 = set(daily.loc[daily['story_count'] > 10, 'user_id'])

# Denominator: all unique users in the dataset (normalized)
all_users = set(tmp['user_id'])

pct_users = (len(users_with_gt10) / len(all_users) * 100) if all_users else 0.0

print({
    'users_with_gt10': len(users_with_gt10),
    'all_users': len(all_users),
    'percentage': round(pct_users, 2)
})

# Display the percentage of users with at least one day > 10 stories
print(f"Percentage of users with at least one day > 10 stories: {round(pct_users, 2)}%")


{'users_with_gt10': 5, 'all_users': 10, 'percentage': 50.0}
Percentage of users with at least one day > 10 stories: 50.0%
