# Timeframe Analysis

In this notebook we analyse the timestamp property of the dataset.
We want to briefly check whether there are anomalies in the timeframe, e.g. days on which no prompts were collected.


In [None]:
import pandas as pd
import my_utils

# Alternative input kept for reference:
#   pd.read_parquet('sources/metadata.parquet', engine='pyarrow')
df_large = pd.read_parquet('sources/metadata-large.parquet', engine='pyarrow')

# One row per distinct prompt text (drop_duplicates keeps the first row it sees).
# The .copy() detaches the result so that columns added to it later do not
# write through to, or warn about, df_large.
df_large_unique_prompts = df_large.drop_duplicates(subset="prompt").copy()

# Artist names read via the project helper
# (presumably one name per line of the text file - confirm in my_utils).
excel_artist_names = my_utils.read_lines_as_list("sources/excel_artists_copy_paste_name.txt")

# Leading slices of the artist list. The asserts fail fast if the source file
# holds fewer names than expected, because slicing alone would silently return less.
hundred_artist_names = excel_artist_names[:100]
ten_artist_names = excel_artist_names[:10]
assert len(hundred_artist_names) == 100
assert len(ten_artist_names) == 10

## Data Collection Interval

In [None]:
# Summarise the collection interval of the timestamp column.
# `datetime_is_numeric` is only accepted by pandas 1.x; pandas 2.0 removed the
# keyword because datetime data is always summarised numerically there.
# Fall back to the plain call when the keyword is rejected so the cell runs on both.
try:
    timestamp_summary = df_large.timestamp.describe(datetime_is_numeric=True)
except TypeError:
    timestamp_summary = df_large.timestamp.describe()
print(timestamp_summary)

## Data Distribution over time

In [None]:
# Analysing by Day requires a new column
import matplotlib.pyplot as plt

weekday_mapping = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]


def to_day_label(ts):
    """Return a two-line axis label: ISO date (YYYY-MM-DD), then the weekday name.

    Month and day are both zero-padded so that sorting the labels as plain
    strings yields chronological order, also across one- and two-digit months.
    """
    return f"{ts.strftime('%Y-%m-%d')}\n{weekday_mapping[ts.weekday()]}"


df_large["date"] = df_large.timestamp.apply(to_day_label)
df_large_unique_prompts["date"] = df_large_unique_prompts.timestamp.apply(to_day_label)

# Prompts per day, ordered chronologically through the sortable label.
vc = df_large.date.value_counts(sort=False).sort_index()

# Deduplicated prompts per day, aligned onto the same days as `vc`.
# A day whose prompts were all first seen on another day has no row in the
# deduplicated frame; it must show up as 0 instead of shortening the series,
# otherwise x and y lengths differ and the plot call fails.
unique_vc = df_large_unique_prompts.date.value_counts(sort=False).reindex(vc.index, fill_value=0)

x_data = vc.index.tolist()
y_data_total = vc.tolist()
y_data_unique = unique_vc.tolist()

# Two y-axes because total and unique counts live on different scales.
fig, ax1 = plt.subplots(figsize=(20,6))
ax2 = ax1.twinx()
ax1.plot(x_data, y_data_total, 'g-')
ax2.plot(x_data, y_data_unique, 'b-')

ax1.set_xlabel('Date')
ax1.set_ylabel('Total Prompts', color='g')
ax2.set_ylabel('Unique Prompts', color='b')
ax1.set_title('Timestamp Distribution by Day')

plt.show()

In [None]:
# Analysing by Hour requires a new column
import matplotlib.pyplot as plt

weekday_mapping = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]


def to_hour_label(ts):
    """Return a three-line axis label: ISO date, weekday name, zero-padded hour.

    All numeric parts are zero-padded so that sorting the labels as plain
    strings yields chronological order.
    """
    return f"{ts.strftime('%Y-%m-%d')}\n{weekday_mapping[ts.weekday()]}\n{ts.strftime('%H')}"


# NOTE: this overwrites the "date" column with hourly labels; cells that need
# daily labels have to recompute the column.
df_large["date"] = df_large.timestamp.apply(to_hour_label)
df_large_unique_prompts["date"] = df_large_unique_prompts.timestamp.apply(to_hour_label)

# Prompts per hour, ordered chronologically through the sortable label.
vc = df_large.date.value_counts(sort=False).sort_index()

# Deduplicated prompts per hour, aligned onto the same hours as `vc`.
# An hour in which every prompt repeats one first seen in another hour has no
# row in the deduplicated frame; it is filled with 0 so both series keep the
# same length and stay aligned on the shared x-axis.
unique_vc = df_large_unique_prompts.date.value_counts(sort=False).reindex(vc.index, fill_value=0)

x_data = vc.index.tolist()
y_data_total = vc.tolist()
y_data_unique = unique_vc.tolist()

# Two y-axes because total and unique counts live on different scales.
fig, ax1 = plt.subplots(figsize=(20,6))
ax2 = ax1.twinx()
ax1.plot(x_data, y_data_total, 'g-')
ax2.plot(x_data, y_data_unique, 'b-')

# One tick per hour is unreadable, so only the midnight tick of each day keeps
# its label. The decision is taken from the label text (hour "00") rather than
# from a fixed index offset, so it stays correct whatever the first hour of
# the data is and even when some hours are missing.
for axis in (ax1, ax2):
    for tick_label, hour_label in zip(axis.xaxis.get_ticklabels(), x_data):
        if not hour_label.endswith("\n00"):
            tick_label.set_visible(False)

ax1.set_xlabel('Date and Hour')
ax1.set_ylabel('Total Prompts', color='g')
ax2.set_ylabel('Unique Prompts', color='b')
ax1.set_title('Timestamp Distribution by Hour')

plt.show()

## Average Prompt Length per day

In [None]:
import matplotlib.pyplot as plt

weekday_mapping = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]


def to_day_label(ts):
    """Return a two-line axis label: ISO date (YYYY-MM-DD), then the weekday name.

    Month and day are both zero-padded so that the sorted group keys come out
    in chronological order, also across one- and two-digit months.
    """
    return f"{ts.strftime('%Y-%m-%d')}\n{weekday_mapping[ts.weekday()]}"


# Recompute the daily label: an earlier cell may have left hourly labels in "date".
df_large["date"] = df_large.timestamp.apply(to_day_label)
df_large_unique_prompts["date"] = df_large_unique_prompts.timestamp.apply(to_day_label)

# Prompt length in characters.
df_large["prompt_length"] = df_large.prompt.str.len()
df_large_unique_prompts["prompt_length"] = df_large_unique_prompts.prompt.str.len()

# Mean prompt length per day (groupby returns the days in sorted order).
length_date = df_large.groupby("date").prompt_length.mean()
# Align the deduplicated series onto the same days. A day without any
# first-seen prompt becomes NaN (a gap in the line) instead of shortening the
# series and breaking the shared x-axis.
length_date_unique = (
    df_large_unique_prompts.groupby("date").prompt_length.mean().reindex(length_date.index)
)

x_data = length_date.index.tolist()
y_data_total = length_date.tolist()
y_data_unique = length_date_unique.tolist()

fig, ax1 = plt.subplots(figsize=(20,6))
ax2 = ax1.twinx()
ax1.plot(x_data, y_data_total, 'g-')
ax2.plot(x_data, y_data_unique, 'b-')

ax1.set_xlabel('Date')
ax1.set_ylabel('Total Prompts Average length', color='g')
ax2.set_ylabel('Unique Prompts average length', color='b')
ax1.set_title('Avg Prompt Length by Day')

plt.show()

## Number of Unique Users per Day

In [None]:
import matplotlib.pyplot as plt

weekday_mapping = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]


def to_day_label(ts):
    """Return a two-line axis label: ISO date (YYYY-MM-DD), then the weekday name.

    Month and day are both zero-padded so that the sorted group keys come out
    in chronological order, also across one- and two-digit months.
    """
    return f"{ts.strftime('%Y-%m-%d')}\n{weekday_mapping[ts.weekday()]}"


# Recompute the daily label: an earlier cell may have left hourly labels in "date".
df_large["date"] = df_large.timestamp.apply(to_day_label)
df_large_unique_prompts["date"] = df_large_unique_prompts.timestamp.apply(to_day_label)

# For every day, the array of distinct user names that submitted a prompt.
unique_users = df_large.groupby("date").user_name.agg("unique")
unique_users_unique = df_large_unique_prompts.groupby("date").user_name.agg("unique")

# Number of distinct users per day. The deduplicated series is aligned onto
# the days of the full series; a day missing from the deduplicated frame
# counts as 0 users so both lines share the same x positions.
user_counts = unique_users.map(len)
user_counts_unique = unique_users_unique.map(len).reindex(user_counts.index, fill_value=0)

x_data = user_counts.index.tolist()
y_data_total = user_counts.tolist()
y_data_unique = user_counts_unique.tolist()

print(f'Number of unique users per day')
print(x_data)
print(y_data_total)
print(y_data_unique)

fig, ax1 = plt.subplots(figsize=(20,6))
ax2 = ax1.twinx()
ax1.plot(x_data, y_data_total, 'g-')
ax2.plot(x_data, y_data_unique, 'b-')

ax1.set_xlabel('Date')
ax1.set_ylabel('Total Prompts Number of Users', color='g')
ax2.set_ylabel('Unique Prompts Number of Users', color='b')
ax1.set_title('num unique Users by Day')

plt.show()

The two series are nearly identical, so the lines overlap in the plot and can only be told apart when zooming in closely.

## Proportion of duplicates by day

That is: which day has the highest amount/proportion of duplicate prompts?

In [None]:
import matplotlib.pyplot as plt

# Share of rows whose prompt text already occurs elsewhere in the dataset.
# The value is a fraction in [0, 1]; the `%` format spec multiplies it by 100,
# so the number printed really is a percentage.
duplicate_share = (df_large.shape[0] - df_large_unique_prompts.shape[0]) / df_large.shape[0]
print(f'Across the entire dataset {duplicate_share:.2%} are duplicate prompts')
print(f'{df_large.shape[0]} prompts and {df_large_unique_prompts.shape[0]} deduplicated prompts')

weekday_mapping = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]


def to_day_label(ts):
    """Return a two-line axis label: ISO date (YYYY-MM-DD), then the weekday name.

    Month and day are both zero-padded so that the sorted group keys come out
    in chronological order, also across one- and two-digit months.
    """
    return f"{ts.strftime('%Y-%m-%d')}\n{weekday_mapping[ts.weekday()]}"


# Recompute the daily label: an earlier cell may have left hourly labels in "date".
df_large["date"] = df_large.timestamp.apply(to_day_label)
df_large_unique_prompts["date"] = df_large_unique_prompts.timestamp.apply(to_day_label)

# Per day: the distinct prompt texts and the number of prompt rows.
# Both come from the same groupby on df_large, so they share one index.
unique_prompts = df_large.groupby("date").prompt.agg("unique")
prompts_amount = df_large.groupby("date").prompt.agg("count")

# Percentage of a day's prompts that repeat another prompt of the SAME day.
# This is a within-day measure, unlike the dataset-wide figure printed above.
duplicate_pct = (prompts_amount - unique_prompts.map(len)) / prompts_amount * 100

x_data = duplicate_pct.index.tolist()
y_data_total = duplicate_pct.tolist()

fig, ax1 = plt.subplots(figsize=(20,6))
ax1.plot(x_data, y_data_total, 'g-')

ax1.set_xlabel('Date')
ax1.set_ylabel('Percentage of Duplicates', color='g')
ax1.set_title('Percentage Duplicates by Day')

plt.show()