In [48]:
import pandas as pd

In [49]:
# Load the dataset CSV into `df`, the DataFrame the cells below operate on.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../data_source/VideoConviction - with final refined prices.csv")

### Cleaning

In [50]:
# Remove the 'Unnamed: 0' column if the CSV contains one.
# errors='ignore' makes the drop a no-op when the column is absent, and
# rebinding `df` (instead of inplace=True) keeps the cell idempotent on re-run.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

### All column names

In [51]:
# List every column of df, one name per line, beneath a 20-dash rule.
column_names = "\n".join(df.columns)
print("df Columns:", "-" * 20, column_names, sep="\n")

df Columns:
--------------------
id
derived_inner_id
video_id
start
end
action
action_source
conviction_score
ticker_name
action_date
price
quantity
video_title
annotation_id
is_rec_present
original_inner_id
original_video_title
publishedAt
channelId
channelTitle
videoDescription
tags
defaultAudioLanguage
duration
isCaptionAvailable
viewCount
likeCount
favoriteCount
commentCount
comments
channelDescription
channelViewCount
channelSubscriberCount
videoCount
channelCategory
transcript
youtube_video_url
segment_transcript


### Check 1: Whether all compulsory column values are present (rows with `is_rec_present == 'Yes'`)

In [52]:
# Columns that the completeness check below requires to be non-empty.
compulsory_fields = [
    "start",
    "end",
    "action",
    "action_source",
    "conviction_score",
    "ticker_name",
    "video_title",
]

In [53]:
# Restrict the check to rows flagged is_rec_present == 'Yes'.
filtered_df = df.loc[df['is_rec_present'].eq('Yes')]

# Keep only the rows where at least one compulsory field is missing (NaN).
has_missing_field = filtered_df[compulsory_fields].isnull().any(axis='columns')
empty_rows = filtered_df.loc[has_missing_field]

# Report offenders with 1-based row numbers (DataFrame index label + 1).
for row_label in empty_rows.index:
    print(f"Row {row_label + 1} has empty values in the specified columns.")

### Metric: Number of Annotated datapoints

In [54]:
# Columns whose populated cells are tallied by the annotated-datapoint metric.
annotated_columns = [
    "start",
    "end",
    "action",
    "action_source",
    "conviction_score",
    "ticker_name",
    "action_date",
    "price",
    "quantity",
    "video_title",
    "is_rec_present",
]

In [55]:
# Dimensions of df as (n_rows, n_columns); as the cell's last expression it is displayed as output.
df.shape

(760, 38)

In [56]:
# Per-column tally of populated (non-NaN) cells among the annotated columns.
non_nan_counts = df[annotated_columns].count()
print(non_nan_counts)

# Grand total: add up the per-column tallies computed above.
total_non_nan = non_nan_counts.sum()
print("Total non-NaN values:", total_non_nan)

start               687
end                 687
action              687
action_source       687
conviction_score    687
ticker_name         687
action_date          19
price               395
quantity              7
video_title         760
is_rec_present      760
dtype: int64
Total non-NaN values: 6063


### Metric: Number of Total Datapoints (excluding ids)

In [57]:
# Columns that are dropped before the total datapoint count is computed.
# NOTE: an "annotator" entry exists only as a disabled (commented-out) item.
exclude_columns = ["derived_inner_id", "annotation_id", "original_inner_id", "original_video_title"]

# Build a view of the data without the excluded columns (df itself is not modified).
df_filtered = df.drop(exclude_columns, axis=1)

# Per-column number of populated (non-NaN) cells.
column_counts = df_filtered.notna().sum()
print("Non-NaN counts for each column:")
print(column_counts)

# Grand total over every remaining column.
total_non_nan = column_counts.sum()
print("\nTotal non-NaN values:", total_non_nan)

Non-NaN counts for each column:
id                        760
video_id                  760
start                     687
end                       687
action                    687
action_source             687
conviction_score          687
ticker_name               687
action_date                19
price                     395
quantity                    7
video_title               760
is_rec_present            760
publishedAt               760
channelId                 760
channelTitle              760
videoDescription          760
tags                      730
defaultAudioLanguage      638
duration                  760
isCaptionAvailable        760
viewCount                 760
likeCount                 754
favoriteCount             760
commentCount              726
comments                  760
channelDescription        760
channelViewCount          760
channelSubscriberCount    760
videoCount                760
channelCategory           760
transcript                760
youtube_

### Metric: Time range for earliest and latest video

In [58]:
# Parse the publish timestamps into datetimes so min/max compare chronologically.
df['publishedAt'] = pd.to_datetime(df['publishedAt'])

# Oldest and newest publish times across all rows.
earliest, latest = df['publishedAt'].min(), df['publishedAt'].max()

print("Earliest publishedAt:", earliest)
print("Latest publishedAt:", latest)

Earliest publishedAt: 2018-01-01 22:00:01+00:00
Latest publishedAt: 2024-07-10 15:45:05+00:00


### Metric: Total Video Duration

In [59]:
# Count each video once: keep only the first row seen per video_id.
df_unique = df.drop_duplicates(subset=['video_id'])

# Add up the per-video durations; the divisions below treat the sum as seconds.
total_duration = df_unique['duration'].sum()

print("Total Duration:", total_duration / 60, "minutes")
print("Total Duration:", total_duration / 3600, "hours")

Total Duration: 2558.3 minutes
Total Duration: 42.638333333333335 hours


### Metric: Average Video Duration

In [60]:
# One row per video: keep the first occurrence of each video_id.
df_unique = df.drop_duplicates(subset=['video_id'])

# Mean of the per-video 'duration' values.
average_duration = df_unique['duration'].mean()

# Dividing by 60 converts the mean to the minutes shown in the label.
print("Average duration (in minutes):", average_duration / 60)

Average duration (in minutes): 8.88298611111111


This comes to approximately 8 minutes 53 seconds (8.883 min × 60 ≈ 533 s).

### Metric: Top 20 ticker names (frequency)

In [61]:
# Frequency of each ticker_name, most common first, limited to the top 20.
# value_counts() leaves NaN tickers out of the tally by default.
top20_tickers = df['ticker_name'].value_counts().head(20)

# Backward-compatible alias: this Series used to be called `top10` even though
# it holds 20 entries; keep the old name bound in case other cells reference it.
top10 = top20_tickers

# Print the 20 most common ticker names with their frequencies.
print(top20_tickers)

ticker_name
TSLA     26
NVDA     11
AMZN     10
AAPL     10
MSFT      8
NIO       7
F         6
SOFI      6
ZM        5
PLTR      5
ORCL      5
GOOG      5
GOOGL     5
AVGO      5
CRWD      4
BABA      4
DKNG      4
CGC       4
AMD       4
FTNT      4
Name: count, dtype: int64


### Metric: number of video segments (from is_rec_present == "Yes")

In [62]:
# Select the rows flagged is_rec_present == "Yes".
filtered_df = df.loc[df['is_rec_present'].eq("Yes")]

# Each remaining row is counted as one segment.
row_count = len(filtered_df)
print("Number of segments:", row_count)

Number of segments: 687


### Metric: Average video segment duration

In [63]:
# New frame with an extra 'difference' column (end - start per row); df is not modified.
df_new = df.assign(difference=df['end'] - df['start'])

# Mean of the per-row differences (NaN rows are skipped by mean()).
average_difference = df_new['difference'].mean()

# Divide by 60 to match the minutes label in the printed message.
print("Average segment length (in minutes):", average_difference / 60)

Average segment length (in minutes): 1.9124043514096105


### Metric: number of unique channel ids

In [64]:
# Number of distinct, non-NaN channelId values across all rows.
unique_channel_count = len(df['channelId'].dropna().unique())
print("Number of unique channelId:", unique_channel_count)

Number of unique channelId: 22


### Metric: min and max channel views and subscribers

In [65]:
# Smallest and largest subscriber / view totals found across all rows.
result = df.loc[:, ['channelSubscriberCount', 'channelViewCount']].agg(['min', 'max'])
print(result)

     channelSubscriberCount  channelViewCount
min                   21900           1340097
max                  733000         114942285
