# Descriptive Statistics Channels

In [2]:
import os
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# folder_path = 'dataset/output_videos_channels_dataset'
# csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv.gz')]
# df_list = []
# for file in csv_files:
#     df = pd.read_csv(file, compression='gzip')
#     df_list.append(df)
# merged_df = pd.concat(df_list, ignore_index=True)
# output_file = 'dataset/output_videos_channels_dataset/merged_videos_channels_output.csv.gz'
# merged_df.to_csv(output_file, index=False, compression='gzip')
# print(f"Merged file saved as {output_file}")


In [4]:
validate_df = pd.read_csv('dataset/output_videos_channels_dataset/videos_from_980000_until_990000_channels.csv.gz', compression='gzip')
validate_df.count()

channel_id             313569
video_id               313569
video_title            313568
video_length           313568
video_description      255846
video_thumbnail_url    313569
video_views_count      313489
video_publish_date     313489
date_of_capture        313569
dtype: int64

In [7]:
dataset_path = os.path.abspath(os.path.join(os.getcwd(), 'dataset', 'output_videos_channels_dataset', 'merged_videos_channels_output.csv.gz'))
channels_dataset_df = pd.read_csv(dataset_path, compression='gzip', engine='python')
channels_dataset_df.dtypes

channel_id             object
video_id               object
video_title            object
video_length           object
video_description      object
video_thumbnail_url    object
video_views_count      object
video_publish_date     object
date_of_capture        object
dtype: object

In [9]:
channels_dataset_df.count()

channel_id             35938268
video_id               35575241
video_title            35575011
video_length           35574433
video_description      30629599
video_thumbnail_url    35002281
video_views_count      34991394
video_publish_date     34991867
date_of_capture        35010807
dtype: int64

In [8]:
index_position = 3
record = channels_dataset_df.iloc[index_position]
print(record)

channel_id                                      UCmyh_vlEbxyXIP0vrxb6wSA
video_id                                                     OCT6dsR3PFM
video_title            Work Out With Conor McGregor in the McGregor F...
video_length                                                        30.0
video_description      Conor dropped by the FAST studio to record a v...
video_thumbnail_url    https://i.ytimg.com/vi/OCT6dsR3PFM/hqdefault.j...
video_views_count                                                29509.0
video_publish_date                                           2 years ago
date_of_capture                               2024-10-03 03:29:02.560504
Name: 3, dtype: object


In [10]:
index_position = 28000000
record = channels_dataset_df.iloc[index_position]
print(record)

channel_id                                      UCPy2TZVSD3oau5nsXDIRgSg
video_id                                                     1SXzu5QhtbE
video_title            PINTURA LAVANDERIA E COZINHA | COMO FAZER tint...
video_length                                                       210.0
video_description      OI GENTEE!! No vídeo de hoje pintamos a cozinh...
video_thumbnail_url    https://i.ytimg.com/vi/1SXzu5QhtbE/hqdefault.j...
video_views_count                                                  338.0
video_publish_date                                            1 year ago
date_of_capture                               2024-10-09 17:55:40.231069
Name: 28000000, dtype: object


## How many of the records with active status & inactive status

In [11]:
not_none_video_df = channels_dataset_df[channels_dataset_df.notnull()]
active_count = int(not_none_video_df["channel_id"].count())
active_count

35938268

: 

In [None]:
none_video_df = channels_dataset_df[channels_dataset_df.isnull()]
inactive_count = int(none_video_df["channel_id"].count())
inactive_count

In [None]:
labels = ['Active', 'Inactive']
sizes = [active_count, inactive_count]
colors = ['green','red']
plt.figure(figsize=(5, 5))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.axis('equal')
plt.title('Active vs Inactive Channels', fontsize=14, fontweight='bold', pad=20)
plt.legend(title="isActive", loc="upper left")
plt.show()


## Numerical Stats

### Video Length Stats

In [None]:
video_length_stats = not_none_video_df['video_length'].describe()
print(video_length_stats)


In [None]:
not_none_video_df['log_video_length'] = not_none_video_df['video_length'].apply(lambda x: np.log1p(x))
plt.figure(figsize=(10,6))
sns.histplot(not_none_video_df['log_video_length'], bins=30, kde=True, color='blue')
log_mean = not_none_video_df['log_video_length'].mean()
log_median = not_none_video_df['log_video_length'].median()
plt.axvline(log_mean, color='red', linestyle='--', linewidth=2, label=f'Log Mean: {log_mean:.2f}')
plt.axvline(log_median, color='green', linestyle='--', linewidth=2, label=f'Log Median: {log_median:.2f}')
plt.title('Log-Scaled Distribution of Video Length', fontsize=16)
plt.xlabel('Log(Number of Videos)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.legend()
plt.show()


### View Count Stats

In [None]:
video_views_count_stats = not_none_video_df['video_views_count'].describe()
print(video_views_count_stats)

In [None]:
not_none_video_df['log_video_views_count'] = not_none_video_df['video_views_count'].apply(lambda x: np.log1p(x))
plt.figure(figsize=(10,6))
sns.histplot(not_none_video_df['log_video_views_count'], bins=30, kde=True, color='blue')
log_mean = not_none_video_df['log_video_views_count'].mean()
log_median = not_none_video_df['log_video_views_count'].median()
plt.axvline(log_mean, color='red', linestyle='--', linewidth=2, label=f'Log Mean: {log_mean:.2f}')
plt.axvline(log_median, color='green', linestyle='--', linewidth=2, label=f'Log Median: {log_median:.2f}')
plt.title('Log-Scaled Distribution of Video Views Counts', fontsize=16)
plt.xlabel('Log(Number of Videos)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.legend()
plt.show()
