# YouTube Comments and Video Stats Analysis

## I. Import Libraries

In [1]:
import re
import string
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from collections import Counter
from textblob import TextBlob

# Natural Language Toolkit
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Language Detector
try:
    import langid
except ModuleNotFoundError:
    !pip install langid
    import langid

try:
    from langdetect import detect, LangDetectException
except ModuleNotFoundError:
    !pip install langdetect
    from langdetect import detect, LangDetectException
        


### Download NLTK data files 

In [2]:
# Fetch the NLTK resources for the tokenizer, stop-word corpus and WordNet
# lemmatizer imported in the first cell. Each call prints a status line and
# the value of the last call is echoed as the cell output.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/repl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/repl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Set up visualization aesthetics

In [3]:
# Use seaborn's "whitegrid" style for the plots drawn later in the notebook
sns.set(style="whitegrid")

## II. Load the Datasets

- Save the datasets in variables

In [4]:
# CSV files: (comments, video stats)
comments_csv_url, video_stats_csv_url = (
    'https://raw.githubusercontent.com/cogxen/databank/main/youtube-comments-video-stats/comments.csv',
    'https://raw.githubusercontent.com/cogxen/databank/main/youtube-comments-video-stats/video-stats.csv',
)

# XLSX files: (comments, video stats)
comments_xlsx_url, video_stats_xlsx_url = (
    'https://raw.githubusercontent.com/cogxen/databank/main/youtube-comments-video-stats/comments-eng.xlsx',
    'https://raw.githubusercontent.com/cogxen/databank/main/youtube-comments-video-stats/video-stats-eng.xlsx',
)

### (`.csv`) Comments & Video Stats DataFrame 

- Comments Data

In [5]:
# Read the comments CSV directly from its URL and preview the first rows
comments_csv = pd.read_csv(comments_csv_url)
comments_csv.head()

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95,1
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19,0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161,2
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8,0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34,2


- Video Stats Data

In [6]:
# Read the video stats CSV directly from its URL and preview the first rows
video_stats_csv = pd.read_csv(video_stats_csv_url)
video_stats_csv.head()

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,23/08/2022,tech,3407.0,672.0,135612.0
1,The most EXPENSIVE thing I own.,b3x28s61q3c,24/08/2022,tech,76779.0,4306.0,1758063.0
2,My New House Gaming Setup is SICK!,4mgePWWCAmA,23/08/2022,tech,63825.0,3338.0,1564007.0
3,Petrol Vs Liquid Nitrogen | Freezing Experimen...,kXiYSI7H2b0,23/08/2022,tech,71566.0,1426.0,922918.0
4,Best Back to School Tech 2022!,ErMwWXQxHp0,08/08/2022,tech,96513.0,5155.0,1855644.0


### (`.xlsx`) Comments & Video Stats

- Comments Data

In [7]:
# Read the comments Excel workbook directly from its URL and preview the first rows
comments_xlsx = pd.read_excel(comments_xlsx_url)
comments_xlsx.head()

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95,1
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19,0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161,2
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8,0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34,2


- Video Stats Data

In [8]:
# Read the video stats Excel workbook directly from its URL and preview the first rows
video_stats_xlsx = pd.read_excel(video_stats_xlsx_url)
video_stats_xlsx.head()

Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,2022-08-23,tech,3407,672,135612
1,The most EXPENSIVE thing I own.,b3x28s61q3c,2022-08-24,tech,76779,4306,1758063
2,My New House Gaming Setup is SICK!,4mgePWWCAmA,2022-08-23,tech,63825,3338,1564007
3,Best Back to School Tech 2022!,ErMwWXQxHp0,2022-08-08,tech,96513,5155,1855644
4,Brewmaster Answers Beer Questions From Twitter...,18fwz9Itbvo,2021-11-05,tech,33570,1643,943119


- Define file paths and options
- Load the datasets
    - Rename the columns
- Check data integrity  

In [9]:
# Define file paths and loader options based on the selected source:
#   'org'          -> the CSV files
#   anything else  -> the "-eng" Excel files
data_source = 'eff'

if data_source == 'org':
    print('Loading original source...')
    comments_file = comments_csv_url
    video_stats_file = video_stats_csv_url
    load_function = pd.read_csv
    # 'Published At' is a day-first date column of the video stats file only.
    # These options must NOT be passed to the comments load: read_csv raises
    # when a parse_dates column is missing from the file.
    load_options = {'parse_dates': ['Published At'], 'dayfirst': True}
    comments_load_options = {}
    # NOTE(review): no renaming happens for this source, so the comments frame
    # keeps the CSV's own column names — confirm downstream cells handle that.
    comments_columns_mapping = []
else:
    print('Loading efficient source...')
    comments_file = comments_xlsx_url
    video_stats_file = video_stats_xlsx_url
    load_function = pd.read_excel
    load_options = {}
    comments_load_options = {}
    comments_columns_mapping = ['Video ID', 'Comment', 'Comment_Likes', 'Comment_Sentiment']

# Load the datasets, each with the options that match its own columns
video_stats = load_function(video_stats_file, **load_options)
comments = load_function(comments_file, **comments_load_options)

# Rename the comments columns (positional assignment, Excel source only)
if data_source != 'org':
    comments.columns = comments_columns_mapping

# Engagement metric columns of the video stats frame
metrics = ['Views', 'Likes', 'Comments']

# Check data integrity: count rows that repeat an earlier row on every
# descriptive/metric column ('Video ID' is intentionally left out of the key)
initial_shape = video_stats.shape
print(f'Initial shape of video stats: {initial_shape}')
duplicate_key_columns = ['Title', 'Published At', 'Keyword', 'Likes', 'Comments', 'Views']
duplicate_count = int(video_stats.duplicated(subset=duplicate_key_columns).sum())
print(f'Number of duplicate rows: {duplicate_count}')

Loading efficient source...
Initial shape of video stats: (1577, 7)
Number of duplicate rows: 0


## III. Helper Functions

### Text Preprocessing Method

- (`preprocess_text`), a fundamental step in natural language processing, involves transforming raw text data into a structured format suitable for analysis by converting text to lowercase, removing punctuation and stop words, tokenizing, and lemmatizing, ultimately rejoining the tokens into a single string. This process is crucial for enabling effective natural language processing tasks.

In [10]:
# Resources shared by every preprocess_text call. The NLTK ones are built on
# first use so that defining the function does not require the corpora yet.
_ENGLISH_STOPWORDS = None
_LEMMATIZER = None
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(raw_text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps: convert to lowercase, strip the characters in ``string.punctuation``,
    tokenize with ``word_tokenize``, drop English stop words, lemmatize each
    remaining token with ``WordNetLemmatizer`` and rejoin with single spaces.

    Parameters:
        raw_text: The text to preprocess. Non-string values are converted
            with ``str()`` first.

    Returns:
        str: The preprocessed text (empty string when no token survives).
    """
    global _ENGLISH_STOPWORDS, _LEMMATIZER

    # Build the stop-word set and lemmatizer once instead of on every call;
    # this function is meant to be applied row by row over whole columns.
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Ensure the input is a string
    if not isinstance(raw_text, str):
        raw_text = str(raw_text)

    # Lowercase, then remove punctuation characters
    text_no_punct = raw_text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize, drop stop words / punctuation tokens, and lemmatize
    lemmatized_tokens = [
        _LEMMATIZER.lemmatize(word)
        for word in word_tokenize(text_no_punct)
        if word not in _ENGLISH_STOPWORDS and word not in string.punctuation
    ]

    # Rejoin the tokens into a single string
    return ' '.join(lemmatized_tokens)

### Language Determination Method

- (`determine_language`), it employs a hybrid approach, leveraging both `langdetect` and `langid` libraries to identify the dominant language within a given text. It returns 'en' if either library confidently detects English; otherwise, it classifies the language as 'unknown'. This strategy is particularly valuable for applications that handle multilingual content and necessitate a robust language identification mechanism with a fallback option.

In [11]:
# Language lookup backed by langdetect
def determine_language_langdetect(text):
    """Return langdetect's result for ``text``, or "unknown" if detection fails.

    Only ``LangDetectException`` is handled; any other error propagates.
    """
    try:
        detected_language = detect(text)
    except LangDetectException:
        detected_language = "unknown"
    return detected_language

# Language lookup backed by langid
def determine_language_langid(text):
    """Return the language label that ``langid.classify`` assigns to ``text``.

    ``classify`` returns a pair; only its first element is used here.
    """
    classification = langid.classify(text)
    return classification[0]

# Combined lookup: English if either detector says so
def determine_language(text):
    """Classify ``text`` as 'en' or 'unknown' using both detectors.

    Both ``determine_language_langdetect`` and ``determine_language_langid``
    are always consulted. The result is 'en' when at least one of them
    reports 'en'; every other outcome is reported as 'unknown'.
    """
    detector_results = (
        determine_language_langdetect(text),
        determine_language_langid(text),
    )
    return 'en' if 'en' in detector_results else 'unknown'

### Sentiment Classification Method

- (`classify_sentiment`), it maps polarity scores to their respective sentiment categories: 'Negative' for scores below -0.2, 'Neutral' for scores from -0.2 up to (but not including) 0.2, and 'Positive' otherwise. This provides a valuable tool for applications requiring text categorization based on sentiment analysis.

In [12]:
def classify_sentiment(polarity_score):
    """Map a polarity score to a sentiment label.

    Parameters:
        polarity_score (float): The polarity score to classify.

    Returns:
        str: 'Negative' when the score is below -0.2, 'Neutral' when it is
        at least -0.2 and below 0.2, and 'Positive' otherwise.
    """
    if polarity_score < -0.2:
        return 'Negative'
    if polarity_score < 0.2:
        return 'Neutral'
    return 'Positive'


# Backward-compatible alias: the function was originally defined under this
# misspelled name, so existing calls keep working.
classift_sentiment = classify_sentiment

### Sentiment Analysis Method

- (`analyze_sentiment`), it analyzes the sentiment of the input text using the TextBlob library and returns the polarity score, which is useful for understanding the emotional tone of the text.

In [13]:
def analyze_sentiment(input_text):
    """Return the TextBlob polarity score of ``input_text``.

    Non-string input is converted with ``str()`` before analysis.
    """
    text = input_text if isinstance(input_text, str) else str(input_text)
    return TextBlob(text).sentiment.polarity

### Text Length Calculation Method

- (`calculate_text_length`), it calculates the length of the input text, converting it to a string if necessary.

In [14]:
def calculate_text_length(input_text):
    """Return the number of characters in ``input_text``.

    Non-string input is measured by the length of its ``str()`` form.
    """
    text = input_text if isinstance(input_text, str) else str(input_text)
    return len(text)

### Text Type Check Method

- (`is_text_string`), it checks if the input is a string and returns a boolean value indicating the result.

In [15]:
def is_text_string(input_text):
    """Return True when ``input_text`` is a ``str`` instance, otherwise False."""
    is_string = isinstance(input_text, str)
    return is_string

### Engagement Rate Computation Method

- (`compute_engagement_rate`), it computes the engagement rate based on the number of likes, comments, and views, with optional weights for likes and comments. This method is useful for social media analytics and content performance evaluation.

**Parameters**
1. `likes` (`int`): The number of likes.
2. `comments` (`int`): The number of comments.
3. `views` (`int`): The number of views.
4. `like_weight` (`float`, `optional`): The weight assigned to likes. Default is 1.
5. `comment_weight` (`float`, `optional`): The weight assigned to comments. Default is 1.5.

**Returns**
- `float`: The engagement rate as a percentage.

In [16]:
def compute_engagement_rate(likes, comments, views, like_weight=1, comment_weight=1.5):
    """Return the weighted engagement rate as a percentage of views.

    The rate is ``(like_weight * likes + comment_weight * comments) / views * 100``.
    When ``views`` equals 0 the function returns 0 instead of dividing.
    """
    if views != 0:
        weighted_interactions = like_weight * likes + comment_weight * comments
        return (weighted_interactions / views) * 100
    return 0

### Sort Order Generation Method

- (`generate_sort_order`), it generates a sort order based on the median of a specified metric for groups in the dataframe, which is useful for ranking groups by their performance on various metrics.

**Parameters**
1. `dataframe` (`pd.DataFrame`): The dataframe containing the data.
2. `group_column` (`str`): The column to group by.
3. `metric_column` (`str`): The column whose per-group median determines the order.

**Returns**
- `list`: A list of group names sorted by their median value of `metric_column` in descending order.

In [17]:
def generate_sort_order(dataframe, group_column, metric_column):
    """Return the group labels ordered by descending median of ``metric_column``.

    Rows are grouped by ``group_column``; the median of ``metric_column`` is
    taken per group and the group labels are returned as a list, largest
    median first.
    """
    return (
        dataframe.groupby(group_column)[metric_column]
        .median()
        .sort_values(ascending=False)
        .index
        .tolist()
    )

### Median Metric by Group Plotting Method

- (`plot_median_metric_by_group`), it plots the median of a specified metric by group, which is useful for visualizing the performance of groups on various metrics.

**Parameters**
1. `dataframe` (`pd.DataFrame`): The dataframe containing the data.
2. `group_column` (`str`): The column to group by.
3. `metric_column` (`str`): The column representing the metric to plot.

In [18]:
def plot_median_metric_by_group(dataframe, group_column, metric_column):
    """Draw a horizontal bar plot of the per-group median of a metric.

    Parameters:
        dataframe (pd.DataFrame): The dataframe containing the data.
        group_column (str): The column to group by (one bar per group).
        metric_column (str): The column whose median is plotted.

    Returns:
        None. The bars are drawn with ``sns.barplot``, groups on the y-axis
        and the median (labelled 'Value') on the x-axis, largest first.
    """
    # Median of the metric per group, largest first. The original version
    # stored this under one name and read it back under another (NameError),
    # then melted and re-filtered a frame that only ever held this metric.
    grouped_medians = (
        dataframe.groupby(group_column)[metric_column]
        .median()
        .sort_values(ascending=False)
        .rename('Value')
        .reset_index()
    )

    # Plot the data; row order (descending median) sets the bar order
    sns.barplot(data=grouped_medians, y=group_column, x='Value')

### Outlier Removal by Category Method

- (`remove_outliers_by_category`), it removes outliers from the DataFrame based on the category of the item using the Interquartile Range (IQR) method, which is useful for cleaning data before analysis.

**Parameters**
1. `dataframe` (`pd.DataFrame`): The input DataFrame.
2. `category_column` (`str`): The name of the category column.
3. `value_column` (`str`): The name of the value column.

**Returns**
- `pd.DataFrame`: The DataFrame with outliers removed.

In [19]:
def remove_outliers_by_category(dataframe, category_column, value_column):
    """Drop rows whose value is an IQR outlier within its own category.

    For every category, the first and third quartiles (q1, q3) of
    ``value_column`` are computed and only rows with a value inside
    ``[q1 - 1.5 * IQR, q3 + 1.5 * IQR]`` (bounds included) are kept.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame (not modified).
        category_column (str): The name of the category column.
        value_column (str): The name of the value column.

    Returns:
        pd.DataFrame: The rows that are not outliers, with all original
        columns, ordered by category and with a fresh 0..n-1 index.
    """
    # Per-row quartiles of the row's own category. The original body referred
    # to undefined names (`data`, `category_col`) instead of the parameters.
    values_by_category = dataframe.groupby(category_column)[value_column]
    q1 = values_by_category.transform(lambda values: values.quantile(0.25))
    q3 = values_by_category.transform(lambda values: values.quantile(0.75))
    iqr = q3 - q1

    within_bounds = dataframe[value_column].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    # Stable sort by category reproduces the group-by-group row order that a
    # groupby/apply would give, while keeping the category column in place.
    return (
        dataframe.loc[within_bounds.to_numpy()]
        .sort_values(category_column, kind='stable')
        .reset_index(drop=True)
    )

### Token Filtering & Most Common Tokens by Category Extraction Method

- (`filter_non_alphanumeric_tokens`), it filters out tokens that are purely numeric or single characters, which is useful for cleaning tokenized text data.

In [20]:
def filter_non_alphanumeric_tokens(tokens):
    """Return the tokens as strings, skipping numeric and one-character ones.

    Each token is converted with ``str()``; it is dropped when it consists
    only of digits or is a single character, and kept otherwise.
    """
    kept_tokens = []
    for token in tokens:
        token_text = str(token)
        if re.match(r'^\d+$|^.{1}$', token_text) is None:
            kept_tokens.append(token_text)
    return kept_tokens

- (`get_most_common_tokens_by_category`), it extracts the most common tokens for each category in the dataframe, which is useful for analyzing the most frequent terms in different categories.

**Parameters** (`get_most_common_tokens_by_category`)
1. `dataframe` (`pd.DataFrame`): The input dataframe.
2. `category_column` (`str`): The name of the column containing category information.
3. `tokens_column` (`str`): The name of the column containing tokens.
4. `top_n` (`int`, `optional`): The number of most common tokens to return. Default is 3.

**Returns**
- `dict`: A dictionary where keys are categories and values are lists of the most common tokens.

In [21]:
def get_most_common_tokens_by_category(dataframe, category_column, tokens_column, top_n=3):
    """Collect the ``top_n`` most frequent tokens for every category.

    The dataframe is grouped by ``category_column``. Within each group, every
    entry of ``tokens_column`` is passed through
    ``filter_non_alphanumeric_tokens`` and the surviving tokens are counted.

    Returns:
        dict: Maps each category to the ``Counter.most_common(top_n)`` result,
        i.e. a list of ``(token, count)`` pairs.
    """
    common_tokens = {}

    for category, group in dataframe.groupby(category_column):
        # Tally the filtered tokens of every row that belongs to this category
        token_counts = Counter()
        for tokens in group[tokens_column]:
            token_counts.update(filter_non_alphanumeric_tokens(tokens))

        common_tokens[category] = token_counts.most_common(top_n)

    return common_tokens

### Keyword Extraction Method

-  (`find_keywords`), it extracts keywords from a given text that are present in a provided list of keywords, which is useful for categorizing or tagging text based on their content. 

**Parameters**
1. `text` (`str`): The text to search for keywords.
2. `keyword_list` (`list`): A list of keywords to search for in the text.

**Returns**
- `list`: A list of found keywords or ['other'] if no keywords are found.

In [22]:
def find_keywords(text, keyword_list):
    """Return the keywords from ``keyword_list`` that occur in ``text``.

    Matching uses the ``in`` operator, so for a string ``text`` it is a
    case-sensitive substring test. Keywords are returned in the order they
    appear in ``keyword_list``.

    Parameters:
        text (str): The text to search for keywords.
        keyword_list (list): A list of keywords to search for in the text.

    Returns:
        list: The found keywords, or ['other'] if no keyword is found.
    """
    # The original comprehension read `keyword_lit in keyword in text`: an
    # undefined name and a chained comparison instead of a filter clause.
    found_keywords = [keyword for keyword in keyword_list if keyword in text]
    return found_keywords if found_keywords else ['other']

## IV. (EDA) Data Preprocessing & Cleaning

### Engagement Metrics Cleaning

- Replace all negative values in the engagement metrics with `NaN` and fill missing values with the median of each metric, which is useful for ensuring data quality and preparing the data for analysis.

#### Video Stats Data

In [23]:
metrics = ['Views', 'Likes', 'Comments']

for metric in metrics:
    # Treat negative counts as missing values
    non_negative = video_stats[metric].mask(video_stats[metric] < 0)

    # Fill missing values with the median of the remaining values. The result
    # is assigned back to the column: calling fillna(..., inplace=True) on
    # video_stats[metric] is chained assignment, which pandas warns about and
    # which does not update the frame when copy-on-write is enabled.
    video_stats[metric] = non_negative.fillna(non_negative.median())

# Check the dataframe to confirm no rows were added or removed by the cleaning
print(f'Cleaned DataFrame shape: {video_stats.shape}')

Cleaned DataFrame shape: (1577, 7)


#### Comments Data

In [24]:
# Count the missing values in each column of the comments frame
comments.isna().sum()

Video ID             224
Comment                1
Comment_Likes          0
Comment_Sentiment      0
dtype: int64

### Data Filtering & Language Detection

- It filters out videos with no views, cleans the title, detects the language, and filters out non-English titles, which is useful for ensuring data quality and consistency.

#### Video Stats Data

- Filter out videos with no views

In [25]:
# Keep only the rows where 'Views' is greater than 0. The .copy() makes the
# result an independent frame, so adding or overwriting columns on it later
# does not raise pandas' SettingWithCopyWarning.
video_stats = video_stats[video_stats['Views'] > 0].copy()

# Report how many rows were removed relative to the initially loaded frame
print(f"Removed {initial_shape[0] - video_stats.shape[0]} rows after filtering views")

Removed 2 rows after filtering views


#### Comments Data

- Drop rows with missing values

In [26]:
# Drop every row of the comments DataFrame that has a missing value in any
# column. The result is reassigned rather than using inplace=True, which
# offers no benefit and hides the mutation from the reader.
initial_comments_count = comments.shape[0]
comments = comments.dropna()
dropped_rows_count = initial_comments_count - comments.shape[0]

print(f"Dropped {dropped_rows_count} rows with missing values")

Dropped 225 rows with missing values
