In [None]:
# Request the "snippet" part of one hard-coded video id from the YouTube Data API.
video_response = (
    youtube.videos()
    .list(part="snippet", id="tQxpdY8sWHM")
    .execute()
)

In [None]:
# Gets video comments using the youtube api
def get_comments(video_ids: list, n_comments: int, n_buffer_comments: int = 0, lang_code: str = 'en'):
    """Fetch top-level comments in a given language for each video.

    Comments are requested in "relevance" order and paged through with
    ``pageToken`` because a single ``commentThreads.list`` request accepts a
    ``maxResults`` of at most 100.

    Args:
        video_ids: Video ids to fetch comments for.
        n_comments: Maximum number of comments to keep per video.
        n_buffer_comments: Extra comments to fetch per video so that comments
            dropped by the language filter can be replaced.
        lang_code: Language code (as returned by ``ld.detect``) a comment must
            have to be kept.

    Returns:
        dict mapping each successfully processed video id to its list of kept
        comment texts (at most ``n_comments`` each). A video whose request
        raises ``HttpError`` is reported with ``print`` and left out.
    """
    # commentThreads.list documents maxResults as 1..100; larger budgets need paging.
    max_page_size = 100
    fetch_budget = n_comments + n_buffer_comments

    video_comments = {}

    for video_id in video_ids:
        try:
            comments = []
            n_fetched = 0
            page_token = None

            # Stop when the fetch budget is used up or enough comments were kept.
            while n_fetched < fetch_budget and len(comments) < n_comments:
                request_args = {
                    "part": "snippet",
                    "videoId": video_id,
                    "maxResults": min(max_page_size, fetch_budget - n_fetched),
                    "order": "relevance",
                }
                if page_token:
                    request_args["pageToken"] = page_token

                comments_response = youtube.commentThreads().list(**request_args).execute() # Fetch comments
                items = comments_response.get("items", [])
                n_fetched += len(items)

                for item in items:
                    if len(comments) >= n_comments:
                        break
                    comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
                    try:
                        if ld.detect(comment) == lang_code:
                            comments.append(comment)
                    except ld.LangDetectException:
                        # Language detection failed for this text; skip it.
                        continue

                # No token (or an empty page) means there is nothing more to fetch.
                page_token = comments_response.get("nextPageToken")
                if not page_token or not items:
                    break

            video_comments[video_id] = comments

        except googleapiclient.errors.HttpError as e:
            print(f"Error processing video {video_id}: {e}")

    return video_comments

In [None]:
def run_sentiment_analyzer(analyzer: str, comments: list, relevant_words:list, sentiment_pipeline = None):

    labeled_comments = collections.defaultdict(list)
    
    for comment in comments:
        if is_relevant_comment(comment, relevant_words):
            match analyzer:
                case 'text_blob':
                    #print("Comment outside text_blob function:    ", comment)
                    polarity = TextBlob(comment).sentiment.polarity
                    polarity = POSITIVE if polarity > 0 else NEGATIVE if polarity < 0 else NEUTRAL
                case 'vader':
                    raise Exception("Vader analyzer not implemented yet")
                case 'bert':
                    #print("Comment outside bert function:         ", comment)
                    polarity = bert(comment, sentiment_pipeline, neutral_threshold=0.55)
                case _:
                    raise Exception("Invalid or no analyzer provided")
        else:
            polarity = NEUTRAL

        labeled_comments[polarity].append({
            'text': comment,
            'polarity': polarity
        })

    # Limit comments to top 30 per polarity
    for polarity in labeled_comments:
        labeled_comments[polarity] = labeled_comments[polarity][:30]

    # Flatten the dictionaries into lists
    labeled_comments_flat = [comment for comments in labeled_comments.values() for comment in comments]

    # print(labeled_comments_flat)

    return labeled_comments_flat

In [None]:
# Try the BERT analyzer on the cached comments of one video.
# NOTE(review): the video id is hard-coded; the lookup presumably fails if that id
# is not present in video_comments.json — confirm against read_video_comments.
video_comments = read_video_comments("video_comments.json")["ro130m-f_yk"]
# Keywords passed to the analyzer as its relevant_words argument.
relevant_words = ["ai", "controversy", "jobs", "cars", "music", "art", "programming", "languages"]
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
labeled_comments = run_sentiment_analyzer("bert", video_comments, relevant_words, sentiment_pipeline)
df = pd.DataFrame(labeled_comments)
# Cell output: number of labelled comments per polarity value.
df["polarity"].value_counts()

In [None]:
# Sanity check: reload the cached comments and print, for every key in the
# mapping, the key and the number of comments stored under it.
video_comments = read_video_comments("video_comments.json")
for video_id in video_comments:
    print(video_id, len(video_comments[video_id]))

In [None]:
def run_sentiment_analyzer(analyzer: str, comments: list, relevant_words:list, sentiment_pipeline = None):

    labeled_comments = []
    
    for comment in comments:
        match analyzer:
            case 'text_blob':
                #print("Comment outside text_blob function:    ", comment)
                polarity = TextBlob(comment).sentiment.polarity
                polarity = POSITIVE if polarity > 0 else NEGATIVE if polarity < 0 else NEUTRAL
            case 'vader':
                raise Exception("Vader analyzer not implemented yet")
            case 'bert':
                #print("Comment outside bert function:         ", comment)
                polarity = bert(comment, sentiment_pipeline, neutral_threshold=0.55)
            case _:
                raise Exception("Invalid or no analyzer provided")

        labeled_comments.append({
            'text': comment,
            'polarity': polarity
        })

    return labeled_comments

In [None]:
# Search queries used to find videos, and the keywords handed to the analyzers
topics = [
    "ai controversy",
    "ai jobs controversy",
    "ai cars controversy",
    "ai music controversy",
    "ai art controversy",
    "ai programming controversy",
]
relevant_words = [
    "ai",
    "controversy",
    "jobs",
    "cars",
    "music",
    "art",
    "programming",
    "languages",
]

# Video ids: reuse the cached file when present, otherwise search and cache
if os.path.exists("video_ids.json"):
    print("Reading video ids from file")
    video_ids = read_video_ids("video_ids.json")
else:
    print("Searching for videos using the API")
    video_ids = get_video_ids(topics, 5, 5)
    write_video_ids(video_ids, "video_ids.json")

# Flatten the cached mapping's lists of ids into one flat list
video_ids = [
    video_id
    for id_group in read_video_ids("video_ids.json").values()
    for video_id in id_group
]

# Comments: reuse the cached file when present, otherwise fetch and cache
if os.path.exists("video_comments.json"):
    print("Reading comments from file")
    video_comments = read_video_comments("video_comments.json")
else:
    print("Retrieving comments using the API")
    video_comments = get_comments(video_ids, 250)
    write_video_comments(video_comments, "video_comments.json")

all_comments_TB, all_comments_BERT = [], []

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Label each video's comments with both analyzers
for comment_list in video_comments.values():
    all_comments_TB += run_sentiment_analyzer("text_blob", comment_list, relevant_words)
    all_comments_BERT += run_sentiment_analyzer("bert", comment_list, relevant_words, sentiment_pipeline)

df_TB = pd.DataFrame(all_comments_TB)
df_BERT = pd.DataFrame(all_comments_BERT)

print(df_TB.head())
print(df_BERT.head())

# Persist one CSV per analyzer
df_TB.to_csv('TB_comments.csv', index=False)
df_BERT.to_csv('BERT_comments.csv', index=False)

Devesh's Old Code

In [None]:
# Upper bound on collected comments: 30 per polarity * 3 polarities * 6 topics * 5
# (presumably videos per topic — confirm against get_video_ids) = 2700
#Total Comments = 30*3*6*5 = 2700

# List of words to include
# A comment containing none of these as a whole word is labelled neutral below.
list_of_words = ["ai", "controversy", "jobs", "cars", "music", "art", "programming", "languages"]
# Search topics passed to get_video_ids in main().
topics = ["ai controversy", "ai jobs controversy", "ai cars controversy", "ai music controversy", "ai art controversy", "ai programming controversy"]

# Module-level pipeline read by fetch_video_details_and_comments when scoring comments.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def get_sentiment(comment, sentiment_pipeline, neutral_threshold=0.55): # Default threshold is 0.55
    """Map a sentiment-pipeline prediction for ``comment`` to -1, 0 or 1.

    Only the first 512 characters of ``comment`` are passed to the pipeline.

    Args:
        comment: Text to classify.
        sentiment_pipeline: Callable returning a sequence whose first element
            is a dict with ``'label'`` and ``'score'`` keys.
        neutral_threshold: Predictions scoring below this are treated as neutral.

    Returns:
        0 if the score is below ``neutral_threshold``; otherwise 1 when the
        label is ``'POSITIVE'`` and -1 for any other label.
    """
    prediction = sentiment_pipeline(comment[:512])[0]
    predicted_label = prediction['label']
    confidence = prediction['score']

    # Low-confidence predictions count as neutral regardless of the label.
    if confidence < neutral_threshold:
        return 0
    return 1 if predicted_label == 'POSITIVE' else -1

def fetch_video_details_and_comments(video_id): # Fetch video details and comments for a given video ID.
    """Fetch a video's top-level comments and label them with TextBlob and BERT.

    Up to 250 comments are requested, in pages of at most 100 because
    ``commentThreads.list`` accepts a ``maxResults`` of 1..100. Only comments
    detected as English are kept. A comment containing one of
    ``list_of_words`` as a whole word (case-insensitive) is scored by both
    analyzers; any other English comment is labelled neutral (0) for both.

    Args:
        video_id: Id of the video whose comments are fetched.

    Returns:
        ``(textblob_rows, bert_rows)``: two flat lists of
        ``{'text': ..., 'polarity': ...}`` dicts with polarity in {-1, 0, 1},
        each holding at most 30 comments per polarity. If a request raises
        ``HttpError`` it is reported with ``print`` and whatever was collected
        before the failure is returned.
    """
    max_total_comments = 250
    max_page_size = 100      # documented upper bound for maxResults
    max_per_polarity = 30

    video_data_TB = collections.defaultdict(list)
    video_data_bert = collections.defaultdict(list)

    # Compile the whole-word keyword patterns once instead of once per comment.
    keyword_patterns = [
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in list_of_words
    ]

    try:
        n_fetched = 0
        page_token = None

        while n_fetched < max_total_comments:
            request_args = {
                "part": "snippet",
                "videoId": video_id,
                "maxResults": min(max_page_size, max_total_comments - n_fetched),
            }
            if page_token:
                request_args["pageToken"] = page_token

            comments_response = youtube.commentThreads().list(**request_args).execute() # Fetch comments
            items = comments_response.get('items', [])
            n_fetched += len(items)

            # Fetch comments and their polarities.
            for item in items:
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']

                try:
                    # Check if the comment is in English
                    if ld.detect(comment) != 'en':
                        continue
                except ld.LangDetectException:
                    # Language detection failed for this text; skip it.
                    continue

                if any(pattern.search(comment) for pattern in keyword_patterns):
                    # On-topic comment: score it with both analyzers.
                    polarity_TB = TextBlob(comment).sentiment.polarity
                    polarity_TB = 1 if polarity_TB > 0 else (-1 if polarity_TB < 0 else 0)
                    polarity_bert = get_sentiment(comment, sentiment_pipeline)
                else:
                    # Off-topic comment: neutral for both analyzers.
                    polarity_TB = 0
                    polarity_bert = 0

                video_data_TB[polarity_TB].append({
                    'text': comment,
                    'polarity': polarity_TB
                })
                video_data_bert[polarity_bert].append({
                    'text': comment,
                    'polarity': polarity_bert
                })

            # No token (or an empty page) means there is nothing more to fetch.
            page_token = comments_response.get('nextPageToken')
            if not page_token or not items:
                break

    except googleapiclient.errors.HttpError as e:
        print(f"Error processing video {video_id}: {e}")

    # Limit each polarity to its first 30 comments and flatten the groups into lists.
    video_data_TB_flat = [
        entry
        for entries in video_data_TB.values()
        for entry in entries[:max_per_polarity]
    ]
    video_data_bert_flat = [
        entry
        for entries in video_data_bert.values()
        for entry in entries[:max_per_polarity]
    ]

    return video_data_TB_flat, video_data_bert_flat

def main():
    """Run the legacy pipeline: load or fetch video ids, label comments, write CSVs.

    The cached ``video_ids.json`` is reused only when it holds exactly 6
    entries; otherwise the ids are searched again and the file is rewritten.
    Every video is then processed with ``fetch_video_details_and_comments``
    and the combined rows are written to ``TB_comments.csv`` and
    ``BERT_comments.csv``.
    """
    cache_is_complete = len(read_video_ids("video_ids.json")) == 6

    if cache_is_complete: #If we already have saved the videos, we can just read them.
        video_ids = read_video_ids("video_ids.json")
    else: #Otherwise, we need to fetch the videos and save them.
        video_ids = get_video_ids(topics, 5, 5)
        write_video_ids(video_ids, "video_ids.json")

    # Show which ids belong to which key, whichever branch produced them.
    for topic_name, topic_video_ids in video_ids.items():
        print(topic_name, ":", topic_video_ids)

    all_data_TB, all_data_bert = [], []
    for topic_video_ids in video_ids.values():
        for video_id in topic_video_ids:
            rows_TB, rows_bert = fetch_video_details_and_comments(video_id)
            all_data_TB += rows_TB # Add video data to the list.
            all_data_bert += rows_bert # Add video data to the list.

    df_TB = pd.DataFrame(all_data_TB)
    df_bert = pd.DataFrame(all_data_bert)

    print(df_TB.head())
    print(df_bert.head())
    df_TB.to_csv('TB_comments.csv', index=False)
    df_bert.to_csv('BERT_comments.csv', index=False)

# Run the legacy pipeline when this cell executes.
main()
        

In [None]:
def evaluate_model(model : Pipeline, X_train : list, X_test : list, y_train : list, y_test : list):
    """Run ``model`` through ``run_model`` and report its classification quality.

    Prints the train and test accuracy, then calls ``display_cf_report``,
    ``plot_confusion_matrix`` and ``list_TP_FP`` on the test predictions.

    Args:
        model: Pipeline with a ``'classifier'`` entry in ``named_steps``.
        X_train, X_test: Train and test inputs, forwarded to ``run_model``.
        y_train, y_test: Train and test labels, expected to use -1, 0 and 1.

    Returns:
        The accuracy on the test split.
    """
    label_values = [-1, 0, 1]
    label_names = ["Negative", "Neutral ", "Positive"]

    # model_name and y_proba are only needed by the disabled ROC plot below.
    classifier = model.named_steps['classifier']
    model_name = classifier.__class__.__name__
    train_predictions, test_predictions, y_proba = run_model(model, X_train, X_test, y_train, y_test)

    train_score = accuracy_score(y_train, train_predictions)
    test_score = accuracy_score(y_test, test_predictions)

    # One indicator column per class for the per-class TP/FP listing
    y_test_bin = label_binarize(y_test, classes = label_values)
    y_pred_bin = label_binarize(test_predictions, classes = label_values)

    # Report both accuracies
    for caption, score in (("Train score:", train_score), ("Test Score:", test_score)):
        print(caption, score)

    # Detailed evaluation output
    display_cf_report(y_test, test_predictions)
    plot_confusion_matrix(y_test, test_predictions, label_names)
    list_TP_FP(y_test_bin, y_pred_bin, label_names)
    # plot_roc_curve(y_test_bin, y_proba, class_names, f"ROC for {model_name}", f"Macro-Average ROC for {model_name}")

    return test_score

In [None]:
def evaluate_classifiers(models, X_train, y_train, X_test, y_test):
    """Fit each model, print its accuracies and plot a comparison.

    Args:
        models: Iterable of pipelines exposing ``fit``, ``predict`` and a
            ``'classifier'`` entry in ``named_steps``.
        X_train, y_train: Training inputs and labels.
        X_test, y_test: Test inputs and labels.

    Returns:
        None. The per-model accuracy table (in percent, one row per model) is
        passed to ``compare_metrics_graph``.
    """
    log_cols=["Classifier", "Train Accuracy", "Test Accuracy"]
    # Collect plain rows and build the frame once: concatenating onto an empty
    # frame inside the loop is quadratic and leaves a duplicated index.
    log_rows = []
    for model in models:
        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        score_train = accuracy_score(y_train, y_pred_train)
        print("Accuracy of {} with train dataset is: {:0.3f}".format(model, score_train))

        y_pred_test = model.predict(X_test)
        score_test = accuracy_score(y_test, y_pred_test)
        print("Accuracy of {} with test dataset is: {:0.3f}".format(model, score_test))

        log_rows.append([model.named_steps['classifier'].__class__.__name__, score_train*100, score_test*100])

    log = pd.DataFrame(log_rows, columns=log_cols)

    compare_metrics_graph(log)