In [None]:
# Import necessary modules
import pandas as pd
import numpy as np
import tensorflow

from sklearn.utils import resample
from sklearn.model_selection import train_test_split

In [None]:
comments = pd.read_csv("/content/drive/MyDrive/Projects/youtube_Comment_Sentiment_Analysis/comments.csv", usecols=lambda column: column != "Unnamed: 0")

In [None]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18409 entries, 0 to 18408
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Video ID   18409 non-null  object 
 1   Comment    18408 non-null  object 
 2   Likes      18409 non-null  float64
 3   Sentiment  18409 non-null  float64
dtypes: float64(2), object(2)
memory usage: 575.4+ KB


In [None]:
comments.head()

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,2.0


In [None]:
comments['Comment'].head()

0    Let's not forget that Apple Pay in 2014 requir...
1    Here in NZ 50% of retailers don’t even have co...
2    I will forever acknowledge this channel with t...
3    Whenever I go to a place that doesn’t take App...
4    Apple Pay is so convenient, secure, and easy t...
Name: Comment, dtype: object

In [None]:
comments['Sentiment'].value_counts()

2.0    11432
1.0     4639
0.0     2338
Name: Sentiment, dtype: int64

## Balance the Dataset (Undersampling)

In [None]:
# Apply downsampling
# Sentiment 0.0 has the fewest rows (see the value counts above), so the other
# two classes are randomly cut down, without replacement, to that many rows.
minority = comments[comments['Sentiment'] == 0.0]
minority_size = len(minority)

class_one_rows = comments[comments['Sentiment'] == 1.0]
downsampled_ones = resample(class_one_rows, replace=False, n_samples=minority_size, random_state=42)

class_two_rows = comments[comments['Sentiment'] == 2.0]
downsampled_twos = resample(class_two_rows, replace=False, n_samples=minority_size, random_state=42)

In [None]:
# Concatenate dataframes
# Stack the three equally sized class frames, then shuffle the rows so the classes
# are interleaved. `random_state` pins the shuffle: without it the row order, and
# with it the train/test split and every reported metric, changes on each run.
df = (pd.concat([minority, downsampled_ones, downsampled_twos])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True))

In [None]:
# Check the final version of the dataset
df['Sentiment'].value_counts()

0.0    2338
2.0    2338
1.0    2338
Name: Sentiment, dtype: int64

## Explore the Dataset

In [None]:
import random

random_index = random.randint(0, len(df)-5)
for i in df[['Comment', 'Sentiment']][random_index:random_index + 5].itertuples():
  _, com, sent = i
  print(f'Comment: {com}')
  print(f'Sentiment: {sent}')

Comment: I just feel the need to say as someone diagnosed with Anorexia Nervosa I appreciate you Mat for bringing light to the dangers and science of them
Sentiment: 2.0
Comment: I'm so happy you guys are doing this! I'm a software developer and I absolutely love the videos you guys do. I'm really looking forward to the rest of this series!
Sentiment: 2.0
Comment: one of my favorite songs my old roommate/bestfriend and I would scream at the top of our lungs, drunk out of our minds.. ill miss you everyday oliver.. RIP
Sentiment: 2.0
Comment: El arco del maletín vacío fue devastador para los chicos.
Sentiment: 1.0
Comment: Hello Instr. Rachel. I thank you for your great and very good information you provide us for job interview that I really needed. I failed last time in job interview and I wanted to attend a course on it. Many thanks once again Instr. Rachel and all the best.
Sentiment: 2.0


In [None]:
# Check if there is any Null data
df.isnull().sum()

Video ID     0
Comment      1
Likes        0
Sentiment    0
dtype: int64

In [None]:
# Drop Null data rows
df.dropna(inplace=True)

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Comment'].to_numpy(), df['Sentiment'].to_numpy(), test_size=0.2, random_state=42)

In [None]:
print(f"""
Shape of X_train: {X_train.shape}
Shape of y_train: {y_train.shape}
Shape of X_test: {X_test.shape}
Shape of y_test: {y_test.shape}
""")


Shape of X_train: (5610,)
Shape of y_train: (5610,)
Shape of X_test: (1403,)
Shape of y_test: (1403,)



In [None]:
X_train[5:10]

array(['My opinion was always that Wyverns are a type/subspecies of dragon. Saying "that\'s not a dragon, that\'s a wyvern" is about as useful as saying "that\'s not a dog, that\'s a border collie!"',
       'I just read that VR is actually beneficial for eyesight. It was an article about some new prescription glasses tech out of Japan that uses the same "science" with how VR headsets seem to improve nearsightedness or prevent at least it from worsening. \n\n.... As if I needed another reason to abandon my entire life for VR lol.',
       'I think in the future the value of having the series x will grow exponentially, when you look at the number of studios Microsoft has been acquiring, and with fewer cross gen titles limiting Devs from using the full power of the next gen consoles.',
       "Back in 2018 when John Oliver did the episode on Corporate Consolidation it inspired me so much that I applied for a Masters of Research program, got into a program, and did my entire research thes

# Converting Text into Numbers

## Text Vectorization (Tokenization)

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import  TextVectorization

In [None]:
# Find the mean number of words per comment; it is used as the output sequence length.
# Only the training split is measured so the test set does not influence the
# preprocessing (the vectorizer is likewise adapted on X_train only).
words_per_comment = [len(comment.split()) for comment in X_train]
sum_of_words = sum(words_per_comment)

# int() truncates the mean towards zero
max_length = int(sum_of_words / len(words_per_comment))

In [None]:
# Create Text Vectorizer
max_vocab_length = 20000

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length)

In [None]:
# Fit the Vectorizer to train sentences
text_vectorizer.adapt(X_train)

In [None]:
sample_sentence = random.choice(X_train)
text_vectorizer([sample_sentence]), sample_sentence

(<tf.Tensor: shape=(1, 35), dtype=int64, numpy=
 array([[    2, 14641,   143,  8719,    55,   425,  6907,    32,     8,
             2,   516,     7,    38,    32,     8,   634,    84,    10,
            13,     5,  3856,    57, 18862,  1628,    12,     8,    60,
            77,  5670,   486,     0,     0,     0,     0,     0]])>,
 'The presenter here delivers his knowledge succinctly. He IS the example of what he is teaching. Thank you for a concise,  no extraneous lesson.  It is has been immensely helpful.')

In [None]:
# Get the unique words in the vocabulary

words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]

print(f"""
Number of words in the vocab: {len(words_in_vocab)}
Top 5 Words: {top_5_words}
Last 5 Words: {bottom_5_words}
""")


Number of words in the vocab: 20000
Top 5 Words: ['', '[UNK]', 'the', 'to', 'and']
Last 5 Words: ['cud', 'cubs', 'cubastic', 'cuarta', 'cuanto']



## Embedding

In [None]:
from tensorflow.keras import layers

# Creating an Embedding layer
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             input_length=max_length)

embedding

<keras.layers.core.embedding.Embedding at 0x7f375e5fb970>

In [None]:
# Try the embedding layer on one sample sentence
sample_embed = embedding(text_vectorizer([sample_sentence]))
sample_embed

<tf.Tensor: shape=(1, 35, 128), dtype=float32, numpy=
array([[[-3.1694114e-02, -3.0249238e-02, -1.7152868e-02, ...,
          4.7093902e-02,  3.6030602e-02, -9.4115734e-05],
        [ 2.0925168e-02, -1.4087401e-02,  3.4703944e-02, ...,
          3.3781875e-02,  1.3396274e-02, -2.3496523e-03],
        [ 2.1773700e-02,  1.4958572e-02, -4.2117834e-03, ...,
         -3.1957999e-03,  3.5582613e-02, -2.5102282e-02],
        ...,
        [ 1.4040340e-02,  9.8085180e-03,  4.2885531e-02, ...,
         -4.0745091e-02, -1.1156619e-02, -1.9194020e-02],
        [ 1.4040340e-02,  9.8085180e-03,  4.2885531e-02, ...,
         -4.0745091e-02, -1.1156619e-02, -1.9194020e-02],
        [ 1.4040340e-02,  9.8085180e-03,  4.2885531e-02, ...,
         -4.0745091e-02, -1.1156619e-02, -1.9194020e-02]]], dtype=float32)>

# Model

## Model 0: Naive Bayes Base Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
model_0 = Pipeline([('tfid', TfidfVectorizer()), # convert text into numbers
                    ('clf', MultinomialNB())]) # Model the text

# Fit the pipeline to training data
model_0.fit(X_train, y_train)

In [None]:
baseline_preds = model_0.predict(X_test)
baseline_preds[:10]

array([0., 2., 1., 0., 0., 0., 0., 2., 2., 2.])

In [None]:
y_test[:10]

array([1., 2., 1., 1., 0., 0., 0., 1., 2., 1.])

In [None]:
# Function to evaluate: Accuracy, precision, recall, f1-score

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates accuracy, precision, recall and f1 score of a classification model.

  Precision, recall and f1 use "weighted" averaging, so the function is not limited
  to binary problems (this notebook calls it with three sentiment classes).

  Args:
    y_true: Ground-truth labels
    y_pred: Predicted labels, in the same order as y_true

  Returns:
    dict: "accuracy" as a percentage (accuracy_score * 100) and
          "precision", "recall", "f1" as returned by precision_recall_fscore_support
          (note the different scale from "accuracy")
  """
  # accuracy, scaled to a percentage
  accuracy_percent = accuracy_score(y_true, y_pred) * 100

  # (precision, recall, f1, support) with "weighted" averaging; support is not used
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')

  return {"accuracy": accuracy_percent,
          "precision": weighted_scores[0],
          "recall": weighted_scores[1],
          "f1": weighted_scores[2]}

In [None]:
# Get baseline results 

baseline_results = calculate_results(y_true=y_test, y_pred=baseline_preds)
baseline_results

{'accuracy': 64.78973627940128,
 'precision': 0.6971595722405731,
 'recall': 0.6478973627940128,
 'f1': 0.6255995420698697}

## Model 1: RNN Model

In [None]:
# Create a RNN Model using functional API
# `shape` has to be a tuple: `(1)` is just the integer 1, whereas `(1,)` is the
# intended one-element shape (one raw string per example), as used by the CNN model.
inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)  # raw strings -> padded integer token ids
x = embedding(x)  # token ids -> 128-dimensional vectors
x = layers.LSTM(64, return_sequences=True)(x)  # keep every timestep so the next LSTM receives a sequence
x = layers.LSTM(64)(x)  # only the final output is passed on
x = layers.Dense(32, activation='relu')(x)
outputs = layers.Dense(3, activation='softmax')(x)  # one probability per sentiment class (0, 1, 2)
model_1 = tensorflow.keras.Model(inputs, outputs, name='Model_1_LSTM')

In [None]:
# Check out model summary
model_1.summary()

Model: "Model_1_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 35)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 35, 128)           2560000   
                                                                 
 lstm (LSTM)                 (None, 35, 64)            49408     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                      

In [None]:
# Compile the model
model_1.compile(loss='sparse_categorical_crossentropy',
                optimizer=tensorflow.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
# Train the model
model_1.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f375ddee580>

In [None]:
# Make predictions with the model
model_1_preds_probs = model_1.predict(X_test)
model_1_preds_probs[:5]



array([[4.4008824e-01, 5.1466697e-01, 4.5244716e-02],
       [6.8991113e-04, 2.3097274e-04, 9.9907917e-01],
       [1.2139478e-04, 9.9967545e-01, 2.0324276e-04],
       [9.4756806e-01, 4.3657593e-02, 8.7743513e-03],
       [1.3915162e-01, 8.3685935e-01, 2.3988990e-02]], dtype=float32)

In [None]:
model_1_preds = np.argmax(model_1_preds_probs, axis=1)
model_1_preds[:5]

array([1, 2, 1, 0, 1])

In [None]:
# Check out model performance
model_1_performance = calculate_results(y_true=y_test, y_pred=model_1_preds)
model_1_performance

{'accuracy': 64.36208125445474,
 'precision': 0.6658681183766086,
 'recall': 0.6436208125445474,
 'f1': 0.6463428026502837}

## Model 2: 1D CNN Model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Create 1D CNN Model
# Give this model its own embedding layer. Reusing the shared `embedding` instance
# would start from the weights already trained inside Model 1 and keep updating
# them, so the two models would not be trained or compared independently.
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     name='embedding_model_2')

inputs = layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)  # raw strings -> padded integer token ids
x = model_2_embedding(x)  # token ids -> 128-dimensional vectors
x = layers.Conv1D(filters=64, kernel_size=5, padding='valid', activation='relu')(x)  # windows of 5 tokens
x = layers.GlobalMaxPool1D()(x)  # strongest response of each filter over the sequence
outputs = layers.Dense(3, activation='softmax')(x)  # one probability per sentiment class (0, 1, 2)
model_2 = tf.keras.Model(inputs, outputs, name='Model_2')

# Compile the model
# Labels are integer class ids rather than one-hot vectors, hence the sparse loss
model_2.compile(loss='sparse_categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
# Train the model
model_2.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f374a697ee0>

In [None]:
# Make predictions
model_2_preds_probs = model_2.predict(X_test)
model_2_preds_probs[:5]



array([[1.4835057e-01, 3.4087211e-01, 5.1077729e-01],
       [6.7568799e-05, 2.6302209e-06, 9.9992973e-01],
       [1.6656165e-07, 9.9999827e-01, 1.5408864e-06],
       [9.9950975e-01, 4.8229645e-04, 7.9533847e-06],
       [2.2307365e-01, 7.7510214e-01, 1.8242629e-03]], dtype=float32)

In [None]:
model_2_preds = np.argmax(model_2_preds_probs, axis=1)
model_2_preds[:5]

array([2, 2, 1, 0, 1])

In [None]:
# Check out model performance
model_2_performance = calculate_results(y_true=y_test, y_pred=model_2_preds)
model_2_performance

{'accuracy': 63.150392017106206,
 'precision': 0.6315952683822929,
 'recall': 0.631503920171062,
 'f1': 0.6312529796022174}

## Model 3: Transfer Learning

In [None]:
import tensorflow_hub as hub
# Check out pre-trained model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

embed_samples = embed([sample_sentence])
embed_samples.shape, sample_sentence

(TensorShape([1, 512]),
 'The presenter here delivers his knowledge succinctly. He IS the example of what he is teaching. Thank you for a concise,  no extraneous lesson.  It is has been immensely helpful.')

In [None]:
# Create a layer using pre-trained model
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype='string', trainable=False)

In [None]:
# Create Transfer Learning Model using Sequential API
model_3 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(32, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Compile the model
model_3.compile(loss='sparse_categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
# check out Model_3 summary
model_3.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense_3 (Dense)             (None, 32)                16416     
                                                                 
 dense_4 (Dense)             (None, 3)                 99        
                                                                 
Total params: 256,814,339
Trainable params: 16,515
Non-trainable params: 256,797,824
_________________________________________________________________


In [None]:
# Train model_3
model_3.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f37499925b0>

In [None]:
# Make predictions with model_3
model_3_preds_probs = model_3.predict(X_test)
model_3_preds_probs[:5]



array([[6.3429201e-01, 2.9435071e-01, 7.1357206e-02],
       [8.0430385e-04, 1.9390250e-02, 9.7980535e-01],
       [5.7653314e-03, 4.8438558e-01, 5.0984907e-01],
       [9.2982465e-01, 6.4923592e-02, 5.2517457e-03],
       [9.6043366e-01, 3.8506579e-02, 1.0597893e-03]], dtype=float32)

In [None]:
model_3_preds = np.argmax(model_3_preds_probs, axis=1)
model_3_preds[:5]

array([0, 2, 2, 0, 0])

In [None]:
# Check out model performance
model_3_performance = calculate_results(y_true=y_test, y_pred=model_3_preds)
model_3_performance

{'accuracy': 74.5545260156807,
 'precision': 0.7480392334332548,
 'recall': 0.7455452601568069,
 'f1': 0.745448091371831}

In [None]:
np.argmax(model_3.predict(['i dont have any idea about this video']))



1

# Classify Youtube Comments

In [None]:
# Load pre-trained model if you don't want to run all cells
# model = tf.keras.models.load_model('comment_sentiment_model')

In [None]:
def url_separator(url):
  """
  Gets the video ID from given youtube video URL

  Supports the `watch?v=<id>` form with any extra query parameters
  (e.g. `&t=42s`, `&list=...`), `youtu.be/<id>` short links and
  `/embed/<id>`, `/shorts/<id>`, `/live/<id>` paths.

  Args:
    url (str): URL of a YouTube video

  Returns:
    str: The video ID

  Raises:
    ValueError: If no video ID can be found in the URL
  """
  # Local import keeps this cell runnable on its own
  from urllib.parse import parse_qs, urlparse

  cleaned = url.strip()
  if '://' not in cleaned:
    # urlparse only recognises the host when a scheme is present
    cleaned = 'https://' + cleaned
  parsed = urlparse(cleaned)

  # Standard form: the ID is the `v` query parameter. Splitting the raw URL on '='
  # fails as soon as any other parameter is present.
  ids = parse_qs(parsed.query).get('v')
  if ids:
    return ids[0]

  # Path-based forms: the ID is a path segment instead of a query parameter
  segments = [part for part in parsed.path.split('/') if part]
  host = (parsed.hostname or '').lower()
  if host in ('youtu.be', 'www.youtu.be') and segments:
    return segments[0]
  if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live'):
    return segments[1]

  raise ValueError(f'Could not find a video ID in URL: {url!r}')

In [None]:
from googleapiclient.discovery import build

def scrape_comments(url, api_key):
  """
  Collects the top-level comments of the given youtube video (url) as a list.
  Needs your API_KEY from Google Cloud Console (YouTube Data API v3)

  Args:
    url: URL of the video; its ID is extracted with url_separator
    api_key: Key handed to the API client as `developerKey`

  Returns:
    list: The `textDisplay` of every top-level comment, in the order the API returned them
  """
  video_id = url_separator(url)

  youtube = build('youtube', 'v3', developerKey=api_key)

  # Arguments shared by the first request and every follow-up page request
  base_query = {'part': 'snippet',
                'videoId': video_id,
                'textFormat': 'plainText',
                'maxResults': 100}

  collected = []
  page = youtube.commentThreads().list(**base_query).execute()
  while page:
    for thread in page['items']:
      collected.append(thread['snippet']['topLevelComment']['snippet']['textDisplay'])

    # Stop once the response carries no continuation token
    if 'nextPageToken' not in page:
      break
    page = youtube.commentThreads().list(pageToken=page['nextPageToken'],
                                         **base_query).execute()
  return collected

In [None]:

def comment_sentiment_counter(comments):
  """
  Classifies the comments with model_3 and returns the count of the classes

  Predicted class 0 is counted as a dislike, class 1 as neutral and any other
  class as a like.

  Args:
    comments (list): A list of comments to be classified

  Returns:
    tuple: A tuple containing the count of the likes, neutral, and dislikes classes
           (in that order); (0, 0, 0) for an empty input

  """
  comments = list(comments)
  if not comments:
    # Nothing to classify; avoid calling predict on an empty batch
    return 0, 0, 0

  # One batched predict call instead of one call per comment: every predict call
  # carries a fixed overhead, which dominates when classifying hundreds of comments.
  probabilities = model_3.predict(np.asarray(comments, dtype=object), verbose=0)
  predicted_classes = np.argmax(probabilities, axis=1)

  dislikes = int(np.sum(predicted_classes == 0))
  neutral = int(np.sum(predicted_classes == 1))
  likes = len(comments) - dislikes - neutral  # everything that is neither 0 nor 1

  return likes, neutral, dislikes

In [None]:
import os

# Read the key from the YOUTUBE_API_KEY environment variable so a real key never has
# to be pasted into (and saved with) the notebook; falls back to the "API_KEY"
# placeholder when the variable is not set.
# NOTE: this rebinds `comments`, which held the training DataFrame earlier in the notebook.
comments = scrape_comments("https://www.youtube.com/watch?v=_tzqk8mEOLo",
                           os.environ.get("YOUTUBE_API_KEY", "API_KEY"))

In [None]:
comment_sentiment_counter(comments)

(161, 188, 216)