In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used by the preprocessing step (stop-word list and
# the WordNet data behind the lemmatizer).
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Read the raw comments dataset into a DataFrame.
df = pd.read_csv('C:\\Users\\Home\\Desktop\\comments\\sentiment_dataset.csv')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,CommentId,VideoId,Text,Likes
0,UgzarqjaaPC7TbFINNx4AaABAg,dQw4w9WgXcQ,1 billion views for never gonna give you up a...,1371237
1,UgzZfeHlzDX8I39KnBN4AaABAg,dQw4w9WgXcQ,somebody in june 2024,1018
2,UgzZFBL6zRpXgdquvd54AaABAg,dQw4w9WgXcQ,everytime someone likes this comment i will ri...,43
3,UgycbTjm2ndO6Xai_0h4AaABAg,dQw4w9WgXcQ,when you rickroll someone you have to rickroll...,26
4,Ugzw1uX9tgi1L0mL2dB4AaABAg,dQw4w9WgXcQ,in a few years this will be an anthem,14


In [4]:
# Print the column dtypes and non-null counts; in the recorded run below,
# `Text` has 486 non-null values out of 500 rows, i.e. some comments are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CommentId  500 non-null    object
 1   VideoId    500 non-null    object
 2   Text       486 non-null    object
 3   Likes      500 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 15.8+ KB


In [5]:
# Dimensions of the frame as (rows, columns).
df.shape

(500, 4)

In [6]:
# List the column labels before deciding which ones to drop.
df.columns

Index(['CommentId', 'VideoId', 'Text', 'Likes'], dtype='object')

In [7]:
# Only the comment text is needed for sentiment analysis; the identifier and
# like-count columns are discarded. errors='ignore' makes the cell safe to
# re-run after the columns are already gone.
columns_to_drop = ['CommentId', 'VideoId', 'Likes']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [8]:
# Confirm that only the text column remains after the drop.
df.head(n=5)

Unnamed: 0,Text
0,1 billion views for never gonna give you up a...
1,somebody in june 2024
2,everytime someone likes this comment i will ri...
3,when you rickroll someone you have to rickroll...
4,in a few years this will be an anthem


In [26]:
# Shared resources for text preprocessing, built once at cell level so that
# preprocess_text does not recreate them for every comment.
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(text):
    """Normalise a raw comment into a space-separated string of lemmas.

    Steps, in order: strip URLs, strip @mentions (the whole handle) and '#'
    signs (the hashtag word itself is kept), replace every non-letter with a
    space, lowercase, drop English stop words, and lemmatize what remains.

    Parameters
    ----------
    text : object
        The raw comment. Anything that is not a ``str`` (e.g. the NaN that
        pandas uses for a missing comment) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is not a string or nothing
        survives the cleaning.

    Notes
    -----
    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``
    instance being defined before the first call.
    """
    if not isinstance(text, str):
        return ""

    # Remove URLs. `http\S+` already covers https links.
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove mentions and hashtag markers. The previous pattern `\@w+` only
    # matched "@" followed by literal 'w' characters, so handles such as
    # "@someone" leaked into the cleaned text as ordinary words.
    text = re.sub(r"@\w+|#", "", text)
    # Replace digits, punctuation and any other non-letter with a space.
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()

    tokens = [
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    ]
    return " ".join(tokens)

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon has to be available locally before the analyzer is
# instantiated, so download it first.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def get_sentiment(text):
    """Label ``text`` as 'positive', 'negative' or 'neutral'.

    The label is derived from the ``compound`` entry of
    ``sia.polarity_scores(text)``: strictly above 0.05 is positive, strictly
    below -0.05 is negative, and everything in between is neutral.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound > 0.05:
        return 'positive'
    if compound < -0.05:
        return 'negative'
    return 'neutral'

# Build the cleaned-text column. No earlier cell in the notebook creates it,
# so on a fresh kernel the sentiment step below would raise
# KeyError: 'cleaned_text'. Missing comments (NaN) become empty strings.
df['cleaned_text'] = df['Text'].apply(preprocess_text)

# Create sentiment column
df['Sentiment'] = df['cleaned_text'].apply(get_sentiment)

# Save the updated dataset with the sentiment column
df.to_csv('sentiment_dataset_with_sentiment.csv', index=False)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [28]:
# Reload the labelled dataset from the same relative path the labelling cell
# wrote it to. Reading from a hard-coded absolute Desktop path could load a
# stale (or missing) file whenever the notebook's working directory differs.
# Note: empty `cleaned_text` strings come back from the CSV as NaN.
df = pd.read_csv('sentiment_dataset_with_sentiment.csv')

In [34]:

# Handle NaN values: comments that cleaned down to an empty string are read
# back from the CSV as NaN. Assign the result instead of calling an inplace
# method on a column selection, which does not update `df` under pandas
# copy-on-write.
df['cleaned_text'] = df['cleaned_text'].fillna('')

# Column holding the labels produced by the VADER step.
# The previous version overwrote this column with unseeded random labels
# ("for demonstration"), which discarded the real sentiment and made the
# targets irreproducible noise; the real labels are used as-is now.
label_column = 'Sentiment'

# Requested encoding of the one-hot target columns: 0=positive, 1=negative, 2=neutral
class_codes = {'positive': 0, 'negative': 1, 'neutral': 2}

# One-hot encode 'Sentiment'. Reindexing guarantees that all three class
# columns exist even if a class never occurs in the data; otherwise the
# df[[0, 1, 2]] selection below would raise a KeyError.
df_encoded = pd.get_dummies(df[label_column]).reindex(
    columns=list(class_codes), fill_value=False
)

# Replace the original 'Sentiment' column with the one-hot columns, named 0/1/2.
df = pd.concat([df.drop(columns=[label_column]), df_encoded], axis=1).rename(
    columns=class_codes
)

# Imported here because no earlier cell brings train_test_split into scope.
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df[[0, 1, 2]], test_size=0.2, random_state=42
)
