In [27]:
import pandas as pd

import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the raw user-review table from the CSV file in the working directory.
df = pd.read_csv('all_users2.csv')

**Pre-process the dataset: fill missing values and convert columns to the proper types before analysis.**

In [30]:
# Normalise the review column in a single pass: missing values become empty
# strings, then every value is coerced to str.
df['review'] = df['review'].fillna('').astype(str)

# Pull out the two columns used downstream: each comment's ID and its review text.
comment_ids = df['comment_id']
reviews = df['review']

In [29]:
#pre-process : clean and tokenize
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, comment_id):
    """Lower-case, tokenize and filter a single review.

    Parameters
    ----------
    text : str
        The review text to clean.
    comment_id
        Identifier of the review; passed through unchanged so callers can
        keep tokens and ID together.

    Returns
    -------
    tuple
        ``(filtered_tokens, comment_id)`` where ``filtered_tokens`` is the list
        of lower-cased tokens that are purely alphanumeric and are not English
        stop words.
    """
    # The stop-word set is the same for every review, so fetch the cached copy
    # instead of rebuilding it on each call.
    stop_words = _english_stop_words()
    word_tokens = word_tokenize(text.lower())
    filtered_tokens = [
        word for word in word_tokens
        if word.isalnum() and word not in stop_words
    ]
    return filtered_tokens, comment_id

# Compound-score cut-offs used to label a review as positive / negative.
POSITIVE_THRESHOLD = 0.45
NEGATIVE_THRESHOLD = -0.45

# NOTE: SentimentIntensityAnalyzer needs the NLTK 'vader_lexicon' resource
# to be installed (nltk.download('vader_lexicon')).
sia = SentimentIntensityAnalyzer()
negative_comment_ids = []

#iterate over reviews and comment IDs
for review, comment_id in zip(reviews, comment_ids):

    # Score the raw review text rather than the stop-word-filtered tokens.
    # VADER's rules rely on negations ("not", "no", "n't"), the contrastive
    # "but", intensifiers ("very", "so"), punctuation and capitalisation.
    # Stop-word removal, the isalnum() filter and lower-casing strip those cues,
    # so e.g. "not good" would be scored as "good" and negatives would be missed.
    sentiment_score = sia.polarity_scores(review)
    compound = sentiment_score['compound']

    if compound >= POSITIVE_THRESHOLD:
        sentiment = 'positive'
    elif compound <= NEGATIVE_THRESHOLD:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    # Only the IDs of negative reviews are collected.
    if sentiment == 'negative':
        negative_comment_ids.append(comment_id)


In [31]:
# Bundle the analyzer together with the flagged comment IDs and persist both.
data_to_pickle = dict(model=sia, negative_comment_ids=negative_comment_ids)

with open('sentiment_analyser.pkl', 'wb') as out_file:
    out_file.write(pickle.dumps(data_to_pickle))

# Read the file straight back in. Unpickling is only safe here because the
# file was written by this notebook; never unpickle files from untrusted sources.
with open('sentiment_analyser.pkl', 'rb') as in_file:
    data = pickle.loads(in_file.read())

# Unpack the restored SentimentIntensityAnalyzer and the negative comment IDs.
loaded_sia, loaded_negative_comment_ids = data['model'], data['negative_comment_ids']

print("Model and data have been loaded successfully.")

Model and data have been loaded successfully.
