In [2]:
import re
import string
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
# Read the labelled review CSV and preview its first rows.
raw_reviews_csv = '../../data/IMDB_dataset.csv'
labeled_df = pd.read_csv(raw_reviews_csv)
labeled_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Make sure the NLTK stop-word corpus is available, then build the two
# module-level helpers that preprocess_text relies on.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)  # set gives O(1) membership tests
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /home/dhurba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def preprocess_text(text):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps, in order: drop non-ASCII characters, strip HTML tags, remove URLs,
    @mentions and #hashtags, delete ASCII punctuation, lowercase, split on
    whitespace, discard tokens found in the module-level ``stop_words`` set,
    and stem the remainder with the module-level ``stemmer``.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives.
    """
    # Dropping everything outside ASCII removes emoji, but also any other
    # non-ASCII character (e.g. accented letters).
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Tags are stripped BEFORE URLs and replaced by a space:
    #  * a space keeps "end.<br /><br />Next" from fusing into one token;
    #  * going first stops the URL pattern from swallowing the closing '">' of
    #    '<a href="http://...">', which would leave a dangling '<' for the tag
    #    pattern to pair with a later '>' and delete real text in between.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.lower()
    # NOTE(review): punctuation is already gone here, so a stop-word entry that
    # contains an apostrophe can never match -- confirm whether that is intended.
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

In [6]:
# Run the cleaning function over every review and store the result alongside
# the original text.
raw_reviews = labeled_df['review']
labeled_df['cleaned_text_with_stop_words_removal'] = raw_reviews.apply(preprocess_text)

In [7]:
# Audit the cleaned column for unusable rows, drop them, and flag duplicates.
cleaned_col = 'cleaned_text_with_stop_words_removal'

# preprocess_text always returns a str, so a review whose every token was
# removed shows up as '' rather than NaN. Treat blank strings as missing too;
# otherwise they pass this check and come back as NaN once the saved CSV is
# read again.
is_missing = labeled_df[cleaned_col].fillna('').str.strip().eq('')
null_rows = labeled_df[is_missing]
print("Rows with null values in 'cleaned_text_with_stop_words_removal':")
print(null_rows)

# Rebind instead of mutating in place so re-running the cell stays predictable.
labeled_df = labeled_df.loc[~is_missing].copy()

# Boolean mask marking every repeat of an already-seen cleaned text
# (rows are only flagged here, not removed).
duplicates_text = labeled_df.duplicated(subset=[cleaned_col])

Rows with null values in 'cleaned_text_with_stop_words_removal':
Empty DataFrame
Columns: [review, sentiment, cleaned_text_with_stop_words_removal]
Index: []


In [8]:
# Tally reviews per sentiment label. Any value other than 'positive' or
# 'negative' is counted as neutral.
sentiment_labels = labeled_df['sentiment']
countpositive = int((sentiment_labels == 'positive').sum())
countnegative = int((sentiment_labels == 'negative').sum())
countneutral = len(sentiment_labels) - countpositive - countnegative

print(countpositive,countnegative,countneutral)

25000 25000 0


In [9]:
# Write the frame (original columns plus the cleaned text) to disk without the
# index column, then echo it.
preprocessed_csv = "../../data/imdb_preprocess.csv"
labeled_df.to_csv(preprocessed_csv, encoding='utf-8', index=False)
print(labeled_df)

                                                  review sentiment  \
0      One of the other reviewers has mentioned that ...  positive   
1      A wonderful little production. <br /><br />The...  positive   
2      I thought this was a wonderful way to spend ti...  positive   
3      Basically there's a family where a little boy ...  negative   
4      Petter Mattei's "Love in the Time of Money" is...  positive   
...                                                  ...       ...   
49995  I thought this movie did a down right good job...  positive   
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative   
49997  I am a Catholic taught in parochial elementary...  negative   
49998  I'm going to have to disagree with the previou...  negative   
49999  No one expects the Star Trek movies to be high...  negative   

                    cleaned_text_with_stop_words_removal  
0      one review mention watch 1 oz episod youll hoo...  
1      wonder littl product film techniqu

In [11]:
# Reload the file written earlier so the chart reflects what is on disk.
preprocess_df = pd.read_csv('../../data/imdb_preprocess.csv')

# Number of rows per sentiment label, ordered by label.
class_distribution = preprocess_df['sentiment'].value_counts().sort_index()

# Two-column frame in the shape seaborn's barplot expects.
class_dist_df = pd.DataFrame({
    'Class': class_distribution.index,
    'Count': class_distribution.values
})

# savefig does not create missing directories, so make sure the report folder
# exists before writing the image.
figure_path = Path('../../reports/imdb/class_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit figure/axes objects instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Class', y='Count', data=class_dist_df, ax=ax)

ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)  # Rotate x labels for readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.savefig(figure_path, format='png', dpi=300, transparent=True)
# The figure is only needed as a file; close it to release its memory.
plt.close(fig)
