# Import necessary libraries

In [None]:
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

# Download NLTK data files

In [None]:
# Fetch the NLTK resources the rest of the notebook relies on:
#   'punkt'     - Punkt tokenizer models
#   'punkt_tab' - the tokenizer data format that word_tokenize loads on
#                 NLTK >= 3.8.2; without it tokenizing raises LookupError
#   'stopwords' - stop-word lists, including the English one used for filtering
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Show where each resource was installed; nltk.data.find raises LookupError
# when a resource is missing.
for resource_path in ('tokenizers/punkt', 'corpora/stopwords'):
    print(nltk.data.find(resource_path))

# Settings for visualizations

In [None]:
# Render matplotlib figures inline, directly below the cell that draws them
%matplotlib inline
# Apply the 'ggplot' style sheet to every figure created from here on
plt.style.use('ggplot')

# Load dataset

In [None]:
# Hugging Face Hub identifier of the IMDB movie-review dataset
DATASET_ID = "ajaykarthick/imdb-movie-reviews"

# Load the dataset and display its summary as the cell output
dataset = load_dataset(DATASET_ID)
dataset

## First glance at the dataset

The dataset is pretty big, so I will use only the test split for analysis.

In [None]:
# Materialise the test split as a pandas DataFrame
test_split = dataset['test']
df = pd.DataFrame(test_split)

# Plain-text preview of the first ten rows
print("First few entries in the dataset:")
print(df.head(10))

I see that the review at index 4 contains HTML tags, so let's take a closer look at it.

In [None]:
# Full text of the review at position 4
print(df['review'].iloc[4])

Yes, the review has html tags, but it is not a big deal, because we can remove them easily if needed.

In [None]:
# Label stored for the same review (position 4)
print("Label:", df['label'].iloc[4])

Interesting: although it is not obligatory, using "1" for positive and "0" for negative is common practice. However, this dataset uses "0" for positive and "1" for negative reviews.

# Statistics and distributions of the data:

## Quality check

### Check for missing values

In [None]:
# Print a summary of the frame: row count plus, per column, the non-null count
# and dtype. A non-null count below the row count would indicate missing values.
df.info()

Fortunately, there are no missing values in the dataset.

### Check for duplicates

In [None]:
# Flag every repeat occurrence of a review text (the first occurrence stays unflagged)
is_repeat = df.duplicated(subset='review')
duplicate_count = is_repeat.sum()
print(f"\nNumber of duplicate reviews: {duplicate_count}")

# When repeats exist, keep only the first occurrences and renumber the index
if duplicate_count > 0:
    df = df.loc[~is_repeat].reset_index(drop=True)
    print("Duplicates have been removed.")

There are 16 duplicate reviews in the dataset. That is a small number, but I will remove them, since duplicates can skew the analysis, model training, and evaluation of the results.

In [None]:
# Drop rows that are identical across ALL columns (first occurrence is kept).
# NOTE(review): the duplicate check above already removes rows sharing the same
# 'review' text, so when cells run in order this is expected to be a no-op and
# mainly serves to display the final shape.
df = df.drop_duplicates()
# Cell output: (number of rows, number of columns) after de-duplication
df.shape

### Check class distribution

In [None]:
# Number of reviews per label value
label_counts = df['label'].value_counts()
print("Class distribution:")
print(label_counts)

# Bar chart with one bar per label value
ax = sns.countplot(data=df, x='label')
ax.set_title('Sentiment Class Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

The dataset is balanced, so we don't need to worry about class imbalance.

### Analyzing Review Lengths

In [None]:
# Word count per review: split on whitespace and count the pieces
df['review_length'] = [len(review.split()) for review in df['review']]

# Summary statistics (count, mean, std, min, quartiles, max) of the word counts
print("\nReview Length Statistics:")
length_stats = df['review_length'].describe()
print(length_stats)

# Histogram of word counts with a kernel-density estimate overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['review_length'], bins=50, kde=True, ax=ax)
ax.set_title('Review Length Distribution')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# One box per label value, showing how review word counts are distributed
ax = sns.boxplot(data=df, x='label', y='review_length')
ax.set(
    title='Review Length by Sentiment',
    xlabel='Sentiment',
    ylabel='Number of Words',
)
plt.show()

There is no big difference in review lengths by sentiment, so we don't need to worry about it while splitting.

# Lexical Analysis (Most Common Words)

In [None]:
# English stop words, built once at definition time: rebuilding the set inside
# the function would re-read the stop-word list for every single review.
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Matches an HTML tag such as "<br />" or "</i>". Requiring a letter right after
# "<" (or "</") keeps comparisons like "3 < 5" from being treated as markup.
_HTML_TAG_RE = re.compile(r'</?[a-zA-Z][^<>]*>')


# Function to preprocess text
def preprocess_text(text):
    """
    Turn a raw review into a list of lowercase tokens without stop words.

    Steps, in order: replace HTML tags with a space, lowercase, delete ASCII
    punctuation, tokenize with NLTK's word_tokenize, drop English stop words.

    Args:
        text (str): Raw review text.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Strip markup first; otherwise "<br />" only loses its brackets in the
    # punctuation step and survives as the bogus token "br".
    text = _HTML_TAG_RE.sub(' ', text)
    text = text.lower()
    # Remove punctuation (characters are deleted, not replaced by spaces)
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    return [word for word in tokens if word not in _STOP_WORDS]

# Tokenize every review; each cell of the new 'tokens' column holds a list of words
df['tokens'] = df['review'].map(preprocess_text)

# Most Common Words in Positive and Negative Reviews

In [None]:
# Label encoding noted earlier in this notebook: 0 = positive, 1 = negative.
# Comparing the label column against the strings 'positive' / 'negative'
# matches no rows for that encoding and leaves both subsets empty.
POSITIVE_LABEL = 0
NEGATIVE_LABEL = 1

# Separate positive and negative reviews
positive_reviews = df[df['label'] == POSITIVE_LABEL]
negative_reviews = df[df['label'] == NEGATIVE_LABEL]

# Get all tokens for each class (flatten the per-review token lists)
positive_tokens = [token for tokens in positive_reviews['tokens'] for token in tokens]
negative_tokens = [token for tokens in negative_reviews['tokens'] for token in tokens]

# Get most common words
positive_counter = Counter(positive_tokens)
negative_counter = Counter(negative_tokens)

print("\nMost common words in positive reviews:")
print(positive_counter.most_common(20))

print("\nMost common words in negative reviews:")
print(negative_counter.most_common(20))


# Visualizing Word Clouds

In [None]:
def show_wordcloud(text, title):
    """
    Render and display a word cloud for a space-separated string of tokens.

    Args:
        text (str): Tokens joined by single spaces.
        title (str): Title drawn above the figure.

    Returns:
        WordCloud: The generated word cloud object.

    Raises:
        ValueError: If `text` contains no tokens. An empty string here usually
            means the upstream class filter selected no reviews.
    """
    if not text.strip():
        raise ValueError(
            f"No tokens available for '{title}'; check that the label filter "
            "selected any reviews."
        )

    cloud = WordCloud(width=800, height=400).generate(text)

    fig, ax = plt.subplots(figsize=(15, 7.5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    plt.show()
    return cloud


# Generate word cloud for positive reviews
positive_text = ' '.join(positive_tokens)
positive_wordcloud = show_wordcloud(positive_text, 'Word Cloud of Positive Reviews')

# Generate word cloud for negative reviews
negative_text = ' '.join(negative_tokens)
negative_wordcloud = show_wordcloud(negative_text, 'Word Cloud of Negative Reviews')
