# Sentiment Analysis - Exploratory Data Analysis (EDA)



## Load Dataset

In [None]:
import pandas as pd
# Load the dataset from the project's GitHub release.
# Each handler reports the problem and then re-raises: if the error were only
# printed, `df` would stay undefined and every later cell would fail with a
# NameError that hides the real cause.
try:
    df = pd.read_csv('https://github.com/erlanggadewasakti/Prinsip-Sains-Data/releases/download/prod/sa-psd-dataset.csv')
    print("Dataset berhasil dimuat.")
except FileNotFoundError:
    print("Error: File dataset tidak ditemukan. Pastikan path file sudah benar.")
    raise
except Exception as e:
    print(f"Terjadi error saat membaca file: {e}")
    raise

# Exploratory Data Analysis

## Data Loading & Inspection

In [None]:
# Quick look at the raw data: the first rows, the column summary from
# DataFrame.info(), and the number of missing values in each column.
first_rows = df.head()
display(first_rows)

df.info()

missing_header = "\nJumlah nilai yang hilang per kolom:"
print(missing_header)
missing_per_column = df.isnull().sum()
print(missing_per_column)

## Data Preprocessing

In [None]:
import re

# Extract the sentiment label from the `output` column by removing the leading
# option letter ("A:" .. "E:" plus any following whitespace). The label is then
# stripped and lower-cased so that variants differing only in case or
# surrounding whitespace still match the mapping below instead of turning into NaN.
raw_labels = (
    df['output']
    .str.replace(r'^[A-E]:\s*', '', regex=True)
    .str.strip()
    .str.lower()
)

# Clean and normalize input text (lower-casing only)
df['cleaned_input'] = df['input'].str.lower()

# Collapse the five labels into three sentiment classes
label_to_sentiment = {
    'very positive': 'positive',
    'very negative': 'negative',
    'positive': 'positive',
    'negative': 'negative',
    'neutral': 'neutral'
}
df['sentiment'] = raw_labels.map(label_to_sentiment)

# Series.map yields NaN for labels missing from the mapping; report them
# explicitly so the gap is visible instead of silently propagating.
unmapped_labels = raw_labels[df['sentiment'].isna() & raw_labels.notna()]
if not unmapped_labels.empty:
    print(f"Warning: {len(unmapped_labels)} rows have an unrecognised sentiment label: "
          f"{sorted(unmapped_labels.unique())}")

display(df[['output', 'sentiment', 'input', 'cleaned_input']].head())

## Sentiment Distribution Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Colour assigned to each sentiment class; later plotting cells reuse this mapping
sentiment_colors = dict(positive='green', negative='orange', neutral='blue')

# Bar chart of the number of rows in each sentiment class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=df,
    x='sentiment',
    hue='sentiment',
    palette=sentiment_colors,
    legend=False,
    ax=ax,
)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

## Text Statistics Analysis

In [None]:
# Calculate text statistics for each cleaned input
df['char_length'] = df['cleaned_input'].str.len()
df['word_count'] = df['cleaned_input'].str.split().str.len()

# Characters per whitespace-separated word. Note that char_length counts every
# character (including the spaces between words), so this is an upper bound on
# the true average word length.
# Rows with zero words (e.g. whitespace-only text) are masked to NaN before the
# division: a plain division would give inf there, which fillna(0) does not
# remove and which would distort the plots and statistics computed later.
nonzero_word_count = df['word_count'].where(df['word_count'] > 0)
df['avg_word_length'] = (df['char_length'] / nonzero_word_count).fillna(0)

display(df[['cleaned_input', 'char_length', 'word_count', 'avg_word_length']].head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One stacked histogram (with KDE curves) per text statistic, split by sentiment
text_stats = ['char_length', 'word_count', 'avg_word_length']
for stat_name in text_stats:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data=df,
        x=stat_name,
        hue='sentiment',
        kde=True,
        multiple='stack',
        palette=sentiment_colors,
        ax=ax,
    )
    ax.set_title(f'Distribution of {stat_name} by Sentiment')
    ax.set_xlabel(stat_name)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Visualize text statistics distribution using boxplots, one figure per statistic
text_stats = ['char_length', 'word_count', 'avg_word_length']
for stat in text_stats:
    plt.figure(figsize=(10, 6))
    # Passing `palette` without `hue` is deprecated in seaborn 0.13; assign the
    # x variable to `hue` and hide the redundant legend, matching the countplot
    # call used for the sentiment distribution.
    sns.boxplot(data=df, x='sentiment', y=stat, hue='sentiment',
                palette=sentiment_colors, legend=False)
    plt.title(f'Distribution of {stat} by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel(stat)
    plt.show()

## Content Analysis (N-grams)

In [None]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

# Load the English stopword list, downloading the NLTK corpus first when the
# lookup fails because it is not available locally.
corpus_missing = False
try:
    stopwords_english = stopwords.words('english')
except LookupError:
    corpus_missing = True

if corpus_missing:
    nltk.download('stopwords')
    stopwords_english = stopwords.words('english')

print("Libraries imported and stopwords downloaded.")

In [None]:
# Analyze the most frequent N-grams (unigrams, bigrams, trigrams) for each sentiment
ngram_ranges = [(1, 1), (2, 2), (3, 3)]
# unique() would include NaN for rows without a sentiment label; drop it so the
# loop does not run a pass for a "nan" class that can never match any row.
sentiments = df['sentiment'].dropna().unique()

for sentiment in sentiments:
    print(f"Analyzing sentiment: {sentiment}")
    sentiment_df = df[df['sentiment'] == sentiment]
    cleaned_text = sentiment_df['cleaned_input'].dropna()

    if cleaned_text.empty:
        print(f"No cleaned text available for sentiment: {sentiment}")
        continue

    # Fall back to gray for a class that has no entry in the colour mapping
    current_sentiment_color = sentiment_colors.get(sentiment, 'gray')

    for n_range in ngram_ranges:
        print(f"  Analyzing {n_range}-grams")
        vectorizer = CountVectorizer(ngram_range=n_range, stop_words=stopwords_english)

        # fit_transform raises ValueError when no terms survive tokenisation
        # and stopword removal; report it and move on to the next n-gram size.
        try:
            X = vectorizer.fit_transform(cleaned_text)
        except ValueError as e:
            print(f"  Could not fit vectorizer for {n_range}-grams and sentiment {sentiment}: {e}")
            continue

        # Column sums of the document-term matrix give the total count per n-gram
        sum_words = X.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        top_ngrams = words_freq[:20]

        if not top_ngrams:
            print(f"  No {n_range}-grams found for sentiment: {sentiment}")
            continue

        top_ngrams_df = pd.DataFrame(top_ngrams, columns=['ngram', 'count'])

        plt.figure(figsize=(10, 6))
        sns.barplot(x='count', y='ngram', data=top_ngrams_df, color=current_sentiment_color)
        plt.title(f'Top 20 {n_range}-grams for Sentiment: {sentiment}')
        plt.xlabel('Count')
        plt.ylabel(f'{n_range}-gram')
        plt.tight_layout()
        plt.show()

print("N-gram analysis complete.")

## Visual Analysis (Word Clouds)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Load the English stopword list, downloading the NLTK corpus if it is missing
try:
    stopwords_english = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords_english = stopwords.words('english')

# Set copy for membership tests: checking every token against the list would
# scan the whole list each time. `stopwords_english` itself stays a list
# because other cells pass it on as-is.
stopword_set = set(stopwords_english)

# Generate word cloud for all text
all_text = ' '.join(df['cleaned_input'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white',
                     stopwords=stopwords_english).generate(all_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of All Text (excluding stopwords)')
plt.show()

# Identify the 50 most common words overall (excluding stopwords)
all_words = [word for word in all_text.split() if word not in stopword_set]
overall_word_counts = Counter(all_words)
most_common_overall = {word for word, count in overall_word_counts.most_common(50)}

# Words removed from every per-sentiment cloud: stopwords plus the overall top 50
excluded_words = most_common_overall | stopword_set

# Generate word clouds per sentiment (excluding overall common words).
# dropna() keeps rows without a sentiment label from producing a "nan" pass.
sentiments = df['sentiment'].dropna().unique()

for sentiment in sentiments:
    sentiment_text = ' '.join(df[df['sentiment'] == sentiment]['cleaned_input'].dropna())

    if sentiment_text:
        sentiment_words_filtered = [word for word in sentiment_text.split()
                                    if word not in excluded_words]
        filtered_sentiment_text = ' '.join(sentiment_words_filtered)

        if filtered_sentiment_text:
            wordcloud = WordCloud(width=800, height=400, background_color='white',
                                stopwords=stopwords_english).generate(filtered_sentiment_text)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Word Cloud for Sentiment: {sentiment} (excluding overall common words)')
            plt.show()
        else:
            print(f"No significant words remaining for sentiment: {sentiment} after filtering.")
    else:
        print(f"No cleaned text available for sentiment: {sentiment}")

## Summary

---

# Additional Analysis

---

# Advanced Analysis

## Descriptive Statistics

In [None]:
import pandas as pd
import numpy as np

# Descriptive statistics for each sentiment
print("=" * 80)
print("STATISTIK DESKRIPTIF BERDASARKAN SENTIMEN")
print("=" * 80)

text_features = ['char_length', 'word_count', 'avg_word_length']
# unique() would include NaN for rows without a sentiment label; NaN is a float,
# so the `.upper()` call below would raise AttributeError. Drop it up front.
sentiments = df['sentiment'].dropna().unique()

# Per-sentiment summary tables, keyed by sentiment label
descriptive_stats = {}

for sentiment in sentiments:
    print(f"\n{'='*80}")
    print(f"Sentimen: {sentiment.upper()}")
    print(f"{'='*80}")

    sentiment_df = df[df['sentiment'] == sentiment]
    stats_dict = {}

    for feature in text_features:
        values = sentiment_df[feature]
        stats_dict[feature] = {
            'count': values.count(),
            'mean': values.mean(),
            'median': values.median(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
            'q25': values.quantile(0.25),
            'q75': values.quantile(0.75)
        }

    # Transpose so each feature is a row and each statistic a column
    stats_df = pd.DataFrame(stats_dict).T
    print(f"\n{stats_df.round(2)}")

    descriptive_stats[sentiment] = stats_df

print("\n" + "=" * 80)
print("STATISTIK DESKRIPTIF KESELURUHAN")
print("=" * 80)
overall_stats = df[text_features].describe()
print(f"\n{overall_stats.round(2)}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the distribution of each text feature per sentiment with violin plots
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, feature in enumerate(text_features):
    # Passing `palette` without `hue` is deprecated in seaborn 0.13; assign the
    # x variable to `hue` and hide the redundant legend.
    sns.violinplot(data=df, x='sentiment', y=feature, hue='sentiment',
                   palette=sentiment_colors, legend=False, ax=axes[idx])
    axes[idx].set_title(f'Distribusi {feature} berdasarkan Sentimen', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Sentimen', fontsize=10)
    axes[idx].set_ylabel(feature, fontsize=10)
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Visualize comparison of Mean and Median as grouped bars (one group per sentiment)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, feature in enumerate(text_features):
    mean_values = [df[df['sentiment'] == s][feature].mean() for s in sentiments]
    median_values = [df[df['sentiment'] == s][feature].median() for s in sentiments]

    # Group positions; the two bars of a group sit half a bar-width either side
    x = np.arange(len(sentiments))
    width = 0.35

    axes[idx].bar(x - width/2, mean_values, width, label='Mean', alpha=0.8)
    axes[idx].bar(x + width/2, median_values, width, label='Median', alpha=0.8)

    axes[idx].set_xlabel('Sentimen', fontsize=10)
    axes[idx].set_ylabel('Nilai', fontsize=10)
    axes[idx].set_title(f'Perbandingan Mean & Median {feature}', fontsize=12, fontweight='bold')
    axes[idx].set_xticks(x)
    axes[idx].set_xticklabels(sentiments)
    axes[idx].legend()
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Calculate overall correlation matrix between the numeric text features
correlation_matrix = df[text_features].corr()

print("=" * 60)
print("MATRIKS KORELASI KESELURUHAN")
print("=" * 60)
print(correlation_matrix.round(3))
print("\n")

# Visualize overall correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8},
            vmin=-1, vmax=1, fmt='.3f')
plt.title('Matriks Korelasi Antar Variabel Numerik (Keseluruhan)',
          fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Analyze correlation per sentiment
print("=" * 60)
print("MATRIKS KORELASI PER SENTIMEN")
print("=" * 60)

# Size the grid from the sentiments actually present instead of assuming three
# panels: a fixed 1x3 grid raises IndexError with more classes and leaves an
# empty panel with fewer. NaN entries are skipped because they match no rows
# and have no `.upper()`.
labelled_sentiments = [s for s in sentiments if pd.notna(s)]
n_panels = len(labelled_sentiments)

if n_panels:
    # squeeze=False keeps `axes` two-dimensional even for a single panel
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)

    for ax, sentiment in zip(axes[0], labelled_sentiments):
        sentiment_df = df[df['sentiment'] == sentiment]
        corr_matrix = sentiment_df[text_features].corr()

        print(f"\nSentimen: {sentiment.upper()}")
        print("-" * 60)
        print(corr_matrix.round(3))

        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=1, ax=ax,
                    vmin=-1, vmax=1, fmt='.3f', cbar_kws={"shrink": 0.8})
        ax.set_title(f'Korelasi - Sentimen {sentiment.capitalize()}',
                     fontsize=12, fontweight='bold')

    plt.tight_layout()
    plt.show()

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Numeric encoding of the sentiment classes used for the correlation below
sentiment_encoding = {'positive': 1, 'neutral': 0, 'negative': -1}
df['sentiment_encoded'] = df['sentiment'].map(sentiment_encoding)

# Correlation between the numeric text features and the encoded sentiment
correlation_features = ['char_length', 'word_count', 'avg_word_length', 'sentiment_encoded']
correlation_matrix = df[correlation_features].corr()

# Readable axis labels, in the same order as correlation_features
labels = ['Panjang Karakter', 'Jumlah Kata', 'Rata-rata Panjang Kata', 'Sentimen']

# Heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=2,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1,
    fmt='.3f',
    annot_kws={'size': 12, 'weight': 'bold'},
    ax=ax,
)
ax.set_title('Matriks Korelasi antara Fitur Numerik dan Sentimen\n(Positive=1, Neutral=0, Negative=-1)',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Fitur', fontsize=12, fontweight='bold')
ax.set_ylabel('Fitur', fontsize=12, fontweight='bold')

# Place the renamed tick labels at the centre of each heatmap cell
cell_centres = np.arange(len(labels)) + 0.5
ax.set_xticks(cell_centres)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_yticks(cell_centres)
ax.set_yticklabels(labels, rotation=0)

plt.tight_layout()
plt.show()

# Print the correlation of each text feature with the encoded sentiment
banner = "=" * 70
print(banner)
print("KORELASI ANTARA FITUR NUMERIK DAN SENTIMEN")
print(banner)
print("\nKorelasi dengan Sentimen:")
for feature_label, feature_name in zip(labels[:-1], correlation_features[:-1]):
    feature_corr = correlation_matrix.loc[feature_name, 'sentiment_encoded']
    print(f"  - {feature_label} ({feature_name}): {feature_corr:.4f}")
print("\nInterpretasi:")
for interpretation in (
    "  Nilai mendekati 0 = Tidak ada korelasi linear",
    "  Nilai mendekati 1 = Korelasi positif kuat",
    "  Nilai mendekati -1 = Korelasi negatif kuat",
):
    print(interpretation)
print(banner)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 2x2 grid: three pairwise scatter plots of the text features, coloured by
# sentiment, plus a text panel with the correlation notes.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# (target axes, x column, y column, x-axis label, y-axis label)
scatter_panels = [
    (axes[0, 0], 'char_length', 'word_count', 'Character Length', 'Word Count'),
    (axes[0, 1], 'char_length', 'avg_word_length', 'Character Length', 'Average Word Length'),
    (axes[1, 0], 'word_count', 'avg_word_length', 'Word Count', 'Average Word Length'),
]

for panel, x_column, y_column, x_label, y_label in scatter_panels:
    # One scatter series per sentiment so each class gets its own colour and legend entry
    for sentiment in sentiments:
        sentiment_data = df[df['sentiment'] == sentiment]
        panel.scatter(
            sentiment_data[x_column],
            sentiment_data[y_column],
            label=sentiment,
            alpha=0.5,
            s=20,
            color=sentiment_colors[sentiment],
        )
    panel.set_xlabel(x_label, fontsize=10)
    panel.set_ylabel(y_label, fontsize=10)
    panel.set_title(f'{x_label} vs {y_label}', fontsize=12, fontweight='bold')
    panel.legend()
    panel.grid(alpha=0.3)

# Fourth panel: static text box with the correlation notes (no axes drawn)
insight_text = (
    'Korelasi Insight:\n\n'
    '• char_length & word_count\n  memiliki korelasi positif tinggi\n\n'
    '• avg_word_length relatif\n  independen dari panjang teks'
)
notes_panel = axes[1, 1]
notes_panel.axis('off')
notes_panel.text(0.5, 0.5, insight_text,
                 ha='center', va='center', fontsize=11,
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

## Sentiment Proportions and Outlier Detection

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Calculate sentiment distribution (row counts) and proportions (percent)
sentiment_counts = df['sentiment'].value_counts()
sentiment_proportions = df['sentiment'].value_counts(normalize=True) * 100

print("=" * 60)
print("DISTRIBUSI DAN PROPORSI SENTIMEN")
print("=" * 60)
print("\nJumlah Data per Sentimen:")
print(sentiment_counts)
print("\nProporsi (%) per Sentimen:")
print(sentiment_proportions.round(2))

# Visualize proportions with pie chart and bar chart
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# One colour per class, in the same order as sentiment_counts
colors_list = [sentiment_colors[s] for s in sentiment_counts.index]

# Pie Chart
axes[0].pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%',
           startangle=90, colors=colors_list, explode=[0.05]*len(sentiment_counts))
axes[0].set_title('Proporsi Sentimen dalam Dataset', fontsize=14, fontweight='bold')

# Bar Chart
bars = axes[1].bar(sentiment_counts.index, sentiment_counts.values,
                   color=colors_list, alpha=0.8, edgecolor='black')
axes[1].set_xlabel('Sentimen', fontsize=11)
axes[1].set_ylabel('Jumlah Data', fontsize=11)
axes[1].set_title('Distribusi Jumlah Data per Sentimen', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Annotate each bar with its count and share. bar_label places the text a fixed
# number of points above the bar, so the spacing is the same for any dataset
# size (a fixed offset in data units would not scale with the counts).
bar_captions = [f'{count}\n({sentiment_proportions[sentiment]:.1f}%)'
                for sentiment, count in zip(sentiment_counts.index, sentiment_counts.values)]
axes[1].bar_label(bars, labels=bar_captions, padding=3, fontsize=10, fontweight='bold')
# Extra headroom so the two-line captions are not clipped at the top of the axes
axes[1].margins(y=0.15)

plt.tight_layout()
plt.show()

In [None]:
# Identify outliers in each text feature using the IQR method (1.5 * IQR fences).
# The fences and counts are computed over the whole dataset; the boxplot below
# shows the same feature split by sentiment.
for feature in text_features:
    plt.figure(figsize=(12, 6))

    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]

    print(f"\n{'='*60}")
    print(f"Outlier Detection for {feature}")
    print(f"{'='*60}")
    print(f"Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"Lower Bound: {lower_bound:.2f}, Upper Bound: {upper_bound:.2f}")
    print(f"Number of Outliers: {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")

    # Passing `palette` without `hue` is deprecated in seaborn 0.13; assign the
    # x variable to `hue` and hide the redundant legend.
    sns.boxplot(data=df, x='sentiment', y=feature, hue='sentiment',
                palette=sentiment_colors, legend=False)
    plt.title(f'Outlier Detection - {feature}', fontsize=14, fontweight='bold')
    plt.xlabel('Sentiment')
    plt.ylabel(feature)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

## Text Characteristics by Sentiment

In [None]:
# Descriptive statistics of the text features for each sentiment
print("=" * 80)
print("STATISTIK DESKRIPTIF BERDASARKAN SENTIMEN")
print("=" * 80)

text_features = ['char_length', 'word_count', 'avg_word_length']
# unique() would include NaN for rows without a sentiment label; NaN is a float,
# so the `.upper()` call below would raise AttributeError. Drop it up front.
sentiments = df['sentiment'].dropna().unique()

for sentiment in sentiments:
    print(f"\n{'='*80}")
    print(f"Sentimen: {sentiment.upper()}")
    print(f"{'='*80}")

    sentiment_df = df[df['sentiment'] == sentiment]
    stats_dict = {}

    for feature in text_features:
        values = sentiment_df[feature]
        stats_dict[feature] = {
            'count': values.count(),
            'mean': values.mean(),
            'median': values.median(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
            'q25': values.quantile(0.25),
            'q75': values.quantile(0.75)
        }

    # Transpose so each feature is a row and each statistic a column
    stats_df = pd.DataFrame(stats_dict).T
    print(f"\n{stats_df.round(2)}")

print("\n" + "=" * 80)
print("STATISTIK DESKRIPTIF KESELURUHAN")
print("=" * 80)
overall_stats = df[text_features].describe()
print(f"\n{overall_stats.round(2)}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Compare text characteristics across sentiments: mean vs. median per feature
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Face colour of each bar, in the same order as `sentiments`
bar_colors = [sentiment_colors[s] for s in sentiments]

for idx, feature in enumerate(text_features):
    sentiment_means = [df[df['sentiment'] == s][feature].mean() for s in sentiments]
    sentiment_medians = [df[df['sentiment'] == s][feature].median() for s in sentiments]

    # Group positions; the two bars of a group sit half a bar-width either side
    x = np.arange(len(sentiments))
    width = 0.35

    # Colours are passed through `color=` (face colour only). Calling
    # bar.set_color() afterwards would set the edge colour as well and discard
    # the black outline requested via `edgecolor`.
    # Mean bars are drawn at alpha 0.8, median bars lighter at alpha 0.5.
    axes[idx].bar(x - width/2, sentiment_means, width, label='Mean',
                  color=bar_colors, alpha=0.8, edgecolor='black')
    axes[idx].bar(x + width/2, sentiment_medians, width, label='Median',
                  color=bar_colors, alpha=0.5, edgecolor='black')

    axes[idx].set_xlabel('Sentimen', fontsize=11)
    axes[idx].set_ylabel(f'{feature}', fontsize=11)
    axes[idx].set_title(f'Perbandingan {feature} Antar Sentimen', fontsize=12, fontweight='bold')
    axes[idx].set_xticks(x)
    axes[idx].set_xticklabels(sentiments)
    axes[idx].legend()
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Summary table of characteristics per sentiment
print("\n" + "=" * 80)
print("RINGKASAN KARAKTERISTIK TEKS PER SENTIMEN")
print("=" * 80)

# Build all rows first and create the DataFrame once
summary_data = []
for sentiment in sentiments:
    sentiment_df = df[df['sentiment'] == sentiment]
    summary_data.append({
        'Sentiment': sentiment,
        'Count': len(sentiment_df),
        'Proportion (%)': (len(sentiment_df) / len(df)) * 100,
        'Avg Char Length': sentiment_df['char_length'].mean(),
        'Avg Word Count': sentiment_df['word_count'].mean(),
        'Avg Word Length': sentiment_df['avg_word_length'].mean()
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df.round(2))