---
title: "Exploratory Data Analysis"
format:
    html: 
        code-fold: false
---

<!-- After digesting the instructions, you can delete this cell; these are assignment instructions and do not need to be included in your final submission. -->

{{< include eda.qmd >}} 

# Code

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# load the final text dataset
file_path_text_toxicity = "data/processed-data/text_toxicity.csv"
df = pd.read_csv(file_path_text_toxicity)

# Create text length feature: character count of each `text` entry.
# Rows whose `text` is missing (NaN) get a NaN length, not 0.
df['text_length'] = df['text'].str.len()

# preview of the first rows, then summary statistics of the numeric columns
print("Data Overview:")
print(df.head())
print("\nData Summary:")
print(df.describe())

# column groups used by the summaries below
categorical_features = ['subreddit', 'type', 'dt_lean', 'nmf_topic']
numerical_features = ['depth', 'score', 'text_length']

print("\nNumerical Variables Summary:")
print(df[numerical_features].describe().round(3))

# pandas' skew()/kurtosis() skip missing values by default
print("\nSkewness and Kurtosis:")
for col in numerical_features:
    print(f"\n{col}:")
    print(f"Skewness: {df[col].skew():.3f}")
    print(f"Kurtosis: {df[col].kurtosis():.3f}")

# relative frequency of the most common categories (head() keeps the top 5)
print("\nCategorical Variables Summary:")
for col in categorical_features:
    print(f"\n{col} distribution:")
    print(df[col].value_counts(normalize=True).head().round(3))

# side-by-side histograms (with KDE overlay) of text length and score
fig_dist, (ax_length, ax_score) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df['text_length'], bins=30, kde=True, color='skyblue', ax=ax_length)
ax_length.set_title('Distribution of Text Length')
ax_length.set_xlabel('Text Length')

sns.histplot(df['score'], bins=30, kde=True, color='orange', ax=ax_score)
ax_score.set_title('Distribution of Score')
ax_score.set_xlabel('Score')

fig_dist.tight_layout()
plt.show()

# Skewness and transformation
from scipy.stats import skew

# scipy's skew() returns nan as soon as one input value is NaN (its default
# nan_policy is 'propagate'), and text_length is NaN wherever `text` is
# missing, so drop missing lengths before computing the statistic.
skewness = skew(df['text_length'].dropna())
print(f"Skewness of text length: {skewness:.2f}")

# log transformation; log1p computes log(1 + x), so a length of 0 maps to 0
df['log_text_length'] = np.log1p(df['text_length'])

plt.figure(figsize=(6, 4))
sns.histplot(df['log_text_length'], bins=30, kde=True, color='purple')
plt.title('Log-Transformed Text Length Distribution')
plt.xlabel('Log(Text Length)')
plt.show()

# NOTE(review): seaborn 0.13+ warns when `palette` is passed without `hue`;
# confirm the installed version before switching to `hue=..., legend=False`.

# subreddit distribution (tick labels tilted so the names stay readable)
fig_subreddit, ax_subreddit = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='subreddit', palette='pastel', ax=ax_subreddit)
ax_subreddit.set_title('Distribution of Subreddits')
ax_subreddit.set_xlabel('Subreddit')
plt.xticks(rotation=45)
plt.show()

# type distribution
fig_type, ax_type = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='type', palette='coolwarm', ax=ax_type)
ax_type.set_title('Type Distribution')
ax_type.set_xlabel('Type')
plt.show()

# correlation matrix of the numeric columns, annotated to two decimals
numerical_cols = ['text_length', 'score', 'depth']
corr_matrix = df[numerical_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix')
plt.show()

# text length plotted against score
fig_scatter, ax_scatter = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x='text_length',
    y='score',
    alpha=0.7,
    color='green',
    ax=ax_scatter,
)
ax_scatter.set_title('Text Length vs Score')
ax_scatter.set_xlabel('Text Length')
ax_scatter.set_ylabel('Score')
plt.show()
