# 1. Data Exploration and Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the blog posts; the file is expected to sit next to the notebook.
DATA_PATH = 'blogs_categories.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,Data,Labels
0,0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism
1,1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
2,2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
3,3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
4,4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism


In [None]:
# Preprocess the text data: convert to lowercase, remove punctuation and special characters, tokenize, and remove stopwords.
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources needed by word_tokenize and the stopword filter below.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Preprocess function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Clean a single document before vectorization.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is tokenized with ``word_tokenize`` and English
    stopwords are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords (common words like 'the', 'is' that don't add much meaning).
    # The stopword set is cached so the corpus is not re-read for every token,
    # and set membership replaces a linear scan of the list.
    # NOTE(review): punctuation is stripped before this check, so any stopword
    # that contains an apostrophe can no longer match here; confirm that is intended.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every raw post and keep the result next to the original text
raw_posts = df['Data']
df['cleaned_data'] = raw_posts.apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show only the first rows; rendering the whole frame floods the output with full post text.
df.head()

In [None]:
#Convert the text data into a numerical format using TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features based on your dataset size

# Transform the text data.
# The result is kept as the sparse matrix returned by fit_transform: both
# train_test_split and MultinomialNB accept sparse input, and converting to a
# dense array would allocate rows x 5000 floats that are mostly zeros.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the vocabulary and IDF weights also see the rows later used for
# testing. Consider fitting on the training rows only.
X = tfidf.fit_transform(df['cleaned_data'])

# The target labels (categories)
y = df['Labels']

# 2. Naive Bayes Model for Text Classification

In [None]:
#Split the data into training and test sets.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
#Train a Naive Bayes classifier and make predictions.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create the multinomial Naive Bayes classifier and fit it on the training split
# (fit returns the estimator itself, so the two steps can be chained).
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict a category for each held-out document
y_pred = nb_model.predict(X_test)


In [None]:
# Peek at the first few predictions instead of printing the whole array.
y_pred[:10]

# 3. Sentiment Analysis

In [None]:
pip install textblob

In [None]:
from textblob import TextBlob

# Function to get sentiment
def get_sentiment(text):
    """Classify text by the sign of its TextBlob polarity score.

    Returns 'Positive' for a polarity above zero, 'Neutral' for exactly zero,
    and 'Negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'Neutral'
    return 'Positive' if polarity > 0 else 'Negative'

# Score the raw (uncleaned) text of every post and store the label
raw_text = df['Data']
df['Sentiment'] = raw_text.apply(get_sentiment)

# How many posts fall under each sentiment label
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)


# Sentiment Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Figure 1: number of posts per sentiment label over the whole dataset
print("\nSentiment distribution across all categories")
plt.figure(figsize=(8, 6))
overall_ax = sns.countplot(data=df, x='Sentiment')
overall_ax.set_title('Overall Sentiment Distribution')
plt.show()

# Figure 2: the same counts, split by post category
print("\nSentiment distribution within each category")
plt.figure(figsize=(12, 6))
by_category_ax = sns.countplot(data=df, x='Labels', hue='Sentiment')
by_category_ax.set_title('Sentiment Distribution by Category')
# Tilt the category names so the long labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Count posts per (category, sentiment) pair and pivot the sentiments into columns.
# fill_value=0 makes a pair that never occurs show as 0 instead of NaN, which
# also keeps the counts as integers.
sentiment_distribution = df.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print(sentiment_distribution)

# 4. Evaluation

In [None]:
# Score the held-out predictions: overall accuracy first, then the per-class report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print(report)

In conclusion, the Naive Bayes classifier, with proper preprocessing and feature extraction, can effectively categorize blog posts. Performance evaluation using metrics such as accuracy, precision, recall, and F1-score provides a comprehensive view of the model’s effectiveness. Challenges such as class imbalance and text preprocessing need careful handling to ensure robust performance.

Sentiment analysis adds another layer of understanding, revealing the emotional tone of the blog posts. These insights can be crucial for content strategy, reader engagement, and marketing efforts.

By combining text classification with sentiment analysis, you gain a powerful toolkit for extracting and leveraging insights from textual data, enhancing both the analytical and strategic capabilities of your organization.