In [5]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Data Exploration and Preprocessing

In [2]:
data=pd.read_csv(r"C:\Users\disha\Downloads\Naive Bayes and Text Mining\Naive Bayes and Text Mining\blogs_categories.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Data,Labels
0,0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism
1,1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
2,2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
3,3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
4,4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19997 entries, 0 to 19996
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  19997 non-null  int64 
 1   Data        19997 non-null  object
 2   Labels      19997 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.8+ KB


In [4]:
data.isnull().sum()

Unnamed: 0    0
Data          0
Labels        0
dtype: int64

In [7]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = text.lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]  
    return ' '.join(words)
data['Processed_Data'] = data['Data'].apply(preprocess_text)
print(data.head())

   Unnamed: 0                                               Data       Labels  \
0           0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism   
1           1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
2           2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
3           3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
4           4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   

                                      Processed_Data  
0  xref cantaloup srv cs cmu edu alt atheism alt ...  
1  xref cantaloup srv cs cmu edu alt atheism alt ...  
2  newsgroup alt atheism path cantaloup srv cs cm...  
3  xref cantaloup srv cs cmu edu alt atheism alt ...  
4  xref cantaloup srv cs cmu edu alt atheism soc ...  


# Feature extraction using TF-IDF

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['Processed_Data'])
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Labels'])


# Naive Bayes Model for Text Classification

# Split the data into training and test sets

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement a Naive Bayes classifier

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

# Sentiment Analysis

In [16]:
from textblob import TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'
data['Sentiment'] = data['Data'].apply(get_sentiment)
print(data.head())


   Unnamed: 0                                               Data       Labels  \
0           0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism   
1           1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
2           2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
3           3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
4           4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   

                                      Processed_Data Sentiment  
0  xref cantaloup srv cs cmu edu alt atheism alt ...  positive  
1  xref cantaloup srv cs cmu edu alt atheism alt ...  positive  
2  newsgroup alt atheism path cantaloup srv cs cm...  positive  
3  xref cantaloup srv cs cmu edu alt atheism alt ...  positive  
4  xref cantaloup srv cs cmu edu alt atheism soc ...  negative  


# Examine sentiment distribution across categories

In [17]:
sentiment_distribution = data.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack().fillna(0)
print(sentiment_distribution)

Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism               0.199000    0.000  0.801000
comp.graphics             0.250000    0.001  0.749000
comp.os.ms-windows.misc   0.236000    0.000  0.764000
comp.sys.ibm.pc.hardware  0.238000    0.001  0.761000
comp.sys.mac.hardware     0.242000    0.000  0.758000
comp.windows.x            0.290000    0.002  0.708000
misc.forsale              0.229000    0.000  0.771000
rec.autos                 0.201000    0.000  0.799000
rec.motorcycles           0.262000    0.000  0.738000
rec.sport.baseball        0.249000    0.000  0.751000
rec.sport.hockey          0.297000    0.000  0.703000
sci.crypt                 0.209000    0.000  0.791000
sci.electronics           0.211000    0.000  0.789000
sci.med                   0.219000    0.000  0.781000
sci.space                 0.235000    0.001  0.764000
soc.religion.christian    0.171515    0.000  0.828485
talk.politics.guns        0.

# Evaluation

In [18]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Accuracy: 0.8910
Precision: 0.8910
Recall: 0.8910
F1 Score: 0.8904


# sentiment analysis results


The Naive Bayes classifier performed with an accuracy of 0.8910, precision of 0.8910, recall of 0.8910, and an F1 score of 0.8904.

Challenges encountered included balancing the dataset and tuning the hyperparameters for the Naive Bayes model.

Sentiment analysis showed the distribution of positive, negative, and neutral sentiments across different categories. This provides insights into the overall tone and mood of the blog posts in each category.


# Conclusion

The analysis and classification of blog posts using a Naive Bayes classifier and sentiment analysis provided valuable insights into the content and sentiment of the blog posts. The Naive Bayes model showed good performance in classifying the posts, and the sentiment analysis revealed interesting patterns in the tone of the posts across different categories