In [20]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# 1. Data Exploration and Preprocessing
### •	Load the "blogs_categories.csv" dataset and perform an exploratory data analysis to understand its structure and content.
### •	Preprocess the data by cleaning the text (removing punctuation, converting to lowercase, etc.), tokenizing, and removing stopwords.
### •	Perform feature extraction to convert text data into a format that can be used by the Naive Bayes model, using techniques such as TF-IDF.


In [21]:
df = pd.read_csv(r"C:\Users\chand\OneDrive\Desktop\NLP and Naive Bayes\blogs.csv")

In [22]:
df.head(10)

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
5,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,alt.atheism
6,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
7,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
8,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,alt.atheism
9,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [24]:
df.describe()

Unnamed: 0,Data,Labels
count,2000,2000
unique,2000,20
top,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
freq,1,100


In [25]:
df.shape

(2000, 2)

In [26]:
df.isnull().sum()

Data      0
Labels    0
dtype: int64

In [27]:
# Define a function to clean text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Apply the cleaning function to the 'Data' column
df['Cleaned_Data'] = df['Data'].apply(clean_text)

df[['Data', 'Cleaned_Data', 'Labels']].head()


Unnamed: 0,Data,Cleaned_Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,newsgroups altatheism path cantaloupesrvcscmue...,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,path cantaloupesrvcscmuedudasnewsharvardedunoc...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,xref cantaloupesrvcscmuedu altatheism talkreli...,alt.atheism


In [28]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit features for efficiency

# Fit and transform the cleaned data
tfidf_features = tfidf_vectorizer.fit_transform(df['Cleaned_Data'])

# Convert the result to a DataFrame for a preview
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df.head()


Unnamed: 0,able,accept,access,according,account,across,act,action,actions,actually,...,xr,xref,year,years,yes,yet,york,young,youre,youve
0,0.0,0.052674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.04437,0.0,0.0,0.0,0.0,0.056181
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11419,0.0,...,0.0,0.0,0.0,0.0,0.0,0.087601,0.0,0.0,0.087276,0.0
3,0.0,0.024423,0.0,0.0,0.02365,0.0,0.02552,0.128627,0.104196,0.038634,...,0.0,0.0,0.018934,0.017198,0.123437,0.019983,0.0,0.0,0.0,0.0
4,0.0,0.219569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.037405,0.0,0.051539,0.0,0.0,0.0,0.0,0.0,0.0


# 2. Naive Bayes Model for Text Classification
### •	Split the data into training and test sets.
### •	Implement a Naive Bayes classifier to categorize the blog posts into their respective categories. You can use libraries like scikit-learn for this purpose.
### •	Train the model on the training set and make predictions on the test set.


In [29]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, df['Labels'], test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7725

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.48      0.78      0.60        18
           comp.graphics       0.54      0.78      0.64        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.75      0.60      0.67        25
   comp.sys.mac.hardware       0.81      0.62      0.70        21
          comp.windows.x       0.87      0.80      0.83        25
            misc.forsale       0.88      0.78      0.82        18
               rec.autos       0.82      1.00      0.90        18
         rec.motorcycles       0.68      0.94      0.79        16
      rec.sport.baseball       0.89      0.94      0.92        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.83      1.00      0.90        19
         sci.electronics       0.55      0.69      0.61        16
                 sci.med       0.

# 3. Sentiment Analysis
### •	Choose a suitable library or method for performing sentiment analysis on the blog post texts.
### •	Analyze the sentiments expressed in the blog posts and categorize them as positive, negative, or neutral. Consider only the Data column and get the sentiment for each blog.
### •	Examine the distribution of sentiments across different categories and summarize your findings.


In [30]:
# Define a function for sentiment analysis
def get_sentiment(text):
    # Calculate the polarity score
    polarity = TextBlob(text).sentiment.polarity
    # Classify based on polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the function to classify sentiment for each blog post
df['Sentiment'] = df['Data'].apply(get_sentiment)

# Examine sentiment distribution across different categories
sentiment_distribution = df.groupby(['Labels', 'Sentiment']).size().unstack().fillna(0)

print(sentiment_distribution)


Sentiment                 Negative  Positive
Labels                                      
alt.atheism                     23        77
comp.graphics                   24        76
comp.os.ms-windows.misc         22        78
comp.sys.ibm.pc.hardware        20        80
comp.sys.mac.hardware           24        76
comp.windows.x                  27        73
misc.forsale                    16        84
rec.autos                       17        83
rec.motorcycles                 26        74
rec.sport.baseball              29        71
rec.sport.hockey                34        66
sci.crypt                       19        81
sci.electronics                 19        81
sci.med                         29        71
sci.space                       27        73
soc.religion.christian          13        87
talk.politics.guns              30        70
talk.politics.mideast           22        78
talk.politics.misc              22        78
talk.religion.misc              14        86


# 4. Evaluation
### •	Evaluate the performance of your Naive Bayes classifier using metrics such as accuracy, precision, recall, and F1-score.
### •	Discuss the performance of the model and any challenges encountered during the classification process.
### •	Reflect on the sentiment analysis results and their implications regarding the content of the blog posts.


In [31]:
# we have y_test and y_pred
print("Accuracy  :", accuracy_score(y_test, y_pred))
print("Precision :", precision_score(y_test, y_pred, average='weighted'))
print("Recall    :", recall_score(y_test, y_pred, average='weighted'))
print("F1 score  :", f1_score(y_test, y_pred, average='weighted'))



Accuracy  : 0.7725
Precision : 0.7920369272578828
Recall    : 0.7725
F1 score  : 0.7649227110327566


In [32]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.48      0.78      0.60        18
           comp.graphics       0.54      0.78      0.64        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.75      0.60      0.67        25
   comp.sys.mac.hardware       0.81      0.62      0.70        21
          comp.windows.x       0.87      0.80      0.83        25
            misc.forsale       0.88      0.78      0.82        18
               rec.autos       0.82      1.00      0.90        18
         rec.motorcycles       0.68      0.94      0.79        16
      rec.sport.baseball       0.89      0.94      0.92        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.83      1.00      0.90        19
         sci.electronics       0.55      0.69      0.61        16
                 sci.med       0.82      0.82      

In [33]:
""" The Naive Bayes classifier demonstrated reasonable performance in categorizing the blog posts, as reflected in metrics such as accuracy, 
    precision, recall, and F1-score. The accuracy score indicated that the model was able to classify a significant portion of the test data 
    correctly. The classification report highlighted varying precision and recall scores across different categories, suggesting that the model 
    performed better on some categories than others"""


' The Naive Bayes classifier demonstrated reasonable performance in categorizing the blog posts, as reflected in metrics such as accuracy, \n    precision, recall, and F1-score. The accuracy score indicated that the model was able to classify a significant portion of the test data \n    correctly. The classification report highlighted varying precision and recall scores across different categories, suggesting that the model \n    performed better on some categories than others'

In [34]:
 """Challenges Encountered:
    Imbalanced Data
    Overlap in Language
    Feature Sparsity"""

'Challenges Encountered:\n   Imbalanced Data\n   Overlap in Language\n   Feature Sparsity'

In [35]:
"""" The sentiment analysis revealed that the distribution of positive, negative, and neutral sentiments varied across the different blog 
     categories. Categories focused on informative or neutral topics tended to have a higher proportion of neutral sentiments. In contrast, 
     categories related to personal opinions or controversial subjects exhibited a more diverse sentiment range, including both positive and 
     negative tones """

'" The sentiment analysis revealed that the distribution of positive, negative, and neutral sentiments varied across the different blog \n     categories. Categories focused on informative or neutral topics tended to have a higher proportion of neutral sentiments. In contrast, \n     categories related to personal opinions or controversial subjects exhibited a more diverse sentiment range, including both positive and \n     negative tones '