In [1]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
data = pd.read_csv("Topic Modelling/articles.csv", encoding='latin-1')
data

Unnamed: 0,Article,Title
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms
2,You must have seen the news divided into categ...,News Classification with Machine Learning
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning
5,You must have seen the news divided into categ...,News Classification with Machine Learning
6,Natural language processing or NLP is a subfie...,Best Books to Learn NLP
7,By using a third-party application or API to m...,Send Instagram Messages using Python
8,Twitter is one of the most popular social medi...,Pfizer Vaccine Sentiment Analysis using Python
9,The squid game is currently one of the most tr...,Squid Game Sentiment Analysis using Python


In [4]:
# Combine all titles into a single string
titles_text = ' '.join(data['Title'])

# Create a WordCloud object
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(titles_text)

# Plot the Word Cloud
fig = px.imshow(wordcloud, title='Word Cloud of Titles')
fig.update_layout(showlegend=False)
fig.show()

In [5]:
# Sentiment Analysis
data['Sentiment'] = data['Article'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Sentiment Distribution
fig = px.histogram(data, x='Sentiment', title='Sentiment Distribution')
fig.show()

In [6]:
# NER
def extract_named_entities(text):
    doc = nlp(text)
    entities = defaultdict(list)
    for ent in doc.ents:
        entities[ent.label_].append(ent.text)
    return dict(entities)

data['Named_Entities'] = data['Article'].apply(extract_named_entities)

# Visualize NER
entity_counts = Counter(entity for entities in data['Named_Entities'] for entity in entities)
entity_df = pd.DataFrame.from_dict(entity_counts, orient='index').reset_index()
entity_df.columns = ['Entity', 'Count']

fig = px.bar(entity_df.head(10), x='Entity', y='Count', title='Top 10 Named Entities')
fig.show()

In [7]:
data

Unnamed: 0,Article,Title,Sentiment,Named_Entities
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis,0.666667,{'DATE': ['today']}
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms,0.020833,{}
2,You must have seen the news divided into categ...,News Classification with Machine Learning,0.6,{}
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...,0.625,"{'CARDINAL': ['only two', 'more than two']}"
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning,-0.101429,"{'ORG': ['The Multinomial Naive Bayes', 'Naive..."
5,You must have seen the news divided into categ...,News Classification with Machine Learning,0.6,{}
6,Natural language processing or NLP is a subfie...,Best Books to Learn NLP,0.283333,"{'ORG': ['NLP', 'NLP', 'NLP', 'NLP'], 'CARDINA..."
7,By using a third-party application or API to m...,Send Instagram Messages using Python,0.05,"{'ORDINAL': ['third'], 'ORG': ['API', 'Instagr..."
8,Twitter is one of the most popular social medi...,Pfizer Vaccine Sentiment Analysis using Python,0.406667,"{'PRODUCT': ['Twitter', 'Twitter'], 'CARDINAL'..."
9,The squid game is currently one of the most tr...,Squid Game Sentiment Analysis using Python,-0.108333,"{'ORG': ['NetFlix'], 'CARDINAL': ['One']}"


In [13]:
# Topic Modeling
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = vectorizer.fit_transform(data['Article'])
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topic_matrix = lda_model.fit_transform(tf)

# Visualize topics
topic_names = ["Topic " + str(i) for i in range(lda_model.n_components)]
data['Dominant_Topic'] = [topic_names[i] for i in lda_topic_matrix.argmax(axis=1)]

fig = px.bar(data['Dominant_Topic'].value_counts().reset_index(), x='Dominant_Topic', y='count', title='Topic Distribution')
fig.show()

In [12]:
data

Unnamed: 0,Article,Title,Sentiment,Named_Entities,Dominant_Topic
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis,0.666667,{'DATE': ['today']},Topic 1
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms,0.020833,{},Topic 0
2,You must have seen the news divided into categ...,News Classification with Machine Learning,0.6,{},Topic 1
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...,0.625,"{'CARDINAL': ['only two', 'more than two']}",Topic 3
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning,-0.101429,"{'ORG': ['The Multinomial Naive Bayes', 'Naive...",Topic 4
5,You must have seen the news divided into categ...,News Classification with Machine Learning,0.6,{},Topic 1
6,Natural language processing or NLP is a subfie...,Best Books to Learn NLP,0.283333,"{'ORG': ['NLP', 'NLP', 'NLP', 'NLP'], 'CARDINA...",Topic 0
7,By using a third-party application or API to m...,Send Instagram Messages using Python,0.05,"{'ORDINAL': ['third'], 'ORG': ['API', 'Instagr...",Topic 1
8,Twitter is one of the most popular social medi...,Pfizer Vaccine Sentiment Analysis using Python,0.406667,"{'PRODUCT': ['Twitter', 'Twitter'], 'CARDINAL'...",Topic 2
9,The squid game is currently one of the most tr...,Squid Game Sentiment Analysis using Python,-0.108333,"{'ORG': ['NetFlix'], 'CARDINAL': ['One']}",Topic 2


In [14]:
# Topic Modeling
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = vectorizer.fit_transform(data['Article'])
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topic_matrix = lda_model.fit_transform(tf)

# Display top terms for each topic
feature_names = vectorizer.get_feature_names_out()

# Number of top terms to display for each topic
num_top_terms = 10

for topic_idx, topic in enumerate(lda_model.components_):
    top_terms_idx = topic.argsort()[:-num_top_terms - 1:-1]
    top_terms = [feature_names[i] for i in top_terms_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}\n")

Topic 1: learning, machine, algorithms, deep, books, best, algorithm, use, applications, introduce

Topic 2: python, data, learning, using, learn, machine, want, news, language, stock

Topic 3: insurance, people, want, learn, using, python, analysis, task, sentiment, analyze

Topic 4: clustering, machine, learning, algorithm, using, classification, python, algorithms, implementation, clusters

Topic 5: algorithm, bayes, learning, naive, based, clustering, machine, classification, introduction, similar

