<a href="https://colab.research.google.com/github/dhineshkannan6543/Machine-Learning-/blob/main/LDA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# ---------------------------------------------------------------------------
# Toy corpus: fifteen templated ticker warnings followed by fourteen assorted
# news headlines. Only the headline text is modelled.
# ---------------------------------------------------------------------------
titles = [
    "Warning: HLVX is at high risk of performing badly",
    "Warning: NTBLQ is at high risk of performing badly",
    "Warning: MYNZ is at high risk of performing badly",
    "Warning: ZVSA is at high risk of performing badly",
    "Warning: CLDI is at high risk of performing badly",
    "Warning: AVTE is at high risk of performing badly",
    "Warning: IVVD is at high risk of performing badly",
    "Warning: CHRS is at high risk of performing badly",
    "Warning: TSBX is at high risk of performing badly",
    "Warning: LXEO is at high risk of performing badly",
    "Warning: LSB is at high risk of performing badly",
    "Warning: AZTR is at high risk of performing badly",
    "Warning: PRME is at high risk of performing badly",
    "Warning: LIPO is at high risk of performing badly",
    "Warning: ZNTL is at high risk of performing badly",
    "Who is Susie Wiles? Trump taps trusted campaign manager with a reputation for controlling his worst impulses to be chief of staff",
    "Red Rock Resorts, Inc. GAAP EPS of $0.48 beats by $0.11, revenue of $468M beats by $7.09M",
    "EMX Royalty reports Q3 results",
    "Eupraxia Pharmaceuticals Reports Third Quarter 2024 Financial Results",
    "Exclusive: Jack Dorsey’s Block ordered employees not to discuss board member Jay-Z amid more layoffs",
    "Pentagon chief says he has not changed position on Guantanamo Bay plea deals",
    "Taiwan's TSMC says US investment plan are unchanged after election",
    "DiamondRock Hospitality FFO of $0.26 in-line, revenue of $285.1M misses by $0.83M",
    "Regulus Therapeutics GAAP EPS of -$0.21 misses by $0.04",
    "aTyr Pharma GAAP EPS of -$0.23 misses by $0.01",
    "Turkcell Iletisim reports Q3 results",
    "Goldman Sachs BDC reports Q3 results",
    "Three charged in One Direction singer Liam Payne's death",
    "Labor dispute stops Canadian canola oil, forestry exports from West Coast",
]
data = {"title": titles}

# One row per headline, in a single "title" column.
df = pd.DataFrame(data)

# Bag-of-words term counts, with English stop words removed.
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["title"])

# Fit a 10-topic LDA model and obtain each headline's topic weights in one
# call; the seed is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
topic_distribution = lda.fit_transform(X)

# Column index of the largest weight in each row = that headline's main topic.
dominant_topics = topic_distribution.argmax(axis=1)

# Human-readable label for each topic id (0-9).
# NOTE(review): these names are attached to topic ids purely by position;
# nothing here checks them against the words the model actually grouped
# together, so a label may not describe its topic. Confirm against the fitted
# topics before relying on them.
domain_mapping = dict(enumerate([
    "Stock Warnings",
    "Corporate Governance",
    "Financial Performance",
    "Legal Issues",
    "Market Updates",
    "Investment News",
    "Earnings Reports",
    "Company Announcements",
    "Industry Trends",
    "Economic Indicators",
]))

# Translate each dominant topic id into its label and attach it as a column.
topic_ids = pd.Series(dominant_topics)
df["Domain"] = topic_ids.map(domain_mapping)

# Show the headlines alongside their assigned domains.
print(df)

                                                title                 Domain
15  Who is Susie Wiles? Trump taps trusted campaig...  Company Announcements
16  Red Rock Resorts, Inc. GAAP EPS of $0.48 beats...        Industry Trends
17                     EMX Royalty reports Q3 results         Market Updates
18  Eupraxia Pharmaceuticals Reports Third Quarter...        Investment News
19  Exclusive: Jack Dorsey’s Block ordered employe...        Investment News
20  Pentagon chief says he has not changed positio...  Financial Performance
21  Taiwan's TSMC says US investment plan are unch...         Market Updates
22  DiamondRock Hospitality FFO of $0.26 in-line, ...         Market Updates
23  Regulus Therapeutics GAAP EPS of -$0.21 misses...        Industry Trends
24     aTyr Pharma GAAP EPS of -$0.23 misses by $0.01    Economic Indicators
25               Turkcell Iletisim reports Q3 results        Industry Trends
26               Goldman Sachs BDC reports Q3 results         Market Updates

In [5]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Load the JSON file.
# The encoding is pinned to UTF-8 so that non-ASCII characters in headlines
# decode the same way regardless of the platform's default encoding.
# (Assumes the export is UTF-8, the standard encoding for JSON -- TODO confirm.)
with open('/content/business_news_Q42024.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Vectorize the headline text into term counts, dropping English stop words
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['title'])

# Map numerical topics to descriptive domain names.
# Defined before the model so the number of topics can be derived from it.
# NOTE(review): the names are attached to topic ids purely by position;
# nothing here checks them against the words the model actually grouped
# together. Inspect the fitted topics before trusting these labels.
domain_mapping = {
    0: "Stock Warnings",
    1: "Corporate Governance",
    2: "Financial Performance",
    3: "Legal Issues",
    4: "Market Updates",
    5: "Investment News",
    6: "Earnings Reports",
    7: "Company Announcements",
    8: "Industry Trends",
    9: "Economic Indicators",
    10: "Technology News",
    11: "Healthcare Updates",
    12: "Political Developments",
    13: "Environmental News",
    14: "Consumer Trends",
    15: "Global Economy",
    16: "Regulatory Changes",
    17: "Mergers & Acquisitions",
    18: "Innovation & Research",
    19: "Social Issues"
}

# Fit the LDA model with one component per label (20). Tying n_components to
# the mapping keeps the two from drifting apart, which would otherwise leave
# some topics without a label (NaN in the Domain column).
lda = LatentDirichletAllocation(n_components=len(domain_mapping), random_state=42)

# Fit and get the topic distribution for each document in one step
topic_distribution = lda.fit_transform(X)

# Determine the dominant topic for each document
dominant_topics = topic_distribution.argmax(axis=1)

# Add the descriptive domain names to the DataFrame.
# The Series is given df's own index: column assignment aligns on index
# labels, so a bare 0..n-1 Series would yield NaN for every row whenever the
# loaded frame does not carry a default integer index.
df['Domain'] = pd.Series(dominant_topics, index=df.index).map(domain_mapping)

# Display the DataFrame with classified domains
print(df)

              author                                              title  \
...              ...                                                ...   
56186        Reuters  Heard in Davos: What we learned from the WEF i...   
56187  Puneet Javeri  Stock Market Today: All You Need To Know Going...   
56188           None  Mediators aim to shore up fragile ceasefires i...   
56189    Chris Roush   Business Insider CEO Peng describes its strategy   
56190           None  Reeves to tell Labour MPs to back growth strategy   

                                             description  \
...                                                  ...   
56186  Heard in Davos: What we learned from the WEF i...   
56187  Stock Market Today: All You Need To Know Going...   
56188  Gazans to be allowed into the northern part of...   
56189  Status newsletter operator Oliver Darcy spoke ...   
56190  UK chancellor expected to support Heathrow air...   

                                                     