In [1]:
import nltk
from nltk.corpus import reuters

# Ensure that the Reuters corpus and the punkt resource are available locally
nltk.download('reuters')
nltk.download('punkt')

# Get the fileids (document identifiers)
file_ids = reuters.fileids()

# Maximum number of leading characters of each document shown in the preview
PREVIEW_CHARS = 500

# Display the first few documents and their categories
for file_id in file_ids[:5]:
    doc_text = reuters.raw(file_id)
    # Append the ellipsis only when the text was actually cut off, so that a
    # document shorter than the limit is not shown as if it had been truncated.
    ellipsis = "..." if len(doc_text) > PREVIEW_CHARS else ""
    print(f"File ID: {file_id}")
    print(f"Categories: {reuters.categories(file_id)}")
    print(f"Text: {doc_text[:PREVIEW_CHARS]}{ellipsis}")
    print("-" * 50)


[nltk_data] Downloading package reuters to
[nltk_data]     /Users/deepanshurao0001/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/deepanshurao0001/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


File ID: test/14826
Categories: ['trade']
Text: ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo...
--------------------------------------------------
File ID: test/14828
Categories: ['grain']
Text: CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of China's grain
  stocks, the China Daily said.
      It also said that each year 1.575 mln tonnes, or 25 pct, of
  China's fruit output are left to rot, and 2.1 mln tonnes, or up
  to 30 pct, of its veget

In [3]:
import nltk
import pandas as pd
from nltk.corpus import reuters

# Make sure the NLTK resources requested by this cell are present locally
nltk.download('reuters')
nltk.download('punkt')

# Every document in the corpus is addressed by its file ID
file_ids = reuters.fileids()

# One record per document: its ID, its category labels and its raw text
data = [
    {
        'file_id': doc_id,
        'categories': reuters.categories(doc_id),
        'text': reuters.raw(doc_id),
    }
    for doc_id in file_ids
]

# Build the DataFrame in a single step from the collected records
reuters_df = pd.DataFrame(data)

# Preview the first rows of the DataFrame
print(reuters_df.head())


[nltk_data] Downloading package reuters to
[nltk_data]     /Users/deepanshurao0001/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/deepanshurao0001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


      file_id                                      categories  \
0  test/14826                                         [trade]   
1  test/14828                                         [grain]   
2  test/14829                                [crude, nat-gas]   
3  test/14832  [corn, grain, rice, rubber, sugar, tin, trade]   
4  test/14833                             [palm-oil, veg-oil]   

                                                text  
0  ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...  
1  CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...  
2  JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...  
3  THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n  ...  
4  INDONESIA SEES CPO PRICE RISING SHARPLY\n  Ind...  


In [7]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load the dataset; the `remove` tuple asks for headers, footers and quoted
# text to be stripped from each post.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Convert to a DataFrame for easier viewing, mapping each integer target to
# its category name via `target_names`.
newsgroups_df = pd.DataFrame({
    'Text': newsgroups.data,
    'Category': [newsgroups.target_names[label] for label in newsgroups.target]
})

# Seed for the row sample below, so the same rows are shown on every run
SAMPLE_SEED = 42

# Display a sample of the data (seeded: reproducible under Restart & Run All)
print("Dataset Sample:")
print(newsgroups_df.sample(5, random_state=SAMPLE_SEED))

# Display unique categories
print("\nAvailable Categories:")
print(newsgroups.target_names)


Dataset Sample:
                                                    Text  \
6591   \nYou're assuming that their normal rotation c...   
8338   For sale by owners who were also the builders\...   
15866  I would be interested in hearing from anyone w...   
13279  \nI'm not 100% sure, but I think the T800 was ...   
14376  I have recently plunged into PC World. I have ...   

                       Category  
6591                  sci.space  
8338               misc.forsale  
15866           sci.electronics  
13279   comp.os.ms-windows.misc  
14376  comp.sys.ibm.pc.hardware  

Available Categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
