In [1]:
import nltk
import re
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Load stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Preprocess the input text by removing punctuation, converting to lowercase, tokenizing, and removing stop words.

    Parameters:
    text (str): The input text to preprocess.

    Returns:
    list: A list of filtered tokens.
    """
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = word_tokenize(text)  # Tokenize text
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    return filtered_tokens

def extract_ngrams(tokens, n):
    """
    Extract n-grams from a list of tokens.

    Parameters:
    tokens (list): A list of tokens.
    n (int): The number of words in each n-gram.

    Returns:
    list: A list of n-grams.
    """
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]

def identify_common_phrases(text, ngram_range=(2, 3), top_n=10):
    """
    Identify common phrases in the input text by extracting n-grams and analyzing their frequencies.

    Parameters:
    text (str): The input text.
    ngram_range (tuple): The range of n-grams to extract (min_n, max_n).
    top_n (int): The number of top n-grams to return.

    Returns:
    dict: A dictionary with n-grams and their frequencies.
    """
    filtered_tokens = preprocess_text(text)
    common_phrases = {}

    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngrams = extract_ngrams(filtered_tokens, n)
        ngram_freq = Counter(ngrams)
        common_phrases[n] = ngram_freq.most_common(top_n)

    return common_phrases

# Load the 20 Newsgroups dataset
newsgroups_data = fetch_20newsgroups(subset='train', categories=['sci.space', 'comp.graphics'])
dataset_text = ' '.join(newsgroups_data.data)  # Combine all text into a single string

# Identify common phrases in the dataset
common_phrases = identify_common_phrases(dataset_text)

# Display common phrases
print("\nCommon Phrases:")
for n, phrases in common_phrases.items():
    print(f"\nTop {n}-grams:")
    for phrase, freq in phrases:
        print(f'{phrase}: {freq}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



Common Phrases:

Top 2-grams:
organization university: 216
distribution world: 145
writes article: 116
space station: 101
henry spencer: 96
world nntppostinghost: 89
_ _: 83
prbaccessdigexcom pat: 81
space shuttle: 76
dont know: 74

Top 3-grams:
distribution world nntppostinghost: 89
u toronto zoology: 67
henryzootorontoedu henry spencer: 55
xnewsreader tin version: 51
tin version 11: 51
_ _ _: 48
organization express access: 43
express access online: 43
access online communications: 43
baalkekelvinjplnasagov ron baalke: 38
