In [19]:
import dask.dataframe as dd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk
import re
import pandas as pd
from scipy.spatial.distance import cosine
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cdist
import numpy as np

In [None]:
# function for preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isna(text):  
        return ""
    text = text.lower()  
    text = re.sub(r'\W', ' ', text)  
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words and word.isalpha() and len(word) > 4]
    return ' '.join(words)

In [2]:
ddf = dd.read_parquet('nyt_data.parquet')

In [55]:
filtered_ddf = ddf[(ddf['year'] >= 1968) & (ddf['year'] <= 1968)]

filtered_ddf = filtered_ddf.reset_index(drop=True)

In [56]:
filtered_ddf['combined_text'] = filtered_ddf['title'] + filtered_ddf['excerpt']

In [57]:
filtered_ddf['processed_text'] = filtered_ddf['combined_text'].apply(preprocess_text, meta=('processed_text', 'str'))

## Topic modeling by LDA

In [58]:

stop_words = stopwords.words('english')
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stop_words, token_pattern=r'\b[a-zA-Z]+\b')


text_data = filtered_ddf['processed_text'].compute() 
dtm = vectorizer.fit_transform(text_data)


In [63]:
lda = LatentDirichletAllocation(n_components=30, random_state=0)
lda.fit(dtm)

In [64]:

words = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f"Theme {idx+1}:")
    print(" ".join([words[i] for i in topic.argsort()[-30:]]))

Theme 1:
black negro international american memorial medal germany tonight history tomorrow today series states yonkers champion round united mexico first jones games chess peter world night square olympic madison james garden
Theme 2:
annual winter states transport freight players rates contract major computer cargo pollution pacific american speed today plans meeting airlines offer world traffic united service african water lines bridge south africa
Theme 3:
nuptials programs billion foundation cities study training funds white research faculty college project plans state budget urban group community campus housing columbia negroes schools protest program student negro school students
Theme 4:
reporter family given addicts smith novel wilkins sutton quake producer merchant patrick slain flood burns cleaver slalom bodies kennedy missing illus freeman giant fiancee cooke lewis marine harlem london found
Theme 5:
action noted border korean offensive israel areas reported planes north ba

In [65]:
topics_lda = {}

# Loop over each topic generated by the LDA model
words = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    # Get the top 30 words for this topic
    top_words = [words[i] for i in topic.argsort()[-30:]]
    
    # Store the words in the dictionary under the topic index
    topics_lda[f"Theme {idx+1}"] = top_words

# Now you can print or access the words for each topic
for theme, words in topics_lda.items():
    print(f"{theme}: {' '.join(words)}")

Theme 1: black negro international american memorial medal germany tonight history tomorrow today series states yonkers champion round united mexico first jones games chess peter world night square olympic madison james garden
Theme 2: annual winter states transport freight players rates contract major computer cargo pollution pacific american speed today plans meeting airlines offer world traffic united service african water lines bridge south africa
Theme 3: nuptials programs billion foundation cities study training funds white research faculty college project plans state budget urban group community campus housing columbia negroes schools protest program student negro school students
Theme 4: reporter family given addicts smith novel wilkins sutton quake producer merchant patrick slain flood burns cleaver slalom bodies kennedy missing illus freeman giant fiancee cooke lewis marine harlem london found
Theme 5: action noted border korean offensive israel areas reported planes north ba

## Topic modeling by LSI

In [72]:
# Preprocess to the text data
filtered_ddf['processed_text'] = filtered_ddf['combined_text'].apply(preprocess_text, meta=('processed_text', 'str'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/clementine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Use TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_ddf['processed_text'])

In [50]:
n_topics = 10  # Number of opics
svd_model = TruncatedSVD(n_components=n_topics, random_state=42)
lsi_matrix = svd_model.fit_transform(tfidf_matrix)

In [53]:
# Display the top terms for each topics and store them
topics_lsi = {}

terms = tfidf_vectorizer.get_feature_names_out()
for i, topic in enumerate(svd_model.components_):

    # Take the 20 best terms
    top_terms = [terms[j] for j in topic.argsort()[-20:]]
    
    # Store the terms
    topics_lsi[f"Topic {i+1}"] = top_terms

for topic, terms in topics_lsi.items():
    print(f"{topic}: {' '.join(terms)}")

Topic 1: picture takes state world today birth notice named letter editor illus marriage announcement elected front editorial cartoon obituary article title
Topic 2: chicago program school record american yesterday washington plans world eisenhower soviet group named million nixon first president state today kennedy
Topic 3: betrothed smith judith debutante harvard barbara susan william future becomes robert prospective fiancee marry alumna graduate student bride engaged married
Topic 4: charles smith susan betrothed jersey harvard obituary marriage fiancee robert alumna graduate becomes future student william marry prospective engaged bride
Topic 5: election article stevenson backs elect rockefeller humphrey state party child senator eisenhower named campaign engaged johnson bride president nixon kennedy
Topic 6: center repts elects david children william fiancee world record khrushchev robert daughter title plans output soviet proceedings million obituary child
Topic 7: heads executi

## Find title for topic

In [12]:
preprocessed_sentences = filtered_ddf['processed_text'].apply(simple_preprocess)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('processed_text', 'object'))



In [None]:
# Train a Word2Vec model and save it
filtered_ddf = ddf[(ddf['year'] >= 1960) & (ddf['year'] <= 2000)]

filtered_ddf = filtered_ddf.reset_index(drop=True)
filtered_ddf['combined_text'] = filtered_ddf['title'] + filtered_ddf['excerpt']

filtered_ddf['processed_text'] = filtered_ddf['combined_text'].apply(preprocess_text, meta=('processed_text', 'str'))

preprocessed_sentences = filtered_ddf['processed_text'].apply(lambda x: x.split())
preprocessed_sentences = preprocessed_sentences.compute().tolist()
model = Word2Vec(sentences=preprocessed_sentences, vector_size=100, window=5, min_count=1, workers=4)
model.wv.save_word2vec_format("word2vec.bin", binary=True)

In [76]:
# Load the model
model = KeyedVectors.load_word2vec_format("word2vec.bin", binary=True)

In [95]:
# Find a word that summarizes the themes found above
def find_theme(list_words, model):
    vectors = []
    # Caculate the average vector for the theme
    for w in list_words:
        if w in model:
            vectors.append(model[w])
    if vectors:
        avg_vec = sum(vectors) / len(vectors)
    else:
        avg_vec = None
        
    # Find the closest word
    closest_words = model.most_similar(positive=[avg_vec], topn=1)

    closest_w, sim_max = closest_words[0] 
    return closest_w

In [96]:
# Print the topics found
for t, ws in topics_lda.items():
    print(t)
    print("Terms")
    print(ws)
    theme = find_theme(ws, model)
    print(f"Theme word : {theme}")
    

Theme 1
Terms
['black', 'negro', 'international', 'american', 'memorial', 'medal', 'germany', 'tonight', 'history', 'tomorrow', 'today', 'series', 'states', 'yonkers', 'champion', 'round', 'united', 'mexico', 'first', 'jones', 'games', 'chess', 'peter', 'world', 'night', 'square', 'olympic', 'madison', 'james', 'garden']
night
Theme word : night
Theme 2
Terms
['annual', 'winter', 'states', 'transport', 'freight', 'players', 'rates', 'contract', 'major', 'computer', 'cargo', 'pollution', 'pacific', 'american', 'speed', 'today', 'plans', 'meeting', 'airlines', 'offer', 'world', 'traffic', 'united', 'service', 'african', 'water', 'lines', 'bridge', 'south', 'africa']
carriers
Theme word : carriers
Theme 3
Terms
['nuptials', 'programs', 'billion', 'foundation', 'cities', 'study', 'training', 'funds', 'white', 'research', 'faculty', 'college', 'project', 'plans', 'state', 'budget', 'urban', 'group', 'community', 'campus', 'housing', 'columbia', 'negroes', 'schools', 'protest', 'program', 's

moscow
Theme word : moscow
Theme 27
Terms
['broadway', 'boycott', 'accord', 'powell', 'walkout', 'schools', 'drivers', 'state', 'talks', 'members', 'shanker', 'return', 'teacher', 'strikes', 'ocean', 'labor', 'washington', 'unions', 'employes', 'children', 'local', 'contract', 'workers', 'school', 'proceedings', 'dispute', 'theater', 'teachers', 'union', 'strike']
unions
Theme word : unions
Theme 28
Terms
['island', 'nancy', 'championships', 'marry', 'bridge', 'takes', 'susan', 'second', 'fiance', 'daughter', 'amateur', 'betrothed', 'australia', 'david', 'event', 'victor', 'world', 'becomes', 'tournament', 'first', 'championship', 'round', 'final', 'married', 'tennis', 'women', 'states', 'united', 'bride', 'today']
tourna
Theme word : tourna
Theme 29
Terms
['credit', 'europe', 'occupation', 'charges', 'money', 'reserves', 'holds', 'monetary', 'countries', 'czechs', 'curbs', 'dollar', 'treaty', 'affairs', 'abroad', 'press', 'moscow', 'invasion', 'nations', 'nuclear', 'policy', 'reserve'