In [32]:
# necessary imports
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from pprint import pprint
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import matplotlib.pyplot as plt
import nltk
from collections import defaultdict

In [33]:
#Functions used in LDA

def text_split_new(text, lowercase=False):
    """Return the nouns found in ``text``, in order of appearance.

    The text is stripped of punctuation, tokenised with ``word_tokenize`` and
    every token is part-of-speech tagged on its own (without sentence
    context) with ``pos_tag``. Only tokens tagged ``'NN'`` or ``'NNS'`` are kept.

    Parameters
    ----------
    text : str
        Raw text to process.
    lowercase : bool, optional
        When True the kept tokens are lower-cased, so that e.g. "Machine" and
        "machine" end up as the same word. Defaults to False, which returns
        the tokens with their original casing.

    Returns
    -------
    list of str
        The kept tokens; duplicates are preserved.
    """
    #Tags that are kept: singular and plural nouns
    noun_tags = ('NN', 'NNS')

    #Remove from text every character that is neither a word character nor
    #whitespace. The character is deleted, not replaced by a space, so
    #"a.b" becomes "ab".
    text = re.sub(r'[^\w\s]', '', text)

    #Tokenise the text in words (casing is left untouched at this stage)
    words = word_tokenize(text)

    #A repeated word always produces the same one-element input for pos_tag,
    #so the tag is looked up once per distinct word and then reused.
    tag_by_word = {}
    text_splited = []

    for word in words:
        pos = tag_by_word.get(word)
        if pos is None:
            #Find the grammatical class (pos) of the word taken in isolation
            pos = pos_tag([word])[0][1]
            tag_by_word[word] = pos

        #Keep only the nouns (singular and plural)
        if pos in noun_tags:
            text_splited.append(word.lower() if lowercase else word)

    return text_splited

def bag_of_words(text_splited):
    """Turn one tokenised document into its bag-of-words representation.

    Parameters
    ----------
    text_splited : list of str
        Tokens of a single document.

    Returns
    -------
    tuple
        ``(corpus, dictionary)`` where ``dictionary`` is the ``Dictionary``
        built from this single document and ``corpus`` is the result of
        ``dictionary.doc2bow(text_splited)``.
    """
    #The Dictionary is built from a corpus containing only this document
    vocabulary = Dictionary([text_splited])

    #Bag of words of the document, expressed with the ids of that Dictionary
    bow = vocabulary.doc2bow(text_splited)

    return bow, vocabulary

def lda(df_line, random_state=42):
    """Fit a single-topic LDA model on one bag-of-words document.

    Parameters
    ----------
    df_line : tuple
        ``(corpus, dictionary)`` pair, as returned by ``bag_of_words``.
    random_state : int or None, optional
        Seed forwarded to ``LdaModel`` so that repeated runs on the same input
        give the same word weights. Pass None to leave the model unseeded.

    Returns
    -------
    LdaModel
        The trained model (``num_topics=1``).
    """
    corpus, dictionary = df_line  # Unpacking the tuple directly
    #The single document is wrapped in a list to form a one-document corpus
    lda_model = LdaModel(
        [corpus],
        num_topics=1,
        id2word=dictionary,
        random_state=random_state,
    )
    return lda_model

def get_topics(lda_model):
    """Return the formatted description of one topic of ``lda_model``.

    Calls ``lda_model.show_topics`` asking for one topic, 50 words and the
    formatted (string) representation, and returns its result unchanged.
    """
    formatted_topics = lda_model.show_topics(
        formatted=True,
        num_words=50,
        num_topics=1,
    )
    return formatted_topics

def extract(topics_tuple):
    """Parse formatted topic strings into ``(word, weight)`` pairs.

    Parameters
    ----------
    topics_tuple : iterable of (topic_id, str)
        Pairs whose second element contains fragments of the form
        ``0.006*"World"``.

    Returns
    -------
    list of (str, float)
        One ``(word, weight)`` tuple per fragment, in order of appearance.
        The topic id is ignored, so all topics share the same flat list.
    """
    #A decimal number, a literal '*', then the word between double quotes
    weighted_word = re.compile(r'(\d+\.\d+)\*"(.*?)"')
    return [
        (term, float(weight))
        for _, description in topics_tuple
        for weight, term in weighted_word.findall(description)
    ]

In [34]:
#First pass over the files: print the most used words of each decade

#NOTE(review): the list has no 1940s entry - confirm whether that is intentional
paths = [
    f'data/wikipedia_timeline/{label}.txt'
    for label in ('1900s', '1910s', '1920s', '1930s', '1950s', '1960s',
                  '1970s', '1980s', '1990s', '2000s', '2010s')
]

for path in paths:
    #File name without its directory and extension, e.g. '1900s'
    decade_name = path.split('/')[-1].split('.')[0]

    #Read the tab-separated file and flatten it into one single string
    decade = pd.read_csv(path, delimiter='\t', header=None).to_string(index=False, header=False)

    #Nouns -> bag of words -> single-topic LDA model
    splitted_decade = text_split_new(decade)
    bag_decade = bag_of_words(splitted_decade)
    lda_decade = lda(bag_decade)

    print(f"Most used words for the decade {decade_name}: ", get_topics(lda_decade))
    print("the type of frequencies ", type(lda_decade.get_topics()))

Most used words for the decade 1900s:  [(0, '0.006*"World" + 0.005*"Machine" + 0.005*"Wayback" + 0.005*"Population" + 0.004*"History" + 0.003*"ISBN" + 0.003*"Data" + 0.003*"estimate" + 0.003*"flight" + 0.003*"Sheet" + 0.003*"Haub" + 0.003*"machine" + 0.002*"Empire" + 0.002*"States" + 0.002*"engine" + 0.002*"Fessenden" + 0.002*"US" + 0.002*"m" + 0.002*"pp" + 0.002*"de" + 0.002*"p" + 0.002*"Company" + 0.002*"Edison" + 0.002*"air" + 0.002*"Years" + 0.002*"decade" + 0.002*"Panama" + 0.002*"html" + 0.002*"http" + 0.002*"device" + 0.002*"Popular" + 0.002*"Time" + 0.001*"time" + 0.001*"developed" + 0.001*"Worlds" + 0.001*"earthquake" + 0.001*"car" + 0.001*"population" + 0.001*"httpsAvebarchive" + 0.001*"production" + 0.001*"paper" + 0.001*"Great" + 0.001*"Vol" + 0.001*"Radio" + 0.001*"radio" + 0.001*"estimates" + 0.001*"phonograph" + 0.001*"flights" + 0.001*"years" + 0.001*"built"')]
the type of frequencies  <class 'numpy.ndarray'>
Most used words for the decade 1910s:  [(0, '0.009*"World" + 

In [35]:
# Nested mapping: decade name -> word -> weight reported by the LDA model
word_frequencies = defaultdict(lambda: defaultdict(int))

for path in paths:
    # The decade is the file name without directory and extension
    decade_name = path.split('/')[-1].split('.')[0]

    # Read the tab-separated file and flatten it into one single string
    decade = pd.read_csv(path, delimiter='\t', header=None).to_string(index=False, header=False)

    # Nouns -> bag of words -> single-topic LDA model
    splitted_decade = text_split_new(decade)
    bag_decade = bag_of_words(splitted_decade)
    lda_decade = lda(bag_decade)

    # get_topics() returns formatted strings; extract() parses them into
    # (word, weight) pairs
    topics_str = get_topics(lda_decade)
    topics = extract(topics_str)

    # Record the weight of every word for this decade
    for word, freq in topics:
        word_frequencies[decade_name][word] = freq

# Build the table with decades as rows and words as columns; a word absent
# from a decade gets 0
df_word_frequencies = pd.DataFrame(word_frequencies).fillna(0).T

# Display the first rows of the table
print(df_word_frequencies.head())

# Save the table in a csv file
df_word_frequencies.to_csv("word_frequencies.csv")

       World  Machine  Wayback  Population  History   ISBN   Data  estimate  \
1900s  0.006    0.005    0.005       0.005    0.004  0.003  0.003     0.003   
1910s  0.009    0.000    0.000       0.000    0.000  0.002  0.000     0.000   
1920s  0.003    0.000    0.000       0.000    0.003  0.005  0.000     0.000   
1930s  0.005    0.000    0.000       0.000    0.002  0.004  0.000     0.000   
1950s  0.003    0.000    0.000       0.000    0.004  0.002  0.000     0.000   

       flight  Sheet  ...  damage  Event  toll  Country  Date  Description  \
1900s   0.003  0.003  ...     0.0    0.0   0.0      0.0   0.0          0.0   
1910s   0.000  0.000  ...     0.0    0.0   0.0      0.0   0.0          0.0   
1920s   0.000  0.000  ...     0.0    0.0   0.0      0.0   0.0          0.0   
1930s   0.000  0.000  ...     0.0    0.0   0.0      0.0   0.0          0.0   
1950s   0.000  0.000  ...     0.0    0.0   0.0      0.0   0.0          0.0   

        Al  Typhoon  Sudan  Syria  
1900s  0.0      0.0 