In [11]:
# necessary imports
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from pprint import pprint
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import matplotlib.pyplot as plt
import nltk
from collections import defaultdict
import numpy as np

In [12]:
# Run the code cells of ../scripts/utils.ipynb inside this kernel.
# Helpers called further down (estimate_release_year, text_split_new,
# bag_of_words, lda_new, get_topics_new, extract_new) are not defined in this
# notebook, so they presumably come from utils.ipynb -- verify.
import nbformat
from IPython.core.interactiveshell import InteractiveShell

# Decode the notebook explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which differs between operating systems.
with open('../scripts/utils.ipynb', encoding='utf-8') as f:
    nb = nbformat.read(f, as_version=4)

# Obtain the InteractiveShell instance used to execute the cells
shell = InteractiveShell.instance()

# Execute the code cells in notebook order; non-code cells are skipped.
# NOTE(review): run_cell seems to report a failing cell by displaying the
# traceback rather than raising, so a broken helper would only surface later
# as a NameError -- confirm against the IPython documentation.
for cell in nb.cells:
    if cell.cell_type == 'code':
        shell.run_cell(cell.source)

In [13]:
# Load the two tab-separated input tables used throughout the notebook:
# the plot summaries and the merged movie table.
plot_summary_df = pd.read_csv('plot_summaries.txt', delimiter='\t')
merged_df = pd.read_csv('merged_df.tsv', delimiter='\t')

Grouping the summaries of the plot_summaries dataset by decade

In [14]:

# Estimate a release year for every film (including those without release
# data) and derive the decade it belongs to.
mean_release_year_by_genre = pd.read_csv('mean_release_year_by_genre.tsv', delimiter='\t')

# estimate_release_year is defined outside this cell; it is called once per row
# and additionally receives the per-genre table loaded above.
estimated_release = merged_df.apply(
    estimate_release_year,
    axis=1,
    args=(mean_release_year_by_genre,),
)

# Keep only the year; values that cannot be parsed as a date are coerced to
# missing instead of raising.
# NOTE(review): the format returned by estimate_release_year is not visible
# here -- confirm that pd.to_datetime parses every value as intended.
merged_df['Estimated_release_year'] = pd.to_datetime(estimated_release, errors='coerce').dt.year

# Round each year down to the first year of its decade (e.g. 1987 -> 1980)
release_year = merged_df['Estimated_release_year']
merged_df['Decade'] = (release_year // 10) * 10

In [15]:
# Decade start years (1900, 1910, ..., 2010). They are kept as floats so the
# dictionary keys built below stay the same as before (1900.0, 1910.0, ...).
decades = np.arange(1900, 2020, 10).astype(float)

# Map every decade to the Wikipedia ids of its movies.
# dict.fromkeys drops repeated ids while keeping their first-seen order.
movies_ids_per_decade = {}
for decade in decades:
    ids_in_decade = merged_df.loc[merged_df['Decade'] == decade, 'Wikipedia_movie_ID'].to_list()
    movies_ids_per_decade[decade] = list(dict.fromkeys(ids_in_decade))

# Build the movie id -> plot summary lookup once, instead of filtering the
# whole plot_summary_df again for every single movie id.
# - rows without a movie id can never match an id, so they are dropped;
# - when an id occurs several times the first row wins, which matches taking
#   element [0] of the filtered rows.
first_summary_by_id = (
    plot_summary_df
    .dropna(subset=['movie_id'])
    .drop_duplicates(subset='movie_id', keep='first')
    .set_index('movie_id')['plot_summary']
    .to_dict()
)

# Map every decade to the list of plot summaries of its movies; movies without
# an entry in plot_summary_df are skipped.
summaries_per_decade = {}
for decade in decades:
    summaries_per_decade[decade] = [
        first_summary_by_id[movie_id]
        for movie_id in movies_ids_per_decade[decade]
        if movie_id in first_summary_by_id
    ]

In [16]:
# Persist the summaries: one line per decade, "<decade>: <list of summaries>".
# The encoding is set explicitly because plot summaries can contain non-ASCII
# characters that the platform default encoding may be unable to write, and
# because the file is read back later with pd.read_csv, which decodes UTF-8
# by default.
with open('summaries_per_decade.txt', 'w', encoding='utf-8') as out_file:
    for decade_key, decade_summaries in summaries_per_decade.items():
        out_file.write(f'{decade_key}: {decade_summaries}\n')

Applying LDA to the Wikipedia timeline

In [4]:
# First pass over the Wikipedia timeline: print the most used words per decade.

# One text file per decade
paths = [
    'wikipedia_timeline/1900s.txt',
    'wikipedia_timeline/1910s.txt',
    'wikipedia_timeline/1920s.txt',
    'wikipedia_timeline/1930s.txt',
    'wikipedia_timeline/1940s.txt',
    'wikipedia_timeline/1950s.txt',
    'wikipedia_timeline/1960s.txt',
    'wikipedia_timeline/1970s.txt',
    'wikipedia_timeline/1980s.txt',
    'wikipedia_timeline/1990s.txt',
    'wikipedia_timeline/2000s.txt',
    'wikipedia_timeline/2010s.txt',
]

for path in paths:
    # File name without directory and extension, e.g. '1900s'
    file_name = path.split('/')[-1]
    decade_name = file_name.split('.')[0]

    # Load the file as a header-less, tab-separated table and flatten it back
    # into a single string
    decade = pd.read_csv(path, delimiter='\t', header=None).to_string(index=False, header=False)

    # Text -> tokens -> bag of words -> LDA model
    # (all four helpers are defined outside this cell)
    splitted_decade = text_split_new(decade)
    bag_decade = bag_of_words(splitted_decade)
    lda_decade = lda_new(bag_decade)

    # Show the topics, plus the type of the model's topic matrix
    print(f"Most used words for the decade {decade_name}: ",get_topics_new(lda_decade))
    print("the type of frequencies ", type(lda_decade.get_topics()))

Most used words for the decade 1900s:  [(0, '0.010*"W" + 0.007*"httpwww" + 0.004*"Machine" + 0.004*"Wayback" + 0.004*"Population" + 0.004*"V" + 0.003*"History" + 0.003*"orld" + 0.003*"T" + 0.003*"ISBN" + 0.003*"Data" + 0.003*"flight" + 0.002*"estimate" + 0.002*"de" + 0.002*"States" + 0.002*"s" + 0.002*"Sheet" + 0.002*"Haub" + 0.002*"machine" + 0.002*"Empire" + 0.002*"decade" + 0.002*"Fessenden" + 0.002*"US" + 0.002*"f" + 0.002*"engine" + 0.002*"p" + 0.002*"pp" + 0.002*"http" + 0.002*"Company" + 0.002*"Edison" + 0.002*"people" + 0.002*"air" + 0.002*"m" + 0.001*"Popular" + 0.001*"Great" + 0.001*"Years" + 0.001*"World" + 0.001*"time" + 0.001*"Panama" + 0.001*"device" + 0.001*"production" + 0.001*"population" + 0.001*"S" + 0.001*"world" + 0.001*"paper" + 0.001*"car" + 0.001*"years" + 0.001*"part" + 0.001*"earthquake" + 0.001*"httpswww"')]
the type of frequencies  <class 'numpy.ndarray'>
Most used words for the decade 1910s:  [(0, '0.015*"W" + 0.007*"ar" + 0.007*"World" + 0.005*"V" + 0.005*

In [6]:
# Second pass over the Wikipedia timeline: store the topic words and their
# weights in a decade x word table and save it as CSV.

# decade name -> {word -> weight}
word_frequencies = defaultdict(lambda: defaultdict(int))

for path in paths:
    # File name without directory and extension, e.g. '1900s'
    file_name = path.split('/')[-1]
    decade_name = file_name.split('.')[0]

    # Load the file as a header-less, tab-separated table and flatten it back
    # into a single string
    decade = pd.read_csv(path, delimiter='\t', header=None).to_string(index=False, header=False)

    # Text -> tokens -> bag of words -> LDA model (helpers defined elsewhere)
    splitted_decade = text_split_new(decade)
    bag_decade = bag_of_words(splitted_decade)
    lda_decade = lda_new(bag_decade)

    # extract_new turns the topic description returned by get_topics_new into
    # (word, frequency) pairs
    topics_str = get_topics_new(lda_decade)
    topics = extract_new(topics_str)

    # Record the weight of every topic word for this decade
    for word, freq in topics:
        word_frequencies[decade_name][word] = freq

# Decades as rows, words as columns; a word absent from a decade gets 0
df_word_frequencies = pd.DataFrame(word_frequencies).fillna(0).T

# Preview the first rows
print(df_word_frequencies.head())

# Save the table as CSV
df_word_frequencies.to_csv("word_frequencies_wikipedia.csv")

           W  httpwww  Machine  Wayback  Population      V  History   orld  \
1900s  0.010    0.007    0.004    0.004       0.004  0.004    0.003  0.003   
1910s  0.015    0.000    0.000    0.000       0.000  0.005    0.000  0.002   
1920s  0.012    0.002    0.000    0.000       0.000  0.002    0.002  0.000   
1930s  0.012    0.002    0.000    0.000       0.000  0.003    0.001  0.000   
1940s  0.013    0.000    0.000    0.000       0.000  0.002    0.000  0.002   

           T   ISBN  ...  Reuters  fire  ISSN  Wii  Iraq  CNN  Death  \
1900s  0.003  0.003  ...      0.0   0.0   0.0  0.0   0.0  0.0    0.0   
1910s  0.003  0.002  ...      0.0   0.0   0.0  0.0   0.0  0.0    0.0   
1920s  0.003  0.003  ...      0.0   0.0   0.0  0.0   0.0  0.0    0.0   
1930s  0.003  0.002  ...      0.0   0.0   0.0  0.0   0.0  0.0    0.0   
1940s  0.000  0.000  ...      0.0   0.0   0.0  0.0   0.0  0.0    0.0   

       Nintendo  Country  Philippines  
1900s       0.0      0.0          0.0  
1910s       0.0   

Applying LDA to summaries_per_decade (the plot_summaries dataset grouped by decade, cf. above)

In [17]:
# Read summaries_per_decade.txt back: the file has no header row and is parsed
# as tab-separated, so the text of each line is taken from column 0.
summaries_per_decade = pd.read_csv('summaries_per_decade.txt', delimiter='\t', header=None)

# Column 0, looked up by row label 0..11 (one row per decade, 1900s to 2010s)
decade_texts = summaries_per_decade[0]

# One variable per decade
(
    summaries_1900s,
    summaries_1910s,
    summaries_1920s,
    summaries_1930s,
    summaries_1940s,
    summaries_1950s,
    summaries_1960s,
    summaries_1970s,
    summaries_1980s,
    summaries_1990s,
    summaries_2000s,
    summaries_2010s,
) = (decade_texts[row] for row in range(12))



In [None]:
# Texts of the plot summaries, one entry per decade in chronological order
decades = [
    summaries_1900s,
    summaries_1910s,
    summaries_1920s,
    summaries_1930s,
    summaries_1940s,
    summaries_1950s,
    summaries_1960s,
    summaries_1970s,
    summaries_1980s,
    summaries_1990s,
    summaries_2000s,
    summaries_2010s,
]

# Label of each decade, in the same order as `decades`
decade_name = list(range(1900, 2020, 10))

# Walk through labels and texts together
for decade_label, decade in zip(decade_name, decades):
    # Text -> tokens -> bag of words -> LDA model (helpers defined elsewhere)
    splitted_decade = text_split_new(decade)
    bag_decade = bag_of_words(splitted_decade)
    lda_decade = lda_new(bag_decade)

    # Show the topics, plus the type of the model's topic matrix
    print(f"Most used words for the decade {decade_label}: ",get_topics_new(lda_decade))
    print("the type of frequencies ", type(lda_decade.get_topics()))

Most used words for the decade 1900:  [(0, '0.017*"film" + 0.008*"Pedro" + 0.007*"girl" + 0.006*"Mr" + 0.006*"travelers" + 0.006*"astronomers" + 0.005*"submarine" + 0.005*"shot" + 0.005*"Hyde" + 0.005*"man" + 0.005*"Jekyll" + 0.005*"train" + 0.005*"police" + 0.004*"version" + 0.004*"front" + 0.004*"scene" + 0.004*"journey" + 0.004*"border" + 0.004*"help" + 0.004*"town" + 0.004*"capsule" + 0.004*"woman" + 0.004*"ice" + 0.004*"Ned" + 0.004*"space" + 0.004*"time" + 0.004*"sequence" + 0.004*"moves" + 0.004*"Selenite" + 0.003*"bar" + 0.003*"attacks" + 0.003*"ground" + 0.003*"Selenites" + 0.003*"rescues" + 0.003*"proposes" + 0.003*"board" + 0.003*"Henri" + 0.003*"automobile" + 0.003*"rescuer" + 0.003*"Story" + 0.003*"gang" + 0.003*"women" + 0.003*"thug" + 0.003*"Alice" + 0.003*"home" + 0.003*"sun" + 0.003*"customer" + 0.003*"begins" + 0.003*"capture" + 0.003*"sheriff"')]
the type of frequencies  <class 'numpy.ndarray'>
Most used words for the decade 1910:  [(0, '0.005*"man" + 0.005*"love" + 

In [19]:
# Store the topic words of the plot summaries and their weights in a
# decade x word table and save it as CSV.
# This cell expects `decades` to be the list of per-decade texts and
# `decade_name` the list of matching labels.

# decade label -> {word -> weight}
word_frequencies_summaries = defaultdict(lambda: defaultdict(int))

for decade_label, decade in zip(decade_name, decades):
    # Text -> tokens -> bag of words -> LDA model (helpers defined elsewhere)
    splitted_decade = text_split_new(decade)
    bag_decade = bag_of_words(splitted_decade)
    lda_decade = lda_new(bag_decade)

    # extract_new turns the topic description returned by get_topics_new into
    # (word, frequency) pairs
    topics_str = get_topics_new(lda_decade)
    topics = extract_new(topics_str)

    # Record the weight of every topic word for this decade
    for word, freq in topics:
        word_frequencies_summaries[decade_label][word] = freq

# Decades as rows, words as columns; a word absent from a decade gets 0.
# NOTE(review): this rebinds df_word_frequencies, the name that also holds the
# Wikipedia table built earlier in the notebook.
df_word_frequencies = pd.DataFrame(word_frequencies_summaries).fillna(0).T

# Preview the first rows
print(df_word_frequencies.head())

# Save the table as CSV
df_word_frequencies.to_csv("word_frequencies_summaries.csv")

       film  Pedro   girl  travelers     Mr  astronomers   shot    man  train  \
1900  0.017  0.008  0.007      0.006  0.006        0.006  0.005  0.005  0.005   
1910  0.004  0.000  0.004      0.000  0.000        0.000  0.000  0.005  0.000   
1920  0.004  0.000  0.002      0.000  0.000        0.000  0.000  0.005  0.000   
1930  0.002  0.000  0.000      0.000  0.002        0.000  0.000  0.003  0.000   
1940  0.002  0.000  0.000      0.000  0.000        0.000  0.000  0.003  0.001   

       Hyde  ...  friends  kills  meet  school  body  attempts  ends  reveals  \
1900  0.005  ...      0.0    0.0   0.0     0.0   0.0       0.0   0.0      0.0   
1910  0.000  ...      0.0    0.0   0.0     0.0   0.0       0.0   0.0      0.0   
1920  0.000  ...      0.0    0.0   0.0     0.0   0.0       0.0   0.0      0.0   
1930  0.000  ...      0.0    0.0   0.0     0.0   0.0       0.0   0.0      0.0   
1940  0.000  ...      0.0    0.0   0.0     0.0   0.0       0.0   0.0      0.0   

      team  people  
1900 