In [7]:
import numpy as np
from langchain_ollama import OllamaLLM

llm = OllamaLLM(model="llama3.2")

for decade in np.arange(1900,2010,10):

    with open(f'src/data/movies_summaries/summaries_decade_{decade}.txt',"r") as file:
        string=file.read()
    summaries=string.splitlines()
    print(len(summaries))

13
211
454
1120
1144
1335
1111
1305
2076
3260
7001


In [10]:
with open(f'src/data/movies_summaries/summaries_decade_1910.txt',"r") as file:
    string=file.read()
summaries=string.splitlines()
length=len(summaries)
selected=summaries[4::5]
print(len(selected))
print(len(summaries))

42
211


In [11]:
from torch.quantization import quantize_dynamic
from sentence_transformers import SentenceTransformer, util, InputExample, losses
import pandas as pd

In [12]:
# Load SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

In [13]:
import nltk

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mariannebenard/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.cluster import MiniBatchKMeans
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, silhouette_score
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mariannebenard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Function to read and preprocess text data
def read_and_preprocess(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        text=text.replace('\n','')
        split_text=text.split("See also",-1) # remove the references
        text=split_text[0]
    sentences = nltk.sent_tokenize(text)  # Tokenize into sentences
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Load dataset
movies_sentences = read_and_preprocess('src/data/movies_summaries/summaries_decade_1940')

In [15]:
# Encode sentences into embeddings
movies_embeddings = model.encode(movies_sentences, batch_size=32, convert_to_tensor=True)

In [18]:
best_clust_nums=np.zeros(20)

num_clusters = np.arange(1,20,1)+1
sil_scores= np.zeros(len(num_clusters))
for i in np.arange(len(num_clusters)):
  cluster_model = MiniBatchKMeans(n_clusters=num_clusters[i])
  cluster_labels = cluster_model.fit_predict(movies_embeddings.cpu().numpy())
  sil_score = silhouette_score(movies_embeddings.cpu().numpy(), cluster_labels)
  sil_scores[i]= sil_score
best_clust_num=num_clusters[np.argmax(sil_scores)] # number of clusters with the best silhouette score


In [19]:
print(best_clust_num)

4


In [21]:
cluster_model = MiniBatchKMeans(n_clusters=best_clust_num)
movies_clusters = cluster_model.fit_predict(movies_embeddings.cpu().numpy())

In [22]:
# Cluster sentences from history into semantic fields
clustered_movies = {i: [] for i in range(max(movies_clusters) + 1)}
for idx, cluster_id in enumerate(movies_clusters):
    clustered_movies[cluster_id].append(movies_sentences[idx])

In [25]:
for idx, cluster in clustered_movies.items():
    print(cluster)
    print(len(cluster))

3015
4468
3060
4937


In [33]:
cluster_1=''
for sentence in clustered_movies[1]:
    cluster_1+=sentence
print(cluster_1)
print(len(cluster_1))

496531


In [30]:
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain_ollama import OllamaLLM

In [35]:


llm = OllamaLLM(model="llama3.2")
# Initialize memory to keep the conversation context
memory = ConversationBufferMemory()

conversation = ConversationChain(llm=llm, memory=memory)

# Create a conversational chain with the model and memory

response1=conversation.run(cluster_1)
print(response1)

response2=conversation.run("extract the main recurring characteristics of the characters from those sentences")
print(response2)

  conversation = ConversationChain(llm=llm, memory=memory)
  response1=conversation.run(cluster_1)


This text appears to be a list of plot summaries or descriptions from various films, likely from the 1930s. The entries are brief and concise, and each describes the main events or conflicts of a particular story. They seem to be drawn from a variety of genres, including horror, comedy, mystery, and adventure.

Some notable examples include:

* "The Ghost Train" - a story about a woman who is convinced that she has seen a ghost train, but may actually be suffering from delusions.
* "The Corpse Came Back for Annabel" - a tale about a young woman named Jackie who becomes embroiled in a mystery involving a corpse and a romantic interest.
* "The Stooges" - a story about two men, Mac and Sand, who become embroiled in a series of misadventures with their wives and step-daughters.
* "The Adventures of Casper" - a tale about a dog named Casper who sits beside his grave reading a book on animal friends.

Overall, the text seems to be a collection of short descriptions or summaries from various 

In [36]:
response3=conversation.run("create a fictitious character that combines the characteristics from those characters")
print(response3)

Here's a fictitious character that combines elements of the recurring characteristics identified:

**Name:** Reginald "Reg" Windsor

**Characteristics:**

* **Unreliable narrator**: Reg has a tendency to misremember or distort events, often due to his own insecurities and emotional baggage.
* **Misunderstandings and mistaken identities**: Reg frequently misunderstands social cues and interactions, leading to awkward situations and comedic misunderstandings.
* **Mysterious past**: Reg has an unexplained past that he's reluctant to discuss, which leads to suspicions and curiosity from those around him.
* **Romantic interests and relationships**: Reg is a hopeless romantic, but his clumsiness and tendency to misinterpret social cues often lead to awkward dating situations and misunderstandings with women.

**Personality:**

Reg is a well-meaning, yet awkward, individual who often finds himself in humorous situations. He's prone to daydreaming and gets lost in his own thoughts, which can l

In [47]:
from tqdm import tqdm

for decade in [1900,1910,1920,1930,1950,1960,1970,1990,2000]:
  print(decade)
  # Load dataset
  movies_sentences = read_and_preprocess(f'src/data/movies_summaries/summaries_decade_{decade}.txt')

  # Encode sentences into embeddings
  movies_embeddings = model.encode(movies_sentences, batch_size=32, convert_to_tensor=True)

  # Calculating the optimal number of clusters
  best_clust_nums=np.zeros(20)
  num_clusters = np.arange(1,20,1)+1
  sil_scores= np.zeros(len(num_clusters))
  for i in tqdm(np.arange(len(num_clusters))):
    cluster_model = MiniBatchKMeans(n_clusters=num_clusters[i])
    cluster_labels = cluster_model.fit_predict(movies_embeddings.cpu().numpy())
    sil_score = silhouette_score(movies_embeddings.cpu().numpy(), cluster_labels)
    sil_scores[i]= sil_score
  best_clust_num=num_clusters[np.argmax(sil_scores)] # number of clusters with the best silhouette score
  print(best_clust_num)

  # separating the movie summaries into clusters
  cluster_model = MiniBatchKMeans(n_clusters=best_clust_num)
  movies_clusters = cluster_model.fit_predict(movies_embeddings.cpu().numpy())
  clustered_movies = {i: [] for i in range(max(movies_clusters) + 1)}
  for idx, cluster_id in enumerate(movies_clusters):
      clustered_movies[cluster_id].append(movies_sentences[idx])

  for idx in tqdm(np.arange(best_clust_num)):
      sentences=''
      for sentence in clustered_movies[idx]: sentences+=sentence

      memory = ConversationBufferMemory()
      memory.clear()

      conversation = ConversationChain(llm=llm, memory=memory)

      # Create a conversational chain with the model and memory

      response1=conversation.run(sentences)

      response2=conversation.run("extract the main recurring characteristics of the characters from those sentences")

      response3=conversation.run("create a fictitious character that combines the characteristics from those characters. Indicate their background, age, physical appearance, occupation, personality and daily life.")
      
      with open (f'data/character_descriptions/{decade}s/cluster_{idx}.txt',"w") as file:
          file.write(response3)
    

1900


100%|██████████| 19/19 [00:01<00:00, 12.72it/s]


2


100%|██████████| 2/2 [02:18<00:00, 69.18s/it]


1910


100%|██████████| 19/19 [00:07<00:00,  2.48it/s]


2


100%|██████████| 2/2 [03:08<00:00, 94.42s/it] 


1920


100%|██████████| 19/19 [00:10<00:00,  1.88it/s]


2


100%|██████████| 2/2 [02:35<00:00, 77.84s/it]


1930


100%|██████████| 19/19 [00:54<00:00,  2.85s/it]


2


100%|██████████| 2/2 [02:17<00:00, 68.68s/it]


1950


100%|██████████| 19/19 [02:08<00:00,  6.75s/it]


2


100%|██████████| 2/2 [02:46<00:00, 83.48s/it]


1960


100%|██████████| 19/19 [01:39<00:00,  5.25s/it]


2


100%|██████████| 2/2 [03:14<00:00, 97.02s/it] 


1970


100%|██████████| 19/19 [01:53<00:00,  5.97s/it]


2


100%|██████████| 2/2 [03:03<00:00, 91.82s/it]


1990


100%|██████████| 19/19 [06:11<00:00, 19.57s/it]


2


100%|██████████| 2/2 [02:46<00:00, 83.32s/it] 


2000


100%|██████████| 19/19 [03:20<00:00, 10.58s/it]


2


100%|██████████| 2/2 [02:50<00:00, 85.42s/it]
