In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import spacy
#spacy.cli.download("en_core_web_sm")

In [4]:
data_folder = './Data/'
plot_summaries = pd.read_csv(data_folder+'plot_summaries.txt',sep='\t', header=None, names=['wikipedia_ID', 'plot_summary'] )
df = plot_summaries[:10]

"""
for ind in range(10):
    print()
    print(plot_summaries.iloc[ind]['plot_summary'])
"""

df.head()

Unnamed: 0,wikipedia_ID,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


### From Chatgpt

In [None]:
from tqdm import tqdm
import re
import string
from itertools import combinations
from collections import Counter

In [20]:
# Text preprocessing and TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['plot_summary'])
print("vectorising done") # 10 seconds

# Named Entity Recognition using SpaCy
nlp = spacy.load("en_core_web_sm")
print("nlp model loading done")

def extract_characters(text):
    doc = nlp(text)
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    characters = []
    for name in names:
        characters.append(name.translate(str.maketrans('', '', string.punctuation)))

    ordered_characters = Counter(characters).most_common()

    return ordered_characters

"""
df['characters'] = df['plot_summary'].apply(extract_characters)

for ind in range(10):
    print()
    print(df.iloc[ind]['characters'])

"""

print(extract_characters(df.iloc[2]['plot_summary']))


vectorising done
nlp model loading done
[('Indulekha', 5), ('Menon', 4), ('Manapally Madhavan Nambiar', 2), ('Manapally', 2), ('Anuradha', 2), ('Poovalli Induchoodan', 1), ('Karunakara Menon', 1), ('DYSP Sankaranarayanan', 1), ('Manapally Sudheeran', 1), ('Moopil Nair', 1), ('Nambiar', 1), ('Mooppil Nair', 1), ('Kanaka', 1), ('Manapally Pavithran', 1), ('Jayakrishnan', 1), ('Raman Nair', 1), ('Nandagopal Maarar', 1)]


### From Aveek's Blog
https://home.aveek.io/blog/post/finding-main-characters/

In [1]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
 
from tqdm import tqdm
import re
import string
from itertools import combinations
from collections import Counter
 
 
from flair.models import SequenceTagger
from flair.data import Sentence

[nltk_data] Downloading package stopwords to /home/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/julian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
tagger = SequenceTagger.load('ner')

def extract_characters(summary:str):
    sentences = sent_tokenize(summary)
    x = []
    for line in tqdm(sentences):
        sentence = Sentence(line)
        tagger.predict(sentence)
        for entity in sentence.to_dict(tag_type='ner')['entities']:
            if entity['labels'][0]['value'] == 'PER':
                x.append(entity['text'])
    names = []
    for name in x:
        names.append(name.translate(str.maketrans('', '', string.punctuation)))

    ordered_names = Counter(names).most_common()
 
    return ordered_names


print(extract_characters(df.iloc[2]['plot_summary']))

2023-11-12 12:56:16,800 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


100%|██████████| 26/26 [00:28<00:00,  1.08s/it]

[('Induchoodan', 17), ('Menon', 9), ('Indulekha', 6), ('Manapally Pavithran', 3), ('Pavithran', 3), ('Manapally Madhavan Nambiar', 2), ('Manapally', 2), ('Anuradha', 2), ('Justice Menon', 2), ('Poovalli Induchoodan', 1), ('Justice Maranchery Karunakara Menon', 1), ('DYSP Sankaranarayanan', 1), ('Manapally Sudheeran', 1), ('Saikumar', 1), ('Ramakrishnan', 1), ('Moopil Nair', 1), ('Nambiar', 1), ('Aishwarya', 1), ('Mooppil Nair', 1), ('Kanaka', 1), ('Chandrabhanu', 1), ('Jayakrishnan', 1), ('Raman Nair', 1), ('Nandagopal Maarar', 1)]





Chatgpt optimized version of Aveek's code

In [25]:
tagger = SequenceTagger.load('ner')

def extract_characters(summary: str):

    # Extract and tag senteces from summaries
    sentences = sent_tokenize(summary)
    tagged_sentences = [Sentence(sent) for sent in sentences]
    tagger.predict(tagged_sentences)

    # Extract all characters from the tagged sentences
    entities = [entity for sent in tagged_sentences for entity in sent.to_dict(tag_type='ner')['entities']]
    characters = [entity['text'] for entity in entities if entity['labels'][0]['value'] == 'PER']

    # Remove punctuation and order names
    characters = [name.translate(str.maketrans('', '', string.punctuation)) for name in characters]
    ordered_characters = Counter(characters).most_common()

    return ordered_characters

df['characters'] = df['plot_summary'].apply(extract_characters)

for ind in range(10):
    print()
    print(df.iloc[ind]['characters'])

2023-11-12 13:04:44,471 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>

[('Shlykov', 1), ('Lyosha', 1)]

[('Katniss', 24), ('Peeta', 14), ('Rue', 9), ('Cato', 4), ('Haymitch', 3), ('Crane', 3), ('Clove', 3), ('Snow', 2), ('Thresh', 2), ('Peeta Mellark', 1), ('Haymitch Abernathy', 1), ('Caesar Flickerman', 1), ('Glimmer', 1), ('Seneca Crane', 1), ('Foxface', 1)]

[('Induchoodan', 17), ('Menon', 9), ('Indulekha', 6), ('Manapally Pavithran', 3), ('Pavithran', 3), ('Manapally Madhavan Nambiar', 2), ('Manapally', 2), ('Anuradha', 2), ('Justice Menon', 2), ('Poovalli Induchoodan', 1), ('Justice Maranchery Karunakara Menon', 1), ('DYSP Sankaranarayanan', 1), ('Manapally Sudheeran', 1), ('Saikumar', 1), ('Ramakrishnan', 1), ('Moopil Nair', 1), ('Nambiar', 1), ('Aishwarya', 1), ('Mooppil Nair', 1), ('Kanaka', 1), ('Chandrabhanu', 1), ('Jayakrishnan', 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['characters'] = df['plot_summary'].apply(extract_characters)


In [27]:
len(plot_summaries)

42303