In [20]:
import nltk
from pprint import pprint
from operator import itemgetter
from nltk.corpus import framenet as fn
from nltk.corpus.reader.framenet import PrettyList

#### POS
Fine-grained but not meaningful.

In [2]:
# Example sentence used for the POS-tagging demo
sentence = "The cat sat on the mat."

# Break the sentence into word tokens, then attach a
# part-of-speech (POS) tag to every token
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)

# Show the resulting (token, tag) pairs
pos_tags

[('The', 'DT'),
 ('cat', 'NN'),
 ('sat', 'VBD'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('mat', 'NN'),
 ('.', '.')]

#### Frames
Yes:)

In [3]:
# Frames whose name matches "crim" (case-insensitive), ordered by frame ID.
# The list is sorted in place so it keeps the type returned by fn.frames().
x = fn.frames(r'(?i)crim')
x.sort(key=lambda frame: frame['ID'])
x

[<frame ID=200 name=Criminal_process>, <frame ID=500 name=Criminal_investigation>, ...]

In [4]:
# Same lookup without a name pattern; the displayed list is abbreviated with "..."
fn.frames()

[<frame ID=2031 name=Abandonment>, <frame ID=262 name=Abounding_with>, ...]

# Frame Semantic Transformer (2023)

In [1]:
from frame_semantic_transformer import FrameSemanticTransformer

In [2]:
# Create the frame detector with its default constructor arguments
frame_transformer = FrameSemanticTransformer()

In [15]:
def print_frame_result(results):
    """
    Print a plain-text report of a frame detection result.

    The report starts with the analysed sentence, followed by one
    ``FRAME: <name>`` line per detected frame; each frame line is followed
    by one ``<element name>: <element text>`` line per frame element.

    Parameters
    ----------
    results : object with a ``sentence`` attribute and a ``frames`` iterable;
        every frame exposes ``name`` and ``frame_elements``, and every frame
        element exposes ``name`` and ``text``.

    Returns
    -------
    None
    """
    report = [f"Results found in: {results.sentence}"]

    for detected in results.frames:
        report.append(f"FRAME: {detected.name}")
        report.extend(f"{fe.name}: {fe.text}" for fe in detected.frame_elements)

    # One print call for the whole report: the text written to stdout is
    # the same as printing every line separately.
    print("\n".join(report))

In [29]:
# Run frame detection on a single example sentence
sentence = "The hallway smelt of boiled cabbage and old rag mats."
results = frame_transformer.detect_frames(sentence)

In [30]:
# Show every detected frame together with its frame elements
print_frame_result(results)

Results found in: The hallway smelt of boiled cabbage and old rag mats.
FRAME: Connecting_architecture
Part: hallway
FRAME: Give_impression
Phenomenon: The hallway
Characterization: of boiled cabbage and old rag mats
FRAME: Apply_heat
Food: cabbage
FRAME: Food
Food: cabbage
FRAME: Age
Entity: rag mats


In [31]:
# Print only the name of each detected frame, one per line
for frame in results.frames:
    print(frame.name)

Connecting_architecture
Give_impression
Apply_heat
Food
Age


In [17]:
# Apply frame detection to a sentence relevant to my corpus and print the report
sentence = "Each of us was subjected to a full body search and stripped of all valuables, two layers of underwear, and all our luggage and papers."
results = frame_transformer.detect_frames(sentence)
print_frame_result(results)

Results found in: Each of us was subjected to a full body search and stripped of all valuables, two layers of underwear, and all our luggage and papers.
FRAME: Body_parts
Body_part: body
FRAME: Measure_duration
Count: two
Unit: layers
Process: of underwear
FRAME: Documents
Bearer: our
Document: papers


In [18]:
# Second corpus-style sentence (presumably from the abstracts corpus - TODO confirm)
sentence = "This early approach gave those who had not been on the scene an inside look into what actually happened during the Holocaust."
results = frame_transformer.detect_frames(sentence)
print_frame_result(results)

Results found in: This early approach gave those who had not been on the scene an inside look into what actually happened during the Holocaust.
FRAME: Relative_time
Focal_occasion: approach
FRAME: Means
Descriptor: early
FRAME: Giving
Donor: This early approach
Recipient: those who had not been on the scene
Theme: an inside look into what actually happened during the Holocaust
FRAME: Perception_active
Depictive: inside
Phenomenon: into what actually happened during the Holocaust
FRAME: Event
Event: what
Time: during the Holocaust


These are interesting Frames, perhaps more distinctive to this Abstracts corpus:
* Relative_time
* Giving
* Perception_active (! - expected but still cool)
* Event

In [21]:
# Look up the Perception_active frame in FrameNet to find its ID.
# The list is sorted in place by frame ID before being displayed.
x = fn.frames(r'Perception_active')
x.sort(key=lambda frame: frame['ID'])
x

[<frame ID=66 name=Perception_active>]

In [24]:
# Fetch frame 66 (Perception_active, per the lookup above) and list its frame elements.
# Iterating over .FE prints one frame-element name per line.
frame_look = fn.frame(66)
for fe in frame_look.FE:
    print(fe)

Perceiver_agentive
Phenomenon
Body_part
Location_of_protagonist
Direction
Depictive
State
Manner
Means
Time
Purpose
Place
Duration
Expected_entity
Ground
Obscuring_medium


In [25]:
# Try a short, title-like fragment instead of a full sentence
sentence = "Diagnosis and Phase-Oriented Treatment of Post-Traumatic Stress Disorder."
results = frame_transformer.detect_frames(sentence)
print_frame_result(results)

Results found in: Diagnosis and Phase-Oriented Treatment of Post-Traumatic Stress Disorder.
FRAME: Cure
Affliction: of Post-Traumatic Stress Disorder
FRAME: Cure
Affliction: of Post-Traumatic Stress Disorder


### Apply FST (Frame Semantic Transformer) to the Corpora

#### <font color="salmon">Process:</font>
1. Tokenize ALL DOCUMENTS to sentence level
2. Test batch transformation
3. Detect frames
    * append all frame.names to a list
    * append list to df cell
4. Pick out most salient themes...if necessary (don't remove dupes)
5. Visualize:
    * By grouping - scatterplot
    * Frequency (compare the 2 corpora)

#### Resulting DF Desired (columns):
* ID
* text
* embedding
* list of Frames
* list of salient frames

#### 0. Prepare data

In [56]:
import pandas as pd

In [34]:
import os
from pathlib import Path

# Directory holding the two corpus CSVs.
# The original absolute path is kept as the default, so the notebook behaves
# exactly as before on the author's machine; on any other machine, set the
# TEXT_MEMORY_DATA_DIR environment variable instead of editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "TEXT_MEMORY_DATA_DIR",
        "/Users/ez/desktop/text_memory_and_identity/data",
    )
)

# Load both corpora; the first CSV column is used as the row index
abstracts = pd.read_csv(DATA_DIR / "abstracts.csv", index_col=0)
eyewitness = pd.read_csv(DATA_DIR / "eyewitness.csv", index_col=0)

In [38]:
# Keep only the identifier, text and embedding columns of the eyewitness corpus
eyewitness = eyewitness.loc[:, ['Index Number', 'Text', 'embedding']]

In [39]:
# Preview the first three eyewitness rows
eyewitness.head(3)

Unnamed: 0,Index Number,Text,embedding
0,P.I.a. No. 62,"The Jews in Pomerania I, Dr. Ernst Alban, here...","[-0.01598668470978737, 0.03931468725204468, 0...."
1,P.I.a. No. 115,Reminiscences Regarding the Youth of Dr. Josep...,"[0.0071619655936956406, 0.022604744881391525, ..."
2,P.I.a. No. 317,Extracts from a letter from Ernest Weil to Rob...,"[-0.04701196402311325, 0.025713954120874405, 0..."


In [40]:
# Preview the first two abstracts rows
abstracts.head(2)

Unnamed: 0,paper_ID,abstract,embedding
0,329f5441ffcbdc970ea5868ad27aae13c212ea08,"In April 1983, the first American Gathering of...","[0.025907130911946297, 0.012855629436671734, 0..."
1,4cf5d504f0ccca7da2a65900d2c48f5a1b99f620,Deviating from foundational assumptions regard...,"[0.05267952382564545, 0.08909037709236145, -0...."


In [42]:
# Give both corpora the same column names ('id', 'text') so they can be stacked
abstracts = abstracts.rename(columns={'paper_ID': 'id', 'abstract': 'text'})
eyewitness = eyewitness.rename(columns={'Index Number': 'id', 'Text': 'text'})

In [51]:
# Stack the two corpora (abstracts first) under a fresh 0..n-1 row index
merged_df = pd.concat([abstracts, eyewitness], ignore_index=True)

#### 1. Tokenize ALL DOCUMENTS to sentence level

* Iterate through TEXT col, create sentence tokens
* Test out bulk frame detection & determine batch size

**Tokenize**

In [89]:
import nltk
import random

In [55]:
# Preview the first merged row
merged_df.head(1)

Unnamed: 0,id,text,embedding
0,329f5441ffcbdc970ea5868ad27aae13c212ea08,"In April 1983, the first American Gathering of...","[0.025907130911946297, 0.012855629436671734, 0..."


In [58]:
# Split each document's text into a list of sentences (one list per row)
merged_df['sents_token'] = merged_df['text'].apply(nltk.sent_tokenize)

**Frame Detection!! - Preprocessing**

In [80]:
# Instantiate the frame transformer with batch_size=3
# (matches the 3-sentences-per-document sampling used for bulk detection)
frame_transformer = FrameSemanticTransformer(batch_size=3)

In [None]:
# for cell in sample['sents_token']:
#     for sent in cell:
#         result = frame_transformer.detect_frames(sent)
#         for frame in result.frames:
#             print(frame.name)

Next steps:
* Take random subset of 350 docs for each corpus type
* Take 3 random sentences from each doc (or fewer, if the doc has fewer than 3 sentences)
* Apply the frame transformer. Use the BULK method and set batch_size = 3

In [92]:
# Draw a seeded random subset of 350 documents from each corpus
abstracts_subset = abstracts.sample(n=350, random_state=42)
eyewitness_subset = eyewitness.sample(n=350, random_state=42)

# Stack the two subsets (abstracts first) under a fresh 0..n-1 row index
merged_df = pd.concat([abstracts_subset, eyewitness_subset], ignore_index=True)

# Split each document's text into a list of sentences (one list per row)
merged_df['sents_token'] = merged_df['text'].apply(nltk.sent_tokenize)

In [94]:
# Preview the first two rows of the subset, including the sentence lists
merged_df.head(2)

Unnamed: 0,id,text,embedding,sents_token
0,03299a0bcddf7cec7bb8587c87af598a30cf6f24,Marceline Loridan-Ivens may be best known for ...,"[0.0129690608009696, 0.04015115275979042, -0.0...",[Marceline Loridan-Ivens may be best known for...
1,adb692013a60e1a7eb79ca4759e92fd5245a4f2d,"How do extraordinary experiences, especially d...","[-0.01149928942322731, 0.04745658487081528, -0...","[How do extraordinary experiences, especially ..."


In [101]:
# Take random sample of sentence tokens from the DF

# Default number of sentences drawn per document
sample_size = 3

# Seed for the sentence sampler. A dedicated, seeded generator makes a
# "Restart & Run All" draw the same sentences every time, which the
# unseeded module-level random.sample() did not.
sample_seed = 42
_sample_rng = random.Random(sample_seed)


def sample_sentences(sentence_list, k=None, rng=None):
    """
    Return a random sample of sentences, drawn without replacement.

    Parameters
    ----------
    sentence_list : list
        Sentences of one document. The list itself is not modified.
    k : int, optional
        Maximum number of sentences to draw. Defaults to the module-level
        ``sample_size``. Negative values are treated as 0.
    rng : random.Random, optional
        Random number generator to draw with. Defaults to the seeded
        module-level generator created next to this function.

    Returns
    -------
    list
        A new list with ``min(len(sentence_list), k)`` sentences, i.e. the
        whole document (in random order) when it has fewer than ``k``
        sentences, and an empty list for an empty document.
    """
    if k is None:
        k = sample_size
    if rng is None:
        rng = _sample_rng
    # Never ask for more sentences than exist, and never for a negative number
    n_draw = max(0, min(len(sentence_list), k))
    return rng.sample(sentence_list, n_draw)

In [102]:
# Draw the per-document sentence sample for every row of 'sents_token'
merged_df['sents_sample'] = merged_df['sents_token'].apply(sample_sentences)

**Detect Frames**

In [None]:
# Dry run on a small SAMPLE before processing every document
# This should take about 1m30s, according to our 3 sentence rule.

# Work on a copy of the first 2 rows so merged_df itself is left untouched
sample_df = merged_df[:2].copy()

# Start with an empty 'frames' column that is filled in row by row
sample_df['frames'] = None

# For every row, run bulk frame detection on its sampled sentences and store
# the flattened list of detected frame names in that row's 'frames' cell
for row_label, sentences in sample_df['sents_sample'].items():
    detections = frame_transformer.detect_frames_bulk(sentences)
    sample_df.at[row_label, 'frames'] = [
        frame.name for detection in detections for frame in detection.frames
    ]

# Print the modified sample_df
# print(sample_df)

In [130]:
# Inspect the dry-run result, including the new 'frames' column
sample_df.head(2)

Unnamed: 0,id,text,embedding,sents_token,sents_sample,frames
0,03299a0bcddf7cec7bb8587c87af598a30cf6f24,Marceline Loridan-Ivens may be best known for ...,"[0.0129690608009696, 0.04015115275979042, -0.0...",[Marceline Loridan-Ivens may be best known for...,"[She was also a Holocaust survivor, who return...","[Perception_experience, Text_creation, Means, ..."
1,adb692013a60e1a7eb79ca4759e92fd5245a4f2d,"How do extraordinary experiences, especially d...","[-0.01149928942322731, 0.04745658487081528, -0...","[How do extraordinary experiences, especially ...",[He then conducts semi-structured interviews w...,"[Discussion, Participation, Purpose, Scrutiny,..."


In [129]:
# Check the source frame: the dry run worked on a copy, so there is no 'frames' column here yet
merged_df.head(2)

Unnamed: 0,id,text,embedding,sents_token,sents_sample
0,03299a0bcddf7cec7bb8587c87af598a30cf6f24,Marceline Loridan-Ivens may be best known for ...,"[0.0129690608009696, 0.04015115275979042, -0.0...",[Marceline Loridan-Ivens may be best known for...,"[She was also a Holocaust survivor, who return..."
1,adb692013a60e1a7eb79ca4759e92fd5245a4f2d,"How do extraordinary experiences, especially d...","[-0.01149928942322731, 0.04745658487081528, -0...","[How do extraordinary experiences, especially ...",[He then conducts semi-structured interviews w...


In [131]:
# Run frame detection for every document in merged_df

# Start with an empty 'frames' column that is filled in row by row
merged_df['frames'] = None

# For every row, run bulk frame detection on its sampled sentences and store
# the flattened list of detected frame names in that row's 'frames' cell
for row_label, sentences in merged_df['sents_sample'].items():
    detections = frame_transformer.detect_frames_bulk(sentences)
    merged_df.at[row_label, 'frames'] = [
        frame.name for detection in detections for frame in detection.frames
    ]

# Persist the result as CSV in the current working directory
merged_df.to_csv('eyewitness_abstracts_frame.csv')

In [132]:
# Report (rows, columns), then preview the first two rows with their frames
print(merged_df.shape)
merged_df.head(2)

(700, 6)


Unnamed: 0,id,text,embedding,sents_token,sents_sample,frames
0,03299a0bcddf7cec7bb8587c87af598a30cf6f24,Marceline Loridan-Ivens may be best known for ...,"[0.0129690608009696, 0.04015115275979042, -0.0...",[Marceline Loridan-Ivens may be best known for...,"[She was also a Holocaust survivor, who return...","[Perception_experience, Text_creation, Means, ..."
1,adb692013a60e1a7eb79ca4759e92fd5245a4f2d,"How do extraordinary experiences, especially d...","[-0.01149928942322731, 0.04745658487081528, -0...","[How do extraordinary experiences, especially ...",[He then conducts semi-structured interviews w...,"[Discussion, Participation, Purpose, Scrutiny,..."


In [146]:
# Label every row with the corpus it came from.
# Rows are addressed by position: merged_df was built by concatenating the
# 350 sampled abstracts first and the 350 sampled eyewitness reports second.
merged_df['source'] = ""

# Assign through a single .iloc call on the frame itself. The previous chained
# form (merged_df['source'][...] = value) writes to an intermediate Series,
# which raises SettingWithCopyWarning and leaves merged_df unchanged when
# pandas Copy-on-Write is active.
source_pos = merged_df.columns.get_loc('source')
merged_df.iloc[:350, source_pos] = "abstracts"
merged_df.iloc[350:, source_pos] = "eyewitness"

In [147]:
# Reorder columns
# Remove 'source' from the frame, keeping hold of the removed column
col_to_move = merged_df.pop('source')

# Re-insert it at position 1 (the second column)
merged_df.insert(1, 'source', col_to_move)

In [148]:
# Inspect rows 349-351, which straddle the position-350 boundary between the two source labels
merged_df.iloc[349:352]

Unnamed: 0,id,source,text,embedding,sents_token,sents_sample,frames
349,310f6483ff18293a47e11024e3f9f5be0d2fa66c,abstracts,This paper reviews the literature on the long-...,"[0.02258148603141308, 0.06869726628065109, 0.0...",[This paper reviews the literature on the long...,"[Second, the aging process not only presents i...","[Process, Change_position_on_a_scale, Difficul..."
350,P.III.i. (Southwest Africa) No. 553,eyewitness,"Windhook, 1939 - 1942 When Mussolini’s switch ...","[-0.02214563637971878, 0.04101519659161568, -0...","[Windhook, 1939 - 1942 When Mussolini’s switch...","[There were (good) German amateur concerts, wh...","[Existence, Existence, People_by_origin, Socia..."
351,P.II.d. No. 8,eyewitness,Kiel. Pogrom: During the night of 10th Novembe...,"[-0.019050007686018944, 0.03850502893328667, 0...","[Kiel., Pogrom: During the night of 10th Novem...",[Mr Hans Lask gives the following interview fo...,"[Possession, Fields, Intentionally_create, Kin..."


In [149]:
# Save
# merged_df.to_csv('eyewitness_abstracts_frame_full.csv')