In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA


In [6]:
data = pd.read_csv('/kaggle/input/dataset0/data.csv')
data_specificity = pd.read_json('/kaggle/input/data-specificity/data-specificity.json')

 Criteria for defining a personal event memory by Pillemer ( 1998 ) :  
*  (a) present a specific event that took place at a particular time and place, rather than a summary event or extended series of events.
* (b) contain a detailed account of the rememberer's own personal circumstances at the time of the event. 
* (c) evoke sensory images or bodily sensations that contribute to the feeling of "re-experiencing" or "reliving" the event.
* (d) link its details and images to a particular moment or moments of phenomenal experience. 
* (e) be believed to be a truthful representation of what actually transpired.

## 1. Specificity

In [7]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import spacy

# Tokenization
def tokenize_text(text):
    return sent_tokenize(text)

# Part-of-Speech (POS) Tagging
def pos_tagging(text):
    tokens = word_tokenize(text)
    return pos_tag(tokens)


#  Named Entity Recognition (SpaCy)
def named_entity_recognition_spacy(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    named_entities = [ent.text for ent in doc.ents]
    return named_entities

# Event Extraction
def extract_events(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    events = []
    current_event = []
    for tag in tagged:
        if tag[1].startswith('VB'):
            current_event.append(tag[0])
        elif current_event:
            events.append(' '.join(current_event))
            current_event = []
    if current_event:
        events.append(' '.join(current_event))
    return events



caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [8]:
print(data.iloc[0]['Story'])

the loss of my father will forever leave an indelible mark on my heart it also provided me with an unwavering strength. It shaped me into a more resilient and compassionate person capable of facing adversity with newfound determination. I carry my father's memory with me drawing inspiration from his life and the lessons he imparted. Through this turning point I have learned that strength can emerge from even the darkest moments and I am committed to living a life that honors his legacy.


In [9]:
def specificity_spacy(story):
    sentences = tokenize_text(story)
    tagged_sentences = [pos_tagging(sentence) for sentence in sentences]
    named_entities = [named_entity_recognition_spacy(sentence) for sentence in sentences]
    print( "named entities", named_entities)
    events = [extract_events(sentence) for sentence in sentences]
    print("events", events)
    total_named_entities = sum(len(ne_list) for ne_list in named_entities)
    total_events = sum(len(event_list) for event_list in events)
    total_sentences = len(sentences)
    
    threshold = total_sentences // 5
    
    # Check if total_named_entities and total_events are greater than the threshold
    is_specific_story = ( total_named_entities > threshold ) & ( total_events > threshold )
    
    return is_specific_story

Testing the specifity_ne_spacy function :

In [10]:


# Example usage
story_text = "Once upon a time, in a faraway land, there lived a brave knight named Sir Lancelot..."
is_specific_story = specificity_spacy(story_text)
print("Is the story specific?", is_specific_story)


named entities [['Lancelot']]
events [['lived', 'named']]
Is the story specific? True


In [11]:


# Example usage
story_text = 'The annual science fair showcased impressive projects from young innovators. Students presented their research findings on various topics. A robot that can solve complex puzzles was a highlight of the event. Researchers discussed cutting-edge advancements in artificial intelligence. The fair concluded with an awards ceremony honoring the top projects.'
is_specific_story = specificity_spacy(story_text)
print("Is the story specific?", is_specific_story)


named entities [[], [], [], [], []]
events [['showcased'], ['presented'], ['solve', 'was'], ['discussed'], ['concluded', 'honoring']]
Is the story specific? False


In [12]:


# Example usage
story_text = 'The annual science called Gitex fair showcased impressive projects from young innovators. Students presented their research findings on various topics. A robot that can solve complex puzzles was a highlight of the event. Researchers discussed cutting-edge advancements in artificial intelligence. The fair concluded with an awards ceremony honoring the top projects.'
is_specific_story = specificity_spacy(story_text)
print("Is the story specific?", is_specific_story)


named entities [['Gitex'], [], [], [], []]
events [['called', 'showcased'], ['presented'], ['solve', 'was'], ['discussed'], ['concluded', 'honoring']]
Is the story specific? False


In [16]:
def evaluate_specificity_dataframe(data_frame):
    correct_predictions = 0
    total_stories = len(data_frame)

    for index, row in data_frame.iterrows():
        story = row['story']
        ground_truth_label = row['is_specific']
        
        predicted_label = specificity_spacy(story)
        
        # Convert boolean to string
        predicted_label_str = "yes" if predicted_label else "no"
        
        if predicted_label_str == ground_truth_label:
            correct_predictions += 1

    accuracy = correct_predictions / total_stories

    return accuracy

accuracy = evaluate_specificity_dataframe(data_specificity)
print(f"Accuracy: {accuracy:.2f}")


named entities [['Gitex'], [], [], [], []]
events [['called', 'showcased'], ['presented'], ['solve', 'was'], ['discussed'], ['concluded', 'honoring']]
named entities [[], [], [], [], []]
events [['set'], ['turned'], ['flew'], ['descended'], ['was captivating']]
named entities [[], [], [], [], []]
events [['walked'], ['honked', 'shouted'], ['took'], ['gave'], ['cheered']]
named entities [[], [], [], []]
events [['embarked'], ['sought', 'save'], ['encountered', 'forged'], ['were']]
named entities [['midnight'], ['Ali'], [], [], []]
events [['struck'], ['crossed'], ['walked'], ['whispered'], ['emerged']]
Accuracy: 0.60


In [17]:
def evaluate_precision(data_frame):
    true_positives = 0
    false_positives = 0

    for index, row in data_frame.iterrows():
        story = row['story']
        ground_truth_label = row['is_specific']

        predicted_label = specificity_spacy(story)
        
        if predicted_label and ground_truth_label == "yes":
            true_positives += 1
        elif predicted_label and ground_truth_label == "no":
            false_positives += 1

    precision = true_positives / (true_positives + false_positives)
    return precision

precision = evaluate_precision(data_specificity)
print(f"Precision: {precision:.2f}")


named entities [['Gitex'], [], [], [], []]
events [['called', 'showcased'], ['presented'], ['solve', 'was'], ['discussed'], ['concluded', 'honoring']]
named entities [[], [], [], [], []]
events [['set'], ['turned'], ['flew'], ['descended'], ['was captivating']]
named entities [[], [], [], [], []]
events [['walked'], ['honked', 'shouted'], ['took'], ['gave'], ['cheered']]
named entities [[], [], [], []]
events [['embarked'], ['sought', 'save'], ['encountered', 'forged'], ['were']]
named entities [['midnight'], ['Ali'], [], [], []]
events [['struck'], ['crossed'], ['walked'], ['whispered'], ['emerged']]
Precision: 1.00


In [18]:
def evaluate_recall(data_frame):
    true_positives = 0
    false_negatives = 0

    for index, row in data_frame.iterrows():
        story = row['story']
        ground_truth_label = row['is_specific']

        predicted_label = specificity_spacy(story)
        
        if predicted_label and ground_truth_label == "yes":
            true_positives += 1
        elif not predicted_label and ground_truth_label == "yes":
            false_negatives += 1

    recall = true_positives / (true_positives + false_negatives)
    return recall

recall = evaluate_recall(data_specificity)
print(f"Recall: {recall:.2f}")


named entities [['Gitex'], [], [], [], []]
events [['called', 'showcased'], ['presented'], ['solve', 'was'], ['discussed'], ['concluded', 'honoring']]
named entities [[], [], [], [], []]
events [['set'], ['turned'], ['flew'], ['descended'], ['was captivating']]
named entities [[], [], [], [], []]
events [['walked'], ['honked', 'shouted'], ['took'], ['gave'], ['cheered']]
named entities [[], [], [], []]
events [['embarked'], ['sought', 'save'], ['encountered', 'forged'], ['were']]
named entities [['midnight'], ['Ali'], [], [], []]
events [['struck'], ['crossed'], ['walked'], ['whispered'], ['emerged']]
Recall: 0.33


Here's what recall reflects:

Completeness: Recall indicates how well your model is capturing all instances of the positive class (in your case, the specific stories). A higher recall means that your model is successfully identifying most of the specific stories present in the dataset.

Missed Positive Cases: A low recall value suggests that your model is missing a significant portion of the positive cases. This could mean that the model is not sensitive enough to detect the specific stories, leading to false negatives.

Trade-off with Precision: Recall is often in conflict with precision. A high recall could lead to more false positives (cases incorrectly classified as positive), as the model may be more inclusive in classifying instances as positive. Balancing recall and precision is important depending on your use case.

# 2.Personal Context 

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim import corpora, models

def evaluate_personal_context(text):
    # Tokenize text into sentences
    sentences = nltk.sent_tokenize(text)

    # Sentiment Analysis
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = [sia.polarity_scores(sentence)["compound"] for sentence in sentences]

    # Topic Modeling
    tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]
    dictionary = corpora.Dictionary(tokenized_sentences)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_sentences]
    lda_model = models.LdaModel(corpus, num_topics=3, id2word=dictionary)

    # Extract most significant topics
    topics = [lda_model.get_document_topics(doc) for doc in corpus]
    most_significant_topics = [max(topic, key=lambda x: x[1]) for topic in topics]

    # Get actual topics
    actual_topics = [lda_model.print_topic(topic[0]) for topic in most_significant_topics]

    # Return sentiment scores and actual topics
    return sentiment_scores, actual_topics


In the context of topic modeling with LDA, the weights assigned to each word in a topic represent the importance or prevalence of that word within the topic. In the output you provided

Understanding the significance of individual words and their weights within a topic can help provide insights into the key themes and subjects present in the text.

In [None]:
def test_personal_context(text):
    sentiment_scores, actual_topics = evaluate_personal_context(text)
    print("Sentiment Scores:", sentiment_scores)
    print()
    print("Actual Topics:")
    for topic in actual_topics:
        print(topic)

In [None]:
test_personal_context(data.iloc[0]['Story'])

In [None]:
test_personal_context("i lost my best friend and im so happy")

the word "my" has the highest weight of 0.102 for the extracted topic.

A high weight for the word "my" suggests that it is a significant term within the topic identified by the LDA model. This means that the word "my" occurs frequently and carries substantial importance within the text when discussing the particular topic associated with that topic index.

In this case, it indicates that personal ownership or possession, likely related to the topic of loss and enduring emotional impact, plays a prominent role in the text. The word "my" may be indicating a personal connection or the speaker's individual perspective in relation to the topic being discussed.

## 3.Sensory details 

In [None]:
import nltk

def analyze_sensory_details(text):
    sentences = nltk.sent_tokenize(text)

    sensory_keywords = {
        'sight': ['see', 'look', 'watch'],
        'sound': ['hear', 'listen', 'sound'],
        'smell': ['smell', 'scent', 'aroma'],
        'taste': ['taste', 'flavor'],
        'touch': ['feel', 'touch', 'texture']
    }

    sensory_details = []
    for sentence in sentences:
        lower_sentence = sentence.lower()
        for sense, keywords in sensory_keywords.items():
            for keyword in keywords:
                if keyword in lower_sentence:
                    sensory_details.append({'sense': sense, 'sentence': sentence})
                    break

    return sensory_details


In [None]:
def test_sensory_details(text):
    sensory_details = analyze_sensory_details(text)
    print("Sensory Details:")
    for detail in sensory_details:
        print(f"{detail['sense']}: {detail['sentence']}")

In [None]:
test_sensory_details(data.iloc[2]['Story'])

## 4.Phenominal experience 

using emotional analysis 

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

def analyze_emotional_tone(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores

speech = "The loss of my father will forever leave an indelible mark on my heart. But im so happy that he died"

emotion_scores = analyze_emotional_tone(speech)
print("Emotion Scores:", emotion_scores)


## Truthfulness

## Metrics 

- Accuracy: This metric measures the ratio of correctly predicted instances to the total instances in the dataset. It's a common metric for overall performance.
- Precision: Precision calculates the ratio of true positive predictions to the total predicted positive instances. It measures how accurate the positive predictions are.
- Recall: Also known as sensitivity or true positive rate, recall measures the ratio of true positive predictions to the total actual positive instances. It gauges how well the model captures all the positive instances.

### Specificity

In [None]:
def is_specific(story):
    

In [None]:
def specificity_ne1(story, ground_truth_named_entities, ground_truth_events):
    sentences = tokenize_text(story)
    
    tagged_sentences = [pos_tagging(sentence) for sentence in sentences]
    named_entities = [named_entity_recognition(sentence) for sentence in sentences]
    events = [extract_events(sentence) for sentence in sentences]
    
    specific_named_entities = []
    specific_events = []

    for ne_list, event_list in zip(named_entities, events):
        for named_entity in ne_list:
            if is_specific(named_entity):
                specific_named_entities.append(named_entity)

        for event in event_list:
            if is_specific(event):
                specific_events.append(event)

    # Calculate metrics
    true_positive_named_entities = len(set(specific_named_entities) & set(ground_truth_named_entities))
    false_positive_named_entities = len(set(specific_named_entities) - set(ground_truth_named_entities))
    false_negative_named_entities = len(set(ground_truth_named_entities) - set(specific_named_entities))
    
    true_positive_events = len(set(specific_events) & set(ground_truth_events))
    false_positive_events = len(set(specific_events) - set(ground_truth_events))
    false_negative_events = len(set(ground_truth_events) - set(specific_events))
    
    accuracy = (true_positive_named_entities + true_positive_events) / (len(ground_truth_named_entities) + len(ground_truth_events))
    
    precision_named_entities = true_positive_named_entities / (true_positive_named_entities + false_positive_named_entities)
    recall_named_entities = true_positive_named_entities / (true_positive_named_entities + false_negative_named_entities)
    
    precision_events = true_positive_events / (true_positive_events + false_positive_events)
    recall_events = true_positive_events / (true_positive_events + false_negative_events)

    return {
        'accuracy': accuracy,
        'precision_named_entities': precision_named_entities,
        'recall_named_entities': recall_named_entities,
        'precision_events': precision_events,
        'recall_events': recall_events
    }

# Example usage
story_text = "Once upon a time, in a faraway land, there lived a brave knight named Sir Lancelot..."
ground_truth_named_entities = ["Sir Lancelot"]
ground_truth_events = ["lived"]
metrics = specificity_ne1(story_text, ground_truth_named_entities, ground_truth_events)
print(metrics)
