In [1]:
import pandas as pd
import os
import spacy 
nlp = spacy.load("en_core_web_sm")
from tqdm import tqdm
from collections import defaultdict 
#Import des tokenizers - sent_tokenize pour les phrases et word_tokenize pour les mots avec nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics
import numpy as np
from numpy.linalg import norm
import re

import torch
print(torch.cuda.is_available())

#Personalized - tested and no bug functions
from Story_characters import Characters
from Story_conflict import Conflict
from Story_themes import Themes
from Story_chronology import Chronology
from Story_causality import Causality

True


In [2]:
def data_clean(text):
    
    text = text.replace('@', '')
    text = text.replace('#', '')
    text = re.sub(r'http\S+', '', text)
    return text

In [3]:
###########################################################################
df_train = pd.read_csv("data/Train_Post_StoryPara_V1.csv")
df_train['PostText'] = df_train['PostText'].apply(data_clean)
df_train['PostText'].replace('', np.nan, inplace=True)
df_posts_train = df_train.dropna(subset=['PostText'])

###########################################################################
df_valid = pd.read_csv("data/Valid_Post_StoryPara_V1.csv")
df_valid['PostText'] = df_valid['PostText'].apply(data_clean)
df_valid['PostText'].replace('', np.nan, inplace=True)
df_posts_valid = df_valid.dropna(subset=['PostText'])

###########################################################################
df_test = pd.read_csv("data/Test_Post_StoryPara_V1.csv")
df_test['PostText'] = df_test['PostText'].apply(data_clean)
df_test['PostText'].replace('', np.nan, inplace=True)
df_test['PostText'].replace(' ', np.nan, inplace=True)
df_posts_test = df_test.dropna(subset=['PostText'])

In [4]:
id_posts_train = df_posts_train["PostSysID"]
train_list = list(df_posts_train["PostText"])
id_posts_valid = df_posts_valid["PostSysID"]
valid_list = list(df_posts_valid["PostText"])
id_posts_test = df_posts_test["PostSysID"]
test_list = list(df_posts_test["PostText"])

In [5]:
print(len(train_list))
print(len(valid_list))
print(len(test_list))

7998
4998
3919


## Personnages

In [6]:
def compute_characters(text_list, dataframe, name_save = 'Test_Story_withVariable'): 
    
    charac_func = Characters()
    #For pronouns
    first_sing, first_plur, second, third_sing, third_plur = [], [], [], [], []
    #Named_entities
    bin_ent = []
    #most_used_pron : Categorial
    cat_most_used = []
    #Proportion of sentences with 2 characters
    two_charac = []
    # Entity as a subject
    EN_subj = []
    
    for i, text in enumerate(text_list):
        bin_dict = charac_func.pronouns(text)
        first_sing.append(bin_dict["first_sing"])
        first_plur.append(bin_dict["first_plur"])
        second.append(bin_dict["second"])
        third_sing.append(bin_dict["third_sing"])
        third_plur.append(bin_dict["third_plur"])
        
        bin_ent.append(charac_func.named_entities(text))
        cat_most_used.append(charac_func.most_used_pron(text))
        two_charac.append(charac_func.prop_same_sent(text))
        EN_subj.append(charac_func.EN_as_subj(text))
        
    # Appending all list to dataframe and save
    dataframe["first_person_sing"] = first_sing
    dataframe["first_person_plur"] = first_plur
    dataframe["second_person"] = second
    dataframe["third_person_sing"] = third_sing
    dataframe["third_person_plur"] = third_plur
    
    dataframe["Binary_Entity"] = bin_ent
    dataframe["Most_common_pronoun"] = cat_most_used
    dataframe["Ratio_sentences_2_persos"] = two_charac
    dataframe["EN_sent_subj"] = EN_subj
    
    #dataframe.to_csv(f'data/{name_save}.csv', index=False)

## Themes

In [7]:
def compute_themes(train_list, text_list, dataframe, name_save = 'Test_Story_withVariable'):
    
    #See the Data_Processing file for the parameters selection
    theme_func = Themes(train_list, num_k=15, ban_list = ['DET', 'PUNCT', 'AUX'], alpha=0.01, beta=0.01)
    diversity_, homogeneity_, consistence_ = [], [], []
    for text in text_list:
        diversity_.append(theme_func.diversity(text))
        homogeneity_.append(theme_func.homogeneity(text))
        consistence_.append(theme_func.consistence(text))
        
    dataframe["Intrigue_Diversity"] = diversity_
    dataframe["Intrigue_Sentence_Homogeneity"] = homogeneity_
    dataframe["Intrigue_Consistence"] = consistence_
    
    #dataframe.to_csv(f'data/{name_save}.csv', index=False)

## Conflict

In [8]:
def compute_conflict(text_list, dataframe, name_save = 'Test_Story_withVariable'):
    
    conflict_func = Conflict(increase_ratio=2, n_window=1)
    conflictual_envent, conflict_2_persos, conflict_increase, intrigue_change_sent = [], [], [], []
    for i, text in enumerate(text_list):
        intrigue_change_sent.append(conflict_func.change_sent_status(text))
        conflictual_envent.append(conflict_func.conflict_event(text))
        conflict_2_persos.append(conflict_func.sent_characters(text))
        conflict_increase.append(conflict_func.increase_sent(text))
        
    dataframe["Intrigue_Prop_Change_Sent"] = intrigue_change_sent
    dataframe["Conflictual_Event"] = conflictual_envent
    dataframe["Conflict_two_persos"] = conflict_2_persos
    dataframe["Climax_increase"] = conflict_increase
    
    #dataframe.to_csv(f'data/{name_save}.csv', index=False)

## Causality

In [9]:
def compute_causality(text_list, dataframe, name_save = 'Test_Story_withVariable'):
    
    causal_func = Causality()
    causal_coherence_, total_length_causal, causal_intra_sent = [], [], []
    for text in text_list:
        causal_coherence_.append(causal_func.causal_chorence(text))
        total_length_causal.append(causal_func.causal_length(text))
        causal_intra_sent.append(causal_func.causal_subord(text))
        
    dataframe["Causal_Coherence"] = causal_coherence_
    dataframe["Longest_Causal_Sequence"] = total_length_causal
    dataframe["IntraSentence_Causality"] = causal_intra_sent
    
    #dataframe.to_csv(f'data/{name_save}.csv', index=False)

## Chronologie

In [10]:
def compute_chrono(text_list, dataframe, name_save = 'Test_Story_withVariable'):

    chrono_func = Chronology()
    date_presence, porp_time_change, logic_temp_order, logic_tense_order = [], [], [], []
    for text in text_list:
        date_presence.append(chrono_func.bin_date(text))
        prop_time, logic = chrono_func.special_chrono(text)
        porp_time_change.append(prop_time)
        logic_temp_order.append(logic)
        logic_tense_order.append(chrono_func.tense(text))
        
    dataframe["Presence_Dates"] = date_presence
    dataframe["Logic_order_temporality"] = logic_temp_order
    dataframe["Prop_temp_use"] = porp_time_change
    dataframe["Logic_order_conjuguation"] = logic_tense_order
    
    #dataframe.to_csv(f'data/{name_save}.csv', index=False)

# Compute

In [None]:
#Start by computing and saving Characters Variables
compute_characters(valid_list, df_posts_valid, name_save = 'Valid_Story_withVariable')
compute_characters(test_list, df_posts_test, name_save = 'Test_Story_withVariable')

In [11]:
#Compute theme and Intrigue in Story
#new_valid = pd.read_csv("data/Valid_Story_withVariable.csv")
#new_test = pd.read_csv("data/Test_Story_withVariable.csv")
#compute_themes(train_list, valid_list, new_valid, name_save = 'Valid_Story_withVariable')
#compute_themes(train_list, test_list, new_test, name_save = 'Test_Story_withVariable')
compute_themes(train_list, valid_list, df_posts_valid, name_save = 'Valid_Story_withVariable')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["Intrigue_Diversity"] = diversity_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["Intrigue_Sentence_Homogeneity"] = homogeneity_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["Intrigue_Consistence"] = consistence_


In [None]:
#Compute Presence of conflict
#new_valid = pd.read_csv("data/Valid_Story_withVariable.csv")
#new_test = pd.read_csv("data/Test_Story_withVariable.csv")
#compute_conflict(valid_list, new_valid, name_save = 'Valid_Story_withVariable')
#compute_conflict(test_list, new_test, name_save = 'Test_Story_withVariable')
compute_conflict(valid_list, df_posts_valid, name_save = 'Valid_Story_withVariable')

In [13]:
#Compute causal relations of story
new_valid = pd.read_csv("data/Valid_Story_withVariable.csv")
new_test = pd.read_csv("data/Test_Story_withVariable.csv")
compute_causality(valid_list, new_valid, name_save = 'Valid_Story_withVariable')
compute_causality(test_list, new_test, name_save = 'Test_Story_withVariable')

Downloading config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
#Compute temporality and space
#new_valid = pd.read_csv("data/Valid_Story_withVariable.csv")
#new_test = pd.read_csv("data/Test_Story_withVariable.csv")
#compute_chrono(valid_list, new_valid, name_save = 'Valid_Story_withVariable')
#compute_chrono(test_list, new_test, name_save = 'Test_Story_withVariable')
compute_chrono(valid_list, df_posts_valid, name_save = 'Valid_Story_withVariable')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["Presence_Dates"] = date_presence
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["Logic_order_temporality"] = logic_temp_order
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["Prop_temp_use"] = porp_time_change
A value is trying to be set on a copy of a slice from a Dat