# Install

In [None]:
!pip install minicons
!pip install datasets
!pip install mosestokenizer
!pip install sacremoses

In [None]:
from torch.utils.data import DataLoader
import numpy as np
import json
import pandas as pd

from minicons import scorer
import torch
from torch.utils.data import DataLoader
import numpy as np
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import BertTokenizerFast, BertModel, BertForMaskedLM
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
import math
import pandas as pd
from numpy import *
from minicons.utils import character_span
import os

# Function

In [None]:
def read_csv(csv_link):
  """Load the CSV behind a spreadsheet link into a pandas DataFrame.

  The '/edit#gid=' fragment of the link is rewritten to
  '/export?format=csv&gid=' before the data is read (this looks like the
  Google Sheets edit-link -> CSV-export convention -- confirm). A link that
  does not contain that fragment is passed to pandas unchanged.

  Args:
    csv_link: link (or path) of the sheet to read.

  Returns:
    The parsed data, read with index_col=False.
  """
  edit_marker = '/edit#gid='
  export_marker = '/export?format=csv&gid='
  export_link = csv_link.replace(edit_marker, export_marker)
  return pd.read_csv(export_link, index_col=False)

In [None]:
def extract_lst_lp(row):
  """Return the log-probabilities stored in row['sentence_tokens_lp'].

  Each entry of row['sentence_tokens_lp'] is a (token, lp) pair; only the
  second element of every pair is kept, in the original order.

  e.g.
    row['sentence_tokens_lp'] = [('It', -1.5366992950439453), ('is', -0.263397216796875), ('the', -0.9581403732299805), ('vote', -9.064363479614258)]
    result = [-1.5366992950439453, -0.263397216796875, -0.9581403732299805, -9.064363479614258]
  """
  return [pair_token_lp[1] for pair_token_lp in row['sentence_tokens_lp']]
  
def compute_lst_pair_tokens_lp(row, model, language, model_type):
  """Score every token of row['sentence'] with the given model.

  e.g.
    Input: 'It is the vote'
    Output: [('It', -1.5366992950439453), ('is', -0.263397216796875), ('the', -0.9581403732299805), ('vote', -9.064363479614258)]

  Args:
    row: mapping / DataFrame row with a 'sentence' entry.
    model: object exposing token_score(text, surprisal=...), whose first
      result is the list of (token, lp) pairs for the sentence.
    language: language code; only 'fr' changes the behaviour.
    model_type: model family; only 'bert' changes the behaviour.

  Returns:
    The (token, lp) pairs of the sentence. For the French BERT setup the
    first pair is dropped (presumably a sentence-initial special token
    produced by that tokenizer -- TODO confirm).
  """
  scored_tokens = model.token_score(row['sentence'], surprisal = False)[0]
  is_french_bert = language == 'fr' and model_type == 'bert'
  return scored_tokens[1:] if is_french_bert else scored_tokens

def compute_surprisal_partiel(row, name, model, language, model_type):
  """Compute the surprisal of the target token(s) stored in row[name].

  The target text is tokenized with model.token_score and searched as a
  contiguous run inside the sentence tokens of row['sentence_tokens_lp'].

  Args:
    row: mapping / DataFrame row providing 'sentence_tokens_lp' (list of
      (token, lp) pairs), 'lst_lp' (the matching list of lp values) and the
      target text under `name`.
    name: key of the target text in `row` (e.g. 'zone1').
    model: object exposing token_score(text, surprisal=...).
    language: language code; 'fr' together with model_type 'bert' drops the
      first scored token of the target, as is done for the sentence itself.
    model_type: model family ('bert' or 'gpt').

  Returns:
    * 0 when row[name] is the string 'na';
    * the negated mean lp of the first matching run of sentence tokens;
    * if no run matches and the target is a single whitespace-separated
      word: the negated lp of the first sentence token equal to that word
      (if the word is '#', 0 is returned as soon as a sentence token fails
      to match it);
    * None otherwise, including when the first target token is found so
      close to the end of the sentence that the run cannot fit (in that
      case the single-word fallback is not attempted).
  """
  sentence_token_lp = row['sentence_tokens_lp']
  sentence_lps = row['lst_lp']

  if row[name] == 'na':
    return 0

  # tokenize the target the same way the sentence was tokenized
  target_token_lp = model.token_score(row[name], surprisal = False)[0]
  if language == 'fr' and model_type == 'bert':
    target_token_lp = target_token_lp[1:]
  target = [pair[0] for pair in target_token_lp]

  sentence_tokens = [pair[0] for pair in sentence_token_lp]
  n_target = len(target)

  # look for the first position where the whole target run matches
  start_index = None
  for i, token in enumerate(sentence_tokens):
    if token != target[0]:
      continue
    # the run would extend past the end of the sentence: give up entirely
    if i + n_target > len(sentence_tokens):
      return None
    if sentence_tokens[i:i + n_target] == target:
      start_index = i
      break

  # well matched: mean surprisal of the target tokens
  if start_index is not None:
    return -np.mean(sentence_lps[start_index:start_index + n_target])

  # no match: fall back to comparing the raw (untokenized) target word
  words = row[name].split()
  if len(words) != 1:
    return None
  word = words[0]
  for i, token in enumerate(sentence_tokens):
    if token == word:
      return -sentence_lps[i]
    if word == '#':
      return 0
  return None

In [None]:
# (language, model_type) -> (minicons scorer class name, pretrained model identifier)
_MODEL_SPECS = {
  ('en', 'bert'): ('MaskedLMScorer', 'bert-base-cased'),
  ('en', 'gpt'): ('IncrementalLMScorer', 'gpt2'),
  ('fr', 'bert'): ('MaskedLMScorer', 'flaubert/flaubert_base_cased'),
  ('fr', 'gpt'): ('IncrementalLMScorer', 'asi/gpt-fr-cased-small'),
}


def main(df, language, model_type, position, device, path_output_file, n_zones=8):
  """Score every sentence of `df` with a language model and save the result.

  The DataFrame is modified in place: the columns 'sentence_tokens_lp',
  'lst_lp' and 'mean_lp' are added (or overwritten), plus
  'surprisal_zone1' ... 'surprisal_zone<n_zones>' when position == 'zone'.
  Note that 'mean_lp' holds the *negated* mean of the per-token scores
  (the reduction passed to sequence_score flips the sign).

  Args:
    df: DataFrame with a 'sentence' column and, for position == 'zone',
      the columns 'zone1' ... 'zone<n_zones>'.
    language: 'en' or 'fr'.
    model_type: 'bert' or 'gpt'.
    position: 'zone' to additionally compute per-zone surprisal; any other
      value skips that step.
    device: device handed to the minicons scorer (e.g. 'cuda').
    path_output_file: path of the CSV file to write.
    n_zones: number of zone columns to process (defaults to the original 8).

  Raises:
    ValueError: if the (language, model_type) combination is not supported.
  """
  spec = _MODEL_SPECS.get((language, model_type))
  if spec is None:
    # previously this fell through and crashed later with an unbound `model`
    raise ValueError(
      'unsupported language/model_type combination: '
      + repr(language) + ', ' + repr(model_type))

  # load language model
  scorer_class_name, model_name = spec
  model = getattr(scorer, scorer_class_name)(model_name, device)

  # compute lp for each token of sentence
  df['sentence_tokens_lp'] = df.apply(lambda x: compute_lst_pair_tokens_lp(x, model, language, model_type), axis=1)
  # extract the list of lp of the tokens
  df['lst_lp'] = df.apply(lambda x: extract_lst_lp(x), axis=1)
  # compute the (negated) mean lp for each sentence and store results in df
  df['mean_lp'] = model.sequence_score(df['sentence'], reduction = lambda x: -x.mean(0).item())

  # compute mean surprisal for each zone if considering 'zone'
  if position == 'zone':
    for zone_number in range(1, n_zones + 1):
      zone = 'zone' + str(zone_number)
      # df.apply runs immediately, so the lambda sees the current `zone`
      df['surprisal_' + zone] = df.apply(lambda x: compute_surprisal_partiel(x, zone, model, language, model_type), axis=1)

  df.to_csv(path_output_file, index=False)

In [None]:
def run(path_file_dict, DIR, device='cuda'):
  """Run the BERT and GPT scoring for every experiment in `path_file_dict`.

  Args:
    path_file_dict: mapping from the link of an experiment's materials to
      its experiment number.
    DIR: prefix prepended (by string concatenation) to each output file
      name '<number>-bert.csv' / '<number>-gpt.csv'.
    device: device handed to the scorers.

  The language and the zone setting are derived from the experiment number
  on every iteration, so the result does not depend on the order of the
  dict (previously 'zone' stayed switched on for all experiments processed
  after the first experiment >= 7).
  """
  for path_file, nb_exp in path_file_dict.items():
    df_exp = read_csv(path_file)
    # The first three experiments are in English, the rest in French
    language = 'en' if nb_exp < 4 else 'fr'
    # Experiments 7 and 8 compute the surprisal of each zone
    position = 'zone' if nb_exp >= 7 else 'not zone'
    # store all results
    for model_type in ('bert', 'gpt'):
      main(df_exp, language, model_type, position, device, DIR+str(nb_exp)+'-'+model_type+'.csv')

# Run

In [None]:
# Prefix of every output file. Paths are built by plain string concatenation
# (e.g. DIR+'gradient_en_bert.csv'), so a non-empty value must end with a
# path separator; the empty string yields bare relative file names.
DIR = ''

In [None]:
# Part1
# Read materials (fill in the links of the English and French sheets)
df_en = read_csv('')
df_fr = read_csv('')

# Compute mean lp
# The frames read above are used directly (the former df_en_linear /
# df_fr_linear names were never defined), and the French GPT results get
# their own file instead of overwriting gradient_en_gpt.csv.
main(df_en, 'en', 'bert', 'not zone', 'cuda', DIR+'gradient_en_bert.csv')
main(df_en, 'en', 'gpt', 'not zone', 'cuda', DIR+'gradient_en_gpt.csv')
main(df_fr, 'fr', 'bert', 'not zone', 'cuda', DIR+'gradient_fr_bert.csv')
main(df_fr, 'fr', 'gpt', 'not zone', 'cuda', DIR+'gradient_fr_gpt.csv')

In [None]:
# Part2 and Part3
# Define dict of materials: link of each experiment's sheet -> experiment number.
# NOTE: every key must be a distinct link; with the placeholder '' repeated,
# the literal below collapses to the single entry {'': 8}.
path_file_dict =  {'':1, '':2, '':3, '':4, '':5, '':6, '':7, '':8}

# Compute mean lp (run requires the output prefix as its second argument)
run(path_file_dict, DIR)