In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
data_path = '/content/drive/MyDrive/GNLP/HW1/'

Создание датасета на основе диалогов из сериалов и фильмов STAR TREK (папки TOS, TAS и Movies).

In [3]:
!pip install DialogTag tensorflow keras -q

In [4]:
%%capture
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [5]:
import tensorflow as tf

In [6]:
!git clone https://github.com/varenc/star_trek_transcript_search


fatal: destination path 'star_trek_transcript_search' already exists and is not an empty directory.


In [7]:
%cd ./star_trek_transcript_search/scripts/

/content/star_trek_transcript_search/scripts


In [8]:
!ls

Discovery  DS9	Enterprise  Movies  NextGen  SNW  TAS  TOS  Voyager


In [9]:
import pandas as pd
import re
import os
import random

def parse_transcript(filepath):
    """Parse one transcript file into a list of dialogue rows.

    The file is read as UTF-8. Curly braces are normalised to square brackets,
    parenthesised text is removed, and the literal markers ``[OC]`` and
    ``[on monitor]`` are deleted. The remaining text is split on ``[...]``
    markers; each marker is followed by the dialogue lines that belong to it.

    Args:
        filepath: Path of the transcript text file.

    Returns:
        A list of ``[episode, scene, character, text]`` lists, one per line
        that contains a colon. ``episode`` is the integer following
        ``Episode Number:`` in the file, or ``None`` when that header is absent.
        Text that precedes the first bracketed marker is not parsed.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    # Normalise bracket types, then drop parenthesised text and speaker annotations.
    cleaned = raw.replace('{', '[').replace('}', ']')
    cleaned = re.sub(r'\([^\)]*\)', '', cleaned)
    cleaned = re.sub(r'\[OC\]|\[on monitor\]', '', cleaned)

    found = re.search(r'Episode Number: (\d+)', cleaned)
    episode = None if found is None else int(found.group(1))

    # The capturing group keeps the markers: odd positions hold "[...]" markers,
    # the even position right after each marker holds the text that follows it.
    pieces = re.split(r'(\[[^\]]+\])', cleaned)
    rows = []
    current_scene = "Unknown"

    for marker, body in zip(pieces[1::2], pieces[2::2]):
        label = marker.strip("[]")
        # Markers ending in "OC" / "on monitor" do not start a new scene.
        if not label.endswith(("OC", "on monitor")):
            current_scene = label

        for line in body.split('\n'):
            speaker, colon, spoken = line.partition(':')
            if not colon:
                continue  # not a "CHARACTER: text" line
            speaker = speaker.strip()
            # Strip [OC] and [on monitor] suffixes from character names.
            speaker = re.sub(r'\s*\[OC\]$', '', speaker, flags=re.IGNORECASE)
            speaker = re.sub(r'\s*\[on monitor\]$', '', speaker, flags=re.IGNORECASE)
            rows.append([episode, current_scene, speaker, spoken.strip()])

    return rows

# Transcript sub-folders to parse; they are resolved against the current
# working directory, so this cell must run from inside the `scripts` directory.
folders = ["TOS", "TAS", "Movies"]
TRANSCRIPT_COLUMNS = ["Episode", "Scene", "Character", "Text"]

# One DataFrame per folder. They are concatenated once after the loop instead of
# repeatedly concatenating onto an empty seed frame (concatenating empty frames
# is deprecated in recent pandas and emits a FutureWarning).
folder_frames = []

for folder in folders:
    current_folder_path = os.path.join(os.getcwd(), folder)
    # Sorted so that rows are produced in a stable, filename-based order.
    text_files = sorted(f for f in os.listdir(current_folder_path) if f.endswith('.txt'))

    all_data = []
    for filename in text_files:
        filepath = os.path.join(current_folder_path, filename)
        if os.path.isfile(filepath):
            all_data.extend(parse_transcript(filepath))

    # Folders that produced no rows are skipped so no empty frame reaches concat.
    if all_data:
        folder_frames.append(pd.DataFrame(all_data, columns=TRANSCRIPT_COLUMNS))

if folder_frames:
    final_df = pd.concat(folder_frames, ignore_index=True)
else:
    # Nothing was parsed: keep the expected columns so later cells still find them.
    final_df = pd.DataFrame(columns=TRANSCRIPT_COLUMNS)

# Verify the output
print(final_df.head())


  Episode   Scene Character                                               Text
0       1  Bridge     SPOCK                                 Check the circuit.
1       1  Bridge     TYLER                                All operating, sir.
2       1  Bridge     SPOCK  It can't be the screen then. Definitely someth...
3       1  Bridge     TYLER                      It could be these meteorites.
4       1  Bridge       ONE  No, it's something else. There's still somethi...


In [10]:
final_df.shape

(40285, 4)

In [11]:
final_df[:25]

Unnamed: 0,Episode,Scene,Character,Text
0,1,Bridge,SPOCK,Check the circuit.
1,1,Bridge,TYLER,"All operating, sir."
2,1,Bridge,SPOCK,It can't be the screen then. Definitely someth...
3,1,Bridge,TYLER,It could be these meteorites.
4,1,Bridge,ONE,"No, it's something else. There's still somethi..."
5,1,Bridge,TYLER,"It's coming at the speed of light, collision c..."
6,1,Bridge,ONE,"Evasive manoeuvres, sir?"
7,1,Bridge,PIKE,Steady as we go.
8,1,Bridge,GARISON,"It's a radio wave, sir. We're passing through ..."
9,1,Bridge,PIKE,They were keyed to cause interference and attr...


In [12]:
final_df['Text'][2]

"It can't be the screen then. Definitely something out there, Captain, headed this way."

In [13]:
final_df

Unnamed: 0,Episode,Scene,Character,Text
0,1,Bridge,SPOCK,Check the circuit.
1,1,Bridge,TYLER,"All operating, sir."
2,1,Bridge,SPOCK,It can't be the screen then. Definitely someth...
3,1,Bridge,TYLER,It could be these meteorites.
4,1,Bridge,ONE,"No, it's something else. There's still somethi..."
...,...,...,...,...
40280,,Ba'ku village hayfield,DATA,Bye.
40281,,Ba'ku village hayfield,SOJEF,"Mister Data, I hope we'll see you again."
40282,,Ba'ku village hayfield,ARTIM,Data! ...Don't forget. You have to have a litt...
40283,,Ba'ku village hayfield,RIKER,Good advice.


Формирование датасета с репликами "неправильных" ответов на основе транскрипта сериала Доктор Хаус.

In [14]:
md_df = pd.read_csv(data_path+"house_md_s3.csv")

In [15]:
md_df

Unnamed: 0,name,line
0,Arlene,"Mark? Mark! Mark, please ask your father if h..."
1,Mark,Dad? [The boy keeps talking but we only hear ...
2,Mark,"Mom, I don't know what he wants, you ask him."
3,Arlene,"You want a burger? [She mouths ""okay"" but we ..."
4,Cuddy,"The guy drove his wheelchair into a pool, Hou..."
...,...,...
10942,House,"Yeah, I must be."
10943,Esteban,But you're not.
10944,House,I don't think I am. I think I'm okay.
10945,Esteban,What are you going to do?


In [16]:
pdf = final_df.copy()

In [17]:
pdf.shape

(40285, 4)

In [18]:
pdf['Character'].value_counts()

Unnamed: 0_level_0,count
Character,Unnamed: 1_level_1
KIRK,11614
SPOCK,5757
MCCOY,2908
SCOTT,1846
SULU,1202
...,...
MATLZ,1
HELM,1
KLINGON VOICES,1
YOUNG SPOCK,1


Подготовка датасета для Bi-encoder

In [19]:
# Build one training row per line spoken by Spock:
#   QUESTION - the line immediately before Spock's line (whoever spoke it),
#   ANSWER   - Spock's line,
#   CONTEXT  - the lines before the question, limited to the current scene,
#   ANCHOR   - CONTEXT + "[SEP]" + QUESTION.
# Spock lines that open a scene get empty QUESTION / CONTEXT / ANCHOR.
TARGET_CHARACTER = 'spock'
CONTEXT_WINDOW = 6      # lines looked at before the answer (the question is the last of them)
WRONG_ANSWER_SEED = 42  # fixed so the sampled wrong answers are the same on every run

texts = final_df['Text'].tolist()
spock_lines_data = []

# NOTE(review): a scene change is detected by scene name only, so two
# consecutive scenes with the same name are treated as one scene.
current_scene = None
scene_start = 0                 # position of the first row of the current scene
scene_started_by_spock = False

for position, row in enumerate(final_df.itertuples(index=False)):
    is_spock = row.Character.lower() == TARGET_CHARACTER

    if current_scene != row.Scene:
        current_scene = row.Scene
        scene_start = position
        scene_started_by_spock = is_spock

    if not is_spock:
        # Somebody else has spoken, so later Spock lines in this scene have a question.
        scene_started_by_spock = False
        continue

    if scene_started_by_spock:
        # Spock opens the scene: there is nothing to answer.
        spock_lines_data.append({"QUESTION": "", "ANSWER": row.Text, "CONTEXT": "", "ANCHOR": ""})
        continue

    # A non-Spock line exists earlier in this scene, so position - 1 >= scene_start.
    question = texts[position - 1]

    # Never reach back past the start of the scene: lines from the previous
    # scene belong to a different conversation.
    context_start = max(scene_start, position - CONTEXT_WINDOW)
    context = ' '.join(texts[context_start:position - 1])
    anchor = context + "[SEP]" + question

    spock_lines_data.append({"QUESTION": question, "ANSWER": row.Text, "CONTEXT": context, "ANCHOR": anchor})

# Convert the list to a DataFrame and attach one randomly drawn House M.D. line
# per row as the "wrong" answer (drawn with replacement).
spock_lines_df = pd.DataFrame(spock_lines_data)
spock_lines_df["WRONG_ANSWER"] = (
    md_df['line']
    .sample(n=len(spock_lines_df), replace=True, random_state=WRONG_ANSWER_SEED)
    .reset_index(drop=True)
)

# Display the first few rows of the Spock lines DataFrame
spock_lines_df.head()


Unnamed: 0,QUESTION,ANSWER,CONTEXT,ANCHOR,WRONG_ANSWER
0,,Check the circuit.,,,"And you didn't beliEve her, so you bribed a j..."
1,"All operating, sir.",It can't be the screen then. Definitely someth...,Check the circuit.,"Check the circuit.[SEP]All operating, sir.",(Smiles. Then she sees House through the wind...
2,We've no ships or Earth colonies that far out.,Their call letters check with a survey expedit...,"Steady as we go. It's a radio wave, sir. We're...","Steady as we go. It's a radio wave, sir. We're...",What exactly is your pRoblem with me?
3,It would take that long for a radio beam to tr...,Records show the Talos group has never been ex...,They were keyed to cause interference and attr...,They were keyed to cause interference and attr...,Test his DNA for Ragged Red Fiber.
4,If they survived the crash.,"We aren't going to go, to be certain?",We've no ships or Earth colonies that far out....,We've no ships or Earth colonies that far out....,Lung biopsies usually come back negative so b...


In [20]:
spock_lines_df.shape

(5757, 5)

In [21]:
spock_lines_df["CONTEXT"][4]

"We've no ships or Earth colonies that far out. Their call letters check with a survey expedition. SS Columbia. It disappeared in that region approximately eighteen years ago. It would take that long for a radio beam to travel from there to here. Records show the Talos group has never been explored. Solar system similar to Earth, eleven planets. Number four seems to be Class M, oxygen atmosphere. Then they could still be alive, even after eighteen years."

In [22]:
spock_lines_df["ANCHOR"][4]

"We've no ships or Earth colonies that far out. Their call letters check with a survey expedition. SS Columbia. It disappeared in that region approximately eighteen years ago. It would take that long for a radio beam to travel from there to here. Records show the Talos group has never been explored. Solar system similar to Earth, eleven planets. Number four seems to be Class M, oxygen atmosphere. Then they could still be alive, even after eighteen years.[SEP]If they survived the crash."

Определение интента (Intent) реплик, на которые отвечает Спок (столбец QUESTION)

In [23]:
tf.keras.utils.disable_interactive_logging()

In [24]:
from dialog_tag import DialogTag

# Load the DialogTag classifier backed by the 'distilbert-base-uncased' weights.
model = DialogTag('distilbert-base-uncased')


def get_intent_tag(sentence):
    """Return the tag that the DialogTag model predicts for `sentence`."""
    predicted_tag = model.predict_tag(sentence)
    return predicted_tag


# Tag every QUESTION (the line Spock is replying to) and store the result.
question_tags = spock_lines_df['QUESTION'].apply(get_intent_tag)
spock_lines_df['INTENT_TAG'] = question_tags

distilbert-base-uncased found in cache. Loading model...


Some layers from the model checkpoint at /root/.dialog-tag/models/distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['dropout_59']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /root/.dialog-tag/models/distilbert-base-uncased and are newly initialized: ['dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [37]:
spock_lines_df

Unnamed: 0,QUESTION,ANSWER,CONTEXT,ANCHOR,WRONG_ANSWER,INTENT_TAG
0,,Check the circuit.,,,Does it matter?,Statement-non-opinion
1,"All operating, sir.",It can't be the screen then. Definitely someth...,Check the circuit.,"Check the circuit.[SEP]All operating, sir.",He was in the Navy not the Marines.,Statement-non-opinion
2,We've no ships or Earth colonies that far out.,Their call letters check with a survey expedit...,"Steady as we go. It's a radio wave, sir. We're...","Steady as we go. It's a radio wave, sir. We're...","This is not an act, I don't care if--",Statement-non-opinion
3,It would take that long for a radio beam to tr...,Records show the Talos group has never been ex...,They were keyed to cause interference and attr...,They were keyed to cause interference and attr...,[Puts his hand on her shoulder.] I'm proud of...,Statement-opinion
4,If they survived the crash.,"We aren't going to go, to be certain?",We've no ships or Earth colonies that far out....,We've no ships or Earth colonies that far out....,You're lousy with faces.,Statement-non-opinion
...,...,...,...,...,...,...
5752,,Alter circuit A.,,,The heart shows no sign of inflammation or...,Statement-non-opinion
5753,I'd give real money if he'd shut up.,Plate please.,...discuss the report on Phase two... Alter ci...,...discuss the report on Phase two... Alter ci...,"Hello yes, doctor House, its Esteban Hernande...",Statement-non-opinion
5754,,Key please.,,,"Got it, we're all idiots, what's your theory?",Statement-non-opinion
5755,Arrest those men!,Arrest yourself.,Out of the way! Out of the way! Mister Preside...,Out of the way! Out of the way! Mister Preside...,Your personal life is none of my business.,Quotation


In [38]:
spock_lines_df.to_pickle(data_path+"spock_lines.pkl")

In [39]:
pdf.to_pickle(data_path+"transcript.pkl")

Подготовка датасета для Reranker

In [40]:
# Real (question, Spock answer, context) triples; they carry LABEL 0 in this notebook.
reranker_columns = ['QUESTION', 'ANSWER', 'CONTEXT']
spock_lines_reranker = spock_lines_df[reranker_columns].assign(LABEL=0)

In [41]:
# Questions to pair with a wrong answer. Rows with an empty QUESTION are excluded.
N_NEGATIVES = 5207   # NOTE(review): presumably the number of non-empty questions when this was written - confirm
NEGATIVES_SEED = 42  # fixed so the sampled negatives are the same on every run

question_pool = spock_lines_df.loc[spock_lines_df["QUESTION"] != "", "QUESTION"].tolist()
# Capped at the pool size: random.sample raises ValueError when asked for more
# items than the population holds.
neg_lines = random.Random(NEGATIVES_SEED).sample(question_pool, min(N_NEGATIVES, len(question_pool)))

# Shuffle the WRONG_ANSWER column. The shuffled values are assigned back
# explicitly instead of calling random.shuffle on the column, which only works
# while `df[col]` hands out a view that writes through to the frame.
spock_lines_df["WRONG_ANSWER"] = (
    spock_lines_df["WRONG_ANSWER"].sample(frac=1, random_state=NEGATIVES_SEED).to_numpy()
)

In [42]:
# Negative examples: a real question and its own context, paired with a wrong
# answer. QUESTION and CONTEXT are taken from the same row so they stay aligned;
# previously the questions came from a shuffled sample while the contexts were
# simply the first rows of the frame, so the two were unrelated.
NEGATIVE_ROWS_SEED = 42  # fixed so the same rows are picked on every run

answerable_rows = spock_lines_df[spock_lines_df["QUESTION"] != ""]
# `neg_lines` only determines how many negatives are produced.
n_negatives = min(len(neg_lines), len(answerable_rows))

neg_lines_df = (
    answerable_rows[["QUESTION", "WRONG_ANSWER", "CONTEXT"]]
    .sample(n=n_negatives, random_state=NEGATIVE_ROWS_SEED)
    .rename(columns={"WRONG_ANSWER": "ANSWER"})
    .reset_index(drop=True)
)
# NOTE(review): negatives are labelled 1 and real pairs 0 - confirm this matches
# what the reranker training code expects.
neg_lines_df['LABEL'] = 1

In [43]:
# Stack real pairs and negatives, then shuffle the rows and renumber the index.
reranker_combined = pd.concat([spock_lines_reranker, neg_lines_df], ignore_index=True)
reranker_shuffled = reranker_combined.sample(frac=1)
spock_lines_reranker = reranker_shuffled.reset_index(drop=True)

In [44]:
# Keep only rows where both the question and the answer are non-empty strings.
has_question = spock_lines_reranker["QUESTION"] != ""
has_answer = spock_lines_reranker["ANSWER"] != ""
spock_lines_reranker = spock_lines_reranker[has_question & has_answer]

In [45]:
# CQA = context, question and answer joined with "[SEP]" into one input string.
context_part = spock_lines_reranker["CONTEXT"]
question_part = spock_lines_reranker["QUESTION"]
answer_part = spock_lines_reranker["ANSWER"]
spock_lines_reranker["CQA"] = context_part + "[SEP]" + question_part + "[SEP]" + answer_part

In [46]:
spock_lines_reranker['LABEL'].value_counts()

Unnamed: 0_level_0,count
LABEL,Unnamed: 1_level_1
1,5207
0,5207


In [47]:
spock_lines_reranker

Unnamed: 0,QUESTION,ANSWER,CONTEXT,LABEL,CQA
0,Scott here.,[voice rising] How can you Even support your ...,Get on with it. Why did you let him do it? I w...,1,Get on with it. Why did you let him do it? I w...
1,Speculation?,Because it's effective doesn't make it right.,What about medicine? Why no doctors? We haven'...,1,What about medicine? Why no doctors? We haven'...
2,Direct hit amidships by photon torpedo.,"In the beginning, but you can't tell me you d...","I'm sorry, Captain. Yes. You should've been a ...",1,"I'm sorry, Captain. Yes. You should've been a ..."
3,Too quickly. Bridge.,Freefall!,Aren't there certain mathematical problems whi...,0,Aren't there certain mathematical problems whi...
4,"All decks and divisions confirm, status red.",It's a good thing we didn't. Tightness in her...,The disease certainly is. How long do we have ...,1,The disease certainly is. How long do we have ...
...,...,...,...,...,...
10959,"Apparently it was not, Captain. Our tractor be...",And we are still increasing speed. Contact wit...,"Get ready to execute course change, Mister Sul...",0,"Get ready to execute course change, Mister Sul..."
10960,Why not?,Then go get yourself one and leave me alone.,"I'm weak, Captain, but not in difficulty. He m...",1,"I'm weak, Captain, but not in difficulty. He m..."
10961,The very reason for the existence of our stars...,"Captain, since I was included in the invitatio...","Speculation is unnecessary, Captain. The answe...",0,"Speculation is unnecessary, Captain. The answe..."
10962,"On audio, sir.",This is Spock.,"No more than an hour now, sir. Put all seconda...",0,"No more than an hour now, sir. Put all seconda..."


In [48]:
spock_lines_reranker.to_pickle(data_path+"spock_lines_reranker.pkl")

In [None]:
!pip freeze > /content/drive/MyDrive/GNLP/HW1/requirements.txt