In [1]:
# Mount Google Drive into the runtime's filesystem at /content/drive so the
# corpus archive stored under MyDrive can be read by the cells below.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import zipfile
import os
import pandas as pd
import ast
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Archive holding the corpus on the mounted Drive
zip_path = '/content/drive/MyDrive/Data/movie_corpus.zip'

# Directory the archive is unpacked into (reused by the loading cells)
extracted_path = '/content/movie_corpus'

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extracted_path)

# List the unpacked files as a quick sanity check
print(os.listdir(extracted_path))

['movie_titles_metadata.txt', 'movie_characters_metadata.txt', '.DS_Store', 'movie_lines.txt', 'raw_script_urls.txt', 'chameleons.pdf', 'movie_conversations.txt', 'README.txt']


In [7]:
# Load movie lines
lines_path = os.path.join(extracted_path, 'movie_lines.txt')
conversations_path = os.path.join(extracted_path, 'movie_conversations.txt')

# Fields in both files are delimited by the literal token '+++$+++', which is
# padded with spaces on either side. The pattern is written as a raw string so
# the backslashes reach the regex engine untouched, and it also consumes the
# padding; otherwise every value keeps stray whitespace (e.g. 'L1045 ') and
# later look-ups by line ID silently miss.
field_sep = r'[ \t]*\+\+\+\$\+\+\+[ \t]*'

# Read movie_lines.txt
# 'sep' is the separator by which fields are delimited in the txt file
# 'header=None' means that the txt file has no header row
# 'names' lets us supply the column headers ourselves
# 'dtype=str' with 'keep_default_na=False' keeps every field as text, so an
# empty utterance stays '' and dialogue such as "NA" or "None" is not turned
# into NaN
lines = pd.read_csv(
    lines_path,
    sep=field_sep,
    header=None,
    engine='python',
    names=['lineID', 'characterID', 'movieID', 'character', 'text'],
    encoding='latin-1',
    dtype=str,
    keep_default_na=False,
)

# Read movie_conversations.txt (same delimiter and options as above)
conversations = pd.read_csv(
    conversations_path,
    sep=field_sep,
    header=None,
    engine='python',
    names=['character1ID', 'character2ID', 'movieID', 'utteranceIDs'],
    encoding='latin-1',
    dtype=str,
    keep_default_na=False,
)

In [8]:
# Create a dictionary to map each line's ID to its text.
# Keys are stripped because splitting on '+++$+++' alone leaves the padding
# spaces attached to each field (e.g. 'L1045 '), which would make every
# look-up by a clean ID such as 'L1045' fall back to the default.
# Text that is not a string (e.g. NaN for an empty utterance) is stored as ''
# so downstream string processing never receives a float.
id2line = {
    str(line_id).strip(): (text.strip() if isinstance(text, str) else '')
    for line_id, text in zip(lines['lineID'], lines['text'])
}

# This will be the sample output:
# {
#    'L1': 'Hello, how are you?',
#    'L2': "I'm fine, thank you.",
#    'L3': 'Good morning!',
#    'L4': 'Hi there.'
# }

In [11]:
# Extract conversations
# This will be a list of lists, where each inner list holds the utterance
# texts of one conversation, in order.
#
# 'utteranceIDs' is the dataframe column holding the string representation of
# a list of line IDs, e.g. "['L194', 'L195']". The value is stripped before
# parsing because surrounding whitespace left over from the field separator
# makes ast.literal_eval fail on Python versions that do not strip leading
# whitespace themselves. IDs with no matching line map to ''.
conversations_data = [
    [id2line.get(uid.strip(), '') for uid in ast.literal_eval(raw_ids.strip())]
    for raw_ids in conversations['utteranceIDs']
]


In [13]:
def preprocess_text(text):
    """Normalise one utterance into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. expand common English contractions ("i'm" -> "i am", "'ll" -> " will");
      3. collapse every run of characters other than a-z / 0-9 into one space;
      4. trim leading and trailing spaces.

    Contractions must be expanded *before* punctuation is removed: once the
    apostrophe has been replaced by a space, none of the contraction patterns
    can match any more ("i'm" would already have become "i m").

    Args:
        text: The raw utterance. Any non-string value (for example the NaN
            pandas uses for a missing field) is treated as an empty utterance.

    Returns:
        The cleaned string; '' when nothing alphanumeric remains.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Replace contractions and common abbreviations with full forms.
    # Word boundaries keep the whole-word rules from firing inside longer words.
    text = re.sub(r"\bi'm\b", "i am", text)
    text = re.sub(r"\bhe's\b", "he is", text)
    text = re.sub(r"\bshe's\b", "she is", text)
    text = re.sub(r"\bthat's\b", "that is", text)
    text = re.sub(r"\bwhat's\b", "what is", text)
    text = re.sub(r"\bwhere's\b", "where is", text)
    # Irregular negations first, then the generic "n't" suffix
    text = re.sub(r"\bwon't\b", "will not", text)
    text = re.sub(r"\bcan't\b", "can not", text)
    text = re.sub(r"n't\b", " not", text)
    text = re.sub(r"'ll\b", " will", text)
    text = re.sub(r"'ve\b", " have", text)
    text = re.sub(r"'re\b", " are", text)
    text = re.sub(r"'d\b", " would", text)

    # Remove punctuation and every other non-alphanumeric character
    # (the text is already lowercase, so a-z covers all letters kept)
    text = re.sub(r"[^a-z0-9]+", " ", text)

    # Remove leading and trailing spaces
    return text.strip()

# Clean every utterance while keeping the conversation grouping intact:
# the result has one inner list of cleaned strings per conversation.
preprocessed_conversations = [
    [preprocess_text(utterance) for utterance in dialogue]
    for dialogue in conversations_data
]

In [14]:

# Build aligned (input, target) pairs: within each conversation, every
# utterance except the last is an input and the utterance that follows it
# is the corresponding target.
input_texts = []
target_texts = []

for dialogue in preprocessed_conversations:
    # All but the last utterance -> inputs; all but the first -> targets.
    # Conversations with fewer than two utterances contribute nothing.
    input_texts.extend(dialogue[:-1])
    target_texts.extend(dialogue[1:])

In [None]:

# Tokenize the input and target texts.
# Fitting on both lists together gives one shared vocabulary: the tokenizer
# splits the texts into words, counts how often each word occurs and assigns
# every word a unique integer, the most frequent words getting the smallest
# indices (e.g. 1: 'i' if 'i' occurred most, 2: 'am' if it came second).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)

# Convert the texts to sequences of integers, e.g.
#   'hello how are you'   -> [6, 7, 8, 5]
#   'i am fine thank you' -> [1, 2, 3, 4, 5]
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# Length of the longest sequence on either side
max_length = max(
    max(map(len, input_sequences)),
    max(map(len, target_sequences)),
)

# Pad shorter sequences with zeros at the end so that inputs and targets
# all share the same length
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_length, padding='post')


In [None]:
# Next steps: split the data into training and test sets,
# then choose a model architecture (e.g. an LSTM or a Transformer).