In [1]:
import pandas as pd
import numpy as np
import json

## Step One: Download Datasets

Download the following datasets:
- Movie Dialogue Corpus: https://www.kaggle.com/datasets/Cornell-University/movie-dialog-corpus
- Chatbot Dataset Topical Chat: https://www.kaggle.com/datasets/arnavsharmaas/chatbot-dataset-topical-chat
- Human Conversation Training Data: https://www.kaggle.com/datasets/projjal1/human-conversation-training-data
- Conversation Meetings: https://www.kaggle.com/datasets/gogogaurav95/conversation-meetings
- Conversation JSON: https://www.kaggle.com/datasets/vaibhavgeek/conversation-json

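Create a folder named `raw_data` in the same directory as this notebook and place the downloaded data in it. The parsing code below reads `raw_data/conversations2.txt`, `raw_data/conversations3.txt`, `raw_data/human_chat.txt`, `raw_data/conversation.json`, `raw_data/topical_chat.csv`, and the `movie_lines.tsv` and `movie_conversations.tsv` files inside `raw_data/movie dialog corpus/`.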

In [2]:
def create_conversation_rows(conversations, n_context=7):
    '''
    Reshapes a list of dialogue between two individuals/entities into 0 or more rows of dialogue.

    Each row consists of a response followed by the n_context utterances that precede it,
    most recent first, which serve as context for the DialoGPT chatbot:
    [response, previous utterance, the one before that, ...].
    With the default n_context=7 every row has 8 entries.

    Parameters:
      conversations: ordered, sliceable sequence of utterances (oldest first)
      n_context: number of preceding utterances kept as context for each response

    Returns a list of rows. The list is empty when the conversation has n_context or
    fewer utterances, because no response then has a full context window.

    Raises ValueError if n_context is negative.
    '''
    if n_context < 0:
        raise ValueError("n_context must be non-negative")
    # Slide a window of n_context + 1 utterances over the conversation and reverse
    # each window so that the response comes first.
    return [conversations[i - n_context:i + 1][::-1]
            for i in range(n_context, len(conversations))]

In [3]:
def parse_conversation_json(filepath='raw_data/conversation.json'):
    '''
    Parses the Conversation JSON file and returns a list of conversation rows.

    Parameters:
      filepath: location of the JSON file; defaults to the path this notebook uses

    The top-level JSON object must have a 'conversations' key whose value is an
    iterable of conversations; each one is passed to create_conversation_rows and
    the resulting rows are collected into one flat list.
    '''
    # Decode as UTF-8 explicitly, like the other file parsers in this notebook,
    # rather than depending on the platform's default encoding.
    with open(filepath, encoding='utf8') as f:
        data = json.load(f)

    rows = []
    for conversation in data['conversations']:
        rows.extend(create_conversation_rows(conversation))
    return rows

In [4]:
def parse_conversation_txt(filepath):
    '''
    Reads a plain-text transcript and turns it into conversational rows.

    Used for the text files of the following datasets:
      - Human Conversation Training Dataset
      - Conversation Meetings Dataset

    Every non-empty line has the known speaker labels removed, is stripped of
    surrounding whitespace and then has its non-ASCII characters dropped. The
    whole file is treated as one conversation and passed to create_conversation_rows.

    Returns a list of conversational rows
    '''
    # Removed one after another, in this order, wherever they occur in a line
    speaker_labels = ("Tom:", "Anna:", "Lynn: ", "Jane: ", "Human 1:", "Human 2:")

    with open(filepath, encoding='utf8') as f:
        raw_lines = f.read().split('\n')

    conversation = []
    for raw_line in raw_lines:
        if raw_line == '':
            continue
        utterance = raw_line
        for label in speaker_labels:
            utterance = utterance.replace(label, "")
        # Whitespace is stripped before the non-ASCII characters are discarded
        utterance = utterance.strip()
        conversation.append(utterance.encode(encoding='ascii', errors='ignore').decode())
    return create_conversation_rows(conversation)

In [5]:
def parse_topical_chat_csv(filepath):
    '''
    Loads the Topical Chat CSV and converts it into a list of conversation rows.

    Records with a missing value in any column are discarded. The remaining records
    are grouped by 'conversation_id', and each group's 'message' values are passed
    to create_conversation_rows. The rows of all groups are returned as one flat list.
    '''
    chat_df = pd.read_csv(filepath).dropna()

    results = []
    # Walk the groups one by one and collect the rows each conversation yields
    for _, grp in chat_df.groupby('conversation_id'):
        results.extend(create_conversation_rows(grp['message'].tolist()))
    return results

In [6]:
def parse_movie_dialogue_corpus(lines_path='raw_data/movie dialog corpus/movie_lines.tsv',
                                conversations_path='raw_data/movie dialog corpus/movie_conversations.tsv'):
    '''
    Parses the entire movie dialog corpus and returns a list of conversation rows.

    Parameters:
      lines_path: tab-separated file with one line of dialogue per row; the first
                  field is the lineID and everything from the fifth field onwards
                  is the spoken text (the fields in between are not used here)
      conversations_path: tab-separated file without a header row and with four
                  columns (character1, character2, movieID, utterances); the
                  utterances column lists the lineIDs of one conversation in order

    Raises KeyError when a conversation refers to a lineID that is not present in
    lines_path.
    '''
    # lineID -> text. A plain dict keeps the lookup below cheap and always yields a
    # single string per lineID (the last occurrence wins if an ID is repeated).
    text_by_line_id = {}
    with open(lines_path, encoding='utf8') as f:
        for line in f.read().strip().split('\n'):
            elements = line.split('\t')
            # Everything after the fourth field is re-joined, so text that was
            # split on a tab comes back as one space-separated string.
            text_by_line_id[elements[0].replace('"', "")] = " ".join(elements[4:])

    movie_convos_df = pd.read_csv(conversations_path, delimiter='\t', header=None)
    movie_convos_df.columns = ['character1', 'character2', 'movieID', 'utterances']

    results = []
    for utterances in movie_convos_df['utterances']:
        # The parsing implies a bracketed, quoted, space-separated list of IDs,
        # presumably like ['L1' 'L2' 'L3'] - TODO confirm against the data file.
        # Splitting on any whitespace avoids empty IDs when spaces are repeated.
        line_ids = utterances.replace('[', '').replace(']', '').replace("'", "").split()
        conversation = [text_by_line_id[line_id] for line_id in line_ids]
        results.extend(create_conversation_rows(conversation))
    return results

## Step Two: Parse Dataset Files

In [7]:
# Collect the rows of every dataset into one list, in a fixed order: the three
# transcript text files, the JSON conversations, Topical Chat, then the movie corpus.
conversations = [
    *parse_conversation_txt("raw_data/conversations2.txt"),
    *parse_conversation_txt("raw_data/conversations3.txt"),
    *parse_conversation_txt("raw_data/human_chat.txt"),
    *parse_conversation_json(),
    *parse_topical_chat_csv('raw_data/topical_chat.csv'),
    *parse_movie_dialogue_corpus(),
]

## Step Three: Save Training Data

In [8]:
# One column for the response, then one per context utterance:
# 'context' followed by 'context/0' ... 'context/5'.
final_df = pd.DataFrame(
    conversations,
    columns=['response', 'context'] + [f'context/{i}' for i in range(6)],
)

In [9]:
# Keep only the rows in which neither the response nor any context column is an empty string
final_df2 = final_df[
    (final_df[['response', 'context', 'context/0', 'context/1', 'context/2',
               'context/3', 'context/4', 'context/5']] != '').all(axis=1)
]

In [10]:
# Write the filtered rows to a CSV file in the current working directory, leaving out the index
final_df2.to_csv(
    "final_conversations_dataset.csv",
    index=False,
)