In [None]:
### Installation of prerequisite packages and dependency adjustments

# !pip uninstall -y numpy spacy gensim tensorflow langchain pytensor cupy-cuda12x

# !pip install convokit

In [None]:
# -*- coding: utf-8 -*-
"""
Initial Cleanup and Analysis of ConvoKit Dataset

This script downloads the 'conversations-gone-awry-corpus' from the ConvoKit toolkit,
performs basic preprocessing, and outputs a cleaned dataset for further analysis.

Author: Suvro Mukherjee
"""

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from convokit import Corpus, download


In [None]:

# Seed every random number generator this notebook has in scope so that a
# Restart & Run All reproduces the same outputs. Seeding torch alone does not
# affect code that draws from Python's `random` module or from NumPy.
SEED = 42
random.seed(SEED)  # NOTE(review): corpus.random_utterance() below appears to use Python's `random` — confirm
np.random.seed(SEED)
torch.manual_seed(SEED)

# ===============================
# Download and Load ConvoKit Corpus
# ===============================
try:
    # Download the "conversations-gone-awry-corpus" dataset and load it
    corpus = Corpus(filename=download("conversations-gone-awry-corpus"))
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `corpus`, so
    # execution must stop here. Re-raising (instead of calling exit(), which
    # shuts the notebook kernel down) keeps the kernel alive and preserves
    # the full traceback for debugging.
    print(f"An error occurred while loading the corpus: {e}")
    raise
print("Corpus successfully loaded.")

# Print basic statistics about the corpus
corpus.print_summary_stats()

# Print a random utterance for inspection
print("Sample random utterance:", corpus.random_utterance())


No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading conversations-gone-awry-corpus to /root/.convokit/saved-corpora/conversations-gone-awry-corpus
Downloading conversations-gone-awry-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/conversations-gone-awry-corpus/conversations-gone-awry-corpus.zip (45.2MB)... Done
Corpus successfully loaded.
Number of Speakers: 8069
Number of Utterances: 30021
Number of Conversations: 4188
Sample random utterance: Utterance(id: '340169701.90227.90227', conversation_id: 340143345.87989.87989, reply-to: 340168552.89465.89465, speaker: Speaker(id: 'Storm Rider', vectors: [], meta: ConvoKitMeta({})), timestamp: 1264528499.0, text: "COgden, there is no comparison between the BofM and a letter that has been proved to be forged. In one there is no

In [None]:
# ===============================
# Extract Utterances DataFrame
# ===============================
# Convert utterances to a DataFrame; reset_index() exposes the index as the
# regular 'id' column that is renamed to 'Utterance-ID' below.
conv_wiki = corpus.get_utterances_dataframe().reset_index()

# Label each comment as 'Personal Attack' or 'Normal'.
# The flag is accepted as either a Python bool or a NumPy bool: an identity
# test (`x is True`) silently labels numpy.bool_ True values as 'Normal'.
# Anything that is not a boolean True (False, None, NaN, ...) stays 'Normal'.
attack_flag = conv_wiki['meta.comment_has_personal_attack']
is_attack = attack_flag.map(
    lambda flag: isinstance(flag, (bool, np.bool_)) and bool(flag)
).astype(bool)
conv_wiki['Comment Nature'] = np.where(is_attack, 'Personal Attack', 'Normal')

# Drop columns that are not needed downstream (including the raw flag, which
# is now represented by 'Comment Nature') and rename the remaining ones for
# consistency and clarity.
conv_wiki = (
    conv_wiki
    .drop(
        columns=[
            'reply_to',
            'meta.is_section_header',
            'meta.parsed',
            'meta.comment_has_personal_attack',
            'vectors',
        ]
    )
    .rename(
        columns={
            'id': 'Utterance-ID',
            'meta.toxicity': 'Toxicity',
        }
    )
)

# Check the range (max - min) of toxicity values
toxicity_range = conv_wiki['Toxicity'].max() - conv_wiki['Toxicity'].min()
print(f"Toxicity range: {toxicity_range}")

Toxicity range: 0.99723727


In [None]:
# ===============================
# Extract Conversations DataFrame
# ===============================
# Convert conversations to a DataFrame; reset_index() exposes the index as
# the 'id' column used as the merge key below.
conversation_df = corpus.get_conversations_dataframe().reset_index()

# Drop unnecessary columns
conversation_df = conversation_df.drop(
    columns=['meta.page_id', 'meta.page_title', 'vectors']
)

# Clean column names by removing a leading 'meta.' prefix.
# This is done with plain string slicing rather than `.str.replace('meta.', '')`
# because that call interprets the pattern as a regex on older pandas versions
# ('.' then matches any character) and replaces matches anywhere in the name,
# not only at the start.
meta_prefix = 'meta.'
conversation_df = conversation_df.rename(
    columns=lambda name: (
        name[len(meta_prefix):]
        if isinstance(name, str) and name.startswith(meta_prefix)
        else name
    )
)

# ===============================
# Merge Utterances and Conversations
# ===============================
# Attach conversation-level metadata to every utterance via the conversation ID.
# `validate='many_to_one'` makes pandas raise if a conversation ID appears more
# than once on the right side, which would otherwise silently duplicate rows.
talk_df = pd.merge(
    conv_wiki,
    conversation_df,
    how='inner',
    left_on='conversation_id',
    right_on='id',
    validate='many_to_one',
)

# Preview the merged DataFrame
print("Preview of merged DataFrame:")
talk_df.head(5)

Preview of merged DataFrame:


Unnamed: 0,Utterance-ID,timestamp,text,speaker,conversation_id,Toxicity,Comment Nature,id,pair_id,conversation_has_personal_attack,verified,pair_verified,annotation_year,split
0,146743638.12652.12652,1185295934.0,== [WIKI_LINK: WP:COMMONNAME] ==\n,Sirex98,146743638.12652.12652,0.0,Normal,146743638.12652.12652,143890867.11926.11926,False,True,True,2018,train
1,146743638.12667.12652,1185277934.0,I notice that earier that moved wiki_link to ...,Sirex98,146743638.12652.12652,0.078141,Normal,146743638.12652.12652,143890867.11926.11926,False,True,True,2018,train
2,146842219.12874.12874,1185310317.0,"Chen was known in the poker world as ""William""...",2005,146743638.12652.12652,0.031784,Normal,146743638.12652.12652,143890867.11926.11926,False,True,True,2018,train
3,146860774.13072.13072,1185316241.0,I see what you saying I just read his pokersta...,Sirex98,146743638.12652.12652,0.030405,Normal,146743638.12652.12652,143890867.11926.11926,False,True,True,2018,train
4,143890867.11926.11926,1184144351.0,==List of slang terms for poker hands==\n,WilyD,143890867.11926.11926,0.0,Normal,143890867.11926.11926,146743638.12652.12652,True,True,True,2018,train


In [None]:
# ===============================
# Save Processed Data
# ===============================
# Persist the merged utterance/conversation table as CSV.
# The DataFrame index is not written to the file.
output_file = 'compiled_data.csv'
talk_df.to_csv(path_or_buf=output_file, index=False)

# Confirm where the file was written
print(f"Augmented data saved to {output_file}")

Augmented data saved to compiled_data.csv
