# Combine the Post History, Comments, and Answers datasets, ordered by creation date

In [1]:
import csv
import os

import pandas as pd
from IPython.display import display

In [12]:
# Build one frame per source, each reduced to the same five columns, then stack them.
shared_columns = ['PostId', 'CreationDate', 'Name', 'Score', 'Text']

# Post history: 'Name' comes straight from the file (it is selected below without
# being created here). A 'Score' column is set to pd.NA for every row so the
# layout lines up with the other two sources.
history_df = (
    pd.read_csv("./data/raw/test/Post_History_90178.csv")
    .assign(Score=pd.NA)
    [shared_columns]
)

# Comments: label every row "Comments" in 'Name' to mark the entry type.
comments_df = (
    pd.read_csv("./data/raw/test/Post_Comments_90178.csv")
    .assign(Name="Comments")
    [shared_columns]
)

# Answers: rename 'ParentId' -> 'PostId' and 'Body' -> 'Text' so the key and text
# columns match the other frames, then label every row "Answers" in 'Name'.
answers_df = (
    pd.read_csv("./data/raw/test/Post_Answer_90178.csv")
    .rename(columns={'ParentId': 'PostId', 'Body': 'Text'})
    .assign(Name="Answers")
    [shared_columns]
)

# Stack history, comments and answers (in that order) under a fresh 0..n-1 index.
combined_df = pd.concat([history_df, comments_df, answers_df], ignore_index=True)


In [13]:
# Preview the concatenated (not yet date-sorted) frame using the notebook's rich
# table rendering. `display` is imported in the import cell at the top of the
# notebook so that all dependencies are declared in one place.
display(combined_df)

Unnamed: 0,PostId,CreationDate,Name,Score,Text
0,90178,2008-09-18 05:06:17,Initial Title,,Make a div fill the remaining screen space
1,90178,2008-09-18 05:06:17,Initial Body,,"I am currently working on a web application, w..."
2,90178,2008-09-18 05:06:17,Initial Tags,,<table><css><html>
3,90178,2008-09-18 05:29:36,Edit Body,,"I am currently working on a web application, w..."
4,90178,2008-09-18 07:54:17,Edit Body,,"I am currently working on a web application, w..."
...,...,...,...,...,...
65,90178,2022-11-03 14:00:36,Answers,4,<p>For me the easiest way to do this is by usi...
66,90178,2023-01-15 02:03:30,Answers,1,<p>My method makes use of <code>calc()</code> ...
67,90178,2023-02-12 23:06:54,Answers,14,"<p>Try this way:</p>\n<p><div class=""snippet"" ..."
68,90178,2023-02-25 18:18:58,Answers,-4,<p>After calculating the pixels of your conten...


In [16]:
# Convert CreationDate to datetime values so the sort below compares timestamps,
# not whatever representation the CSV loader produced.
combined_df['CreationDate'] = pd.to_datetime(combined_df['CreationDate'])

# Order all entries chronologically and renumber the index from 0.
# kind='mergesort' makes the sort stable: rows that share a timestamp (for example
# the Initial Title / Initial Body / Initial Tags rows, all created at the same
# second) keep their concatenation order. The default quicksort gives no such
# guarantee, so tied rows could come out in an arbitrary order.
combined_df = (
    combined_df
    .sort_values(by='CreationDate', kind='mergesort')
    .reset_index(drop=True)
)

# Write the combined, chronologically ordered timeline to disk without the index column.
combined_df.to_csv("./data/raw/test/Combined_90178.csv", index=False)
