### Imports

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

### Parse the Posts.xml. 

Note: This step may take around 20-30 mins

In [2]:
tree = ET.parse('Posts.xml')

### Extract necessary fields from xml

In [3]:
root = tree.getroot()

In [105]:
pt_title = []
pt_post_id = []
pt_post_type_id = []
pt_accepted_answer_id = []
pt_creation_date = []
pt_score = []
pt_body = []
pt_tags = []
pt_parent_id = []

# Iterate through the elements in the XML file
for post in tqdm(root):
    # Extract the necessary information
    post_id = post.get('Id')
    post_type_id = post.get('PostTypeId')
    accepted_answer_id = post.get('AcceptedAnswerId')
    creation_date = post.get('CreationDate')
    score = post.get('Score')
    body = post.get('Body')
    tag = post.get("Tags")
    title = post.get("Title")
    parent_id = post.get("ParentId")
    
    # Print the extracted information
#     if tag and "pytorch" in tag:
#         print('Post ID:', post_id)
#         print('Post Type ID:', post_type_id)
#         print('Accepted Answer ID:', accepted_answer_id)
#         print('Creation Date:', creation_date)
#         print('Score:', score)
#         print('Body:', body)
#         print("Tags: ", tag)
    pt_post_id.append(post_id)
    pt_post_type_id.append(post_type_id)
    pt_accepted_answer_id.append(accepted_answer_id)
    pt_creation_date.append(creation_date)
    pt_score.append(score)
    pt_body.append(body)
    pt_tags.append(tag)
    pt_title.append(title)
    pt_parent_id.append(parent_id)
    
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57721548/57721548 [02:00<00:00, 480581.42it/s]


In [106]:
len(pt_post_id)

57721548

In [107]:
questions_df = pd.DataFrame({
    "pt_post_id" : pt_post_id,
    "pt_post_type_id": pt_post_type_id,
    "pt_accepted_answer_id": pt_accepted_answer_id,
    "pt_creation_date": pt_creation_date,
    "pt_score": pt_score,
    "pt_title": pt_title,
    "pt_body": pt_body,
    "pt_tags": pt_tags,
    "pt_parent_id": pt_parent_id
})

In [108]:
questions_df.shape

(57721548, 9)

In [188]:
questions_df[questions_df.pt_post_type_id == "1"].shape

(23273009, 9)

### Query and extract pytorch posts

In [109]:
questions_df["pt_tags"] = questions_df["pt_tags"].str.lower()

In [110]:
pt_questions_df = questions_df[(questions_df["pt_tags"].notna()) & (questions_df["pt_tags"].str.contains("pytorch"))]

### Lets split the dataframe into two parts

1. Questions which has accepted answer
2. Questions which doesn't have accepted answers

In [111]:
questions_with_accepted_df = pt_questions_df[pt_questions_df.pt_accepted_answer_id.notna()]
questions_without_accepted_df = pt_questions_df[pt_questions_df.pt_accepted_answer_id.isnull()]

In [112]:
assert pt_questions_df.shape[0] == (questions_with_accepted_df.shape[0] + questions_without_accepted_df.shape[0])

### For Each question in questions_with_accepted_df, lets query the database and find out the accepted answer post. 

To find the accepted answer from db, we need to query `AcceptedAnswerId` from question and search for `Id` as `AcceptedAnswerId` and `PostTypeId` == "2"

In [113]:
questions_with_accepted_df.shape

(7558, 9)

In [114]:
accepted_answer_df = questions_df[questions_df.pt_post_id.isin(questions_with_accepted_df.pt_accepted_answer_id)]

In [115]:
assert questions_with_accepted_df.shape[0] == accepted_answer_df.shape[0]

In [116]:
accepted_answer_body_list = accepted_answer_df.pt_body.tolist()

In [117]:
questions_with_accepted_df["pt_answer"] = accepted_answer_body_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_with_accepted_df["pt_answer"] = accepted_answer_body_list


In [118]:
questions_with_accepted_df.head()

Unnamed: 0,pt_post_id,pt_post_type_id,pt_accepted_answer_id,pt_creation_date,pt_score,pt_title,pt_body,pt_tags,pt_parent_id,pt_answer
27837861,34750268,1,34762233,2016-01-12T17:36:25.473,9,Extracting the top-k value-indices from a 1-D ...,<p>Given a 1-D tensor in Torch (<code>torch.Te...,<python><lua><pytorch><torch>,,"<p>As of pull request <a href=""https://github...."
30769673,38543850,1,38676842,2016-07-23T16:15:43.967,40,How to Display Custom Images in Tensorboard (e...,"<p>The <a href=""https://github.com/tensorflow/...",<python><tensorflow><matplotlib><pytorch><tens...,,<p>It is quite easy to do if you have the imag...
33236300,41767005,1,43824857,2017-01-20T15:22:08.063,11,Python wheels: cp27mu not supported,"<p>I'm trying to install pytorch (<a href=""htt...",<python><linux><unicode><pytorch>,,"<p>Yes, that is possible. Just create the obje..."
33307877,41861354,1,54261158,2017-01-25T20:45:35.297,8,Loading Torch7 trained models (.t7) in PyTorch,<p>I am using Torch7 library for implementing ...,<python><lua><pytorch><torch><pre-trained-model>,,<p><code>view()</code> reshapes the tensor wit...
33355427,41924453,1,42054194,2017-01-29T18:31:24.687,65,PyTorch: How to use DataLoaders for custom Dat...,<p>How to make use of the <code>torch.utils.da...,<python><torch><pytorch>,,<p>While you will not get as detailed informat...


### For questions without accepted answers

#### From the whole dump lets extract the rows which contains only answers

In [120]:
answers_df = questions_df[questions_df.pt_post_type_id == "2"]

In [121]:
answers_df.shape

(34337401, 9)

#### Extract the rows where the parent id matches the post id of not accepted df

In [162]:
questions_without_accepted_df.shape

(12279, 9)

In [163]:
answers_not_accepted_df = answers_df[answers_df.pt_parent_id.isin(questions_without_accepted_df.pt_post_id)]

In [164]:
answers_not_accepted_df.shape

(8596, 9)

In [165]:
answers_not_accepted_df.pt_parent_id.nunique()

6638

In [166]:
answers_not_accepted_df.sort_values(by=["pt_score"], inplace=True, ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  answers_not_accepted_df.sort_values(by=["pt_score"], inplace=True, ascending=False)


In [167]:
answers_not_accepted_df["pt_score"] = answers_not_accepted_df["pt_score"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  answers_not_accepted_df["pt_score"] = answers_not_accepted_df["pt_score"].astype(int)


In [168]:
parent_id_with_max_score_df = answers_not_accepted_df[answers_not_accepted_df.pt_score>=0].groupby("pt_parent_id", sort=False)["pt_score"].max().reset_index()



In [169]:
parent_id_with_max_score_df.columns

Index(['pt_parent_id', 'pt_score'], dtype='object')

In [170]:
answers_not_accepted_df = answers_not_accepted_df.merge(parent_id_with_max_score_df, on=['pt_parent_id', 'pt_score'], how="inner")[["pt_parent_id", "pt_body"]]

In [171]:
answers_not_accepted_df.rename(columns={"pt_parent_id": "pt_post_id", "pt_body": "pt_answer"}, inplace=True)

In [173]:
answers_not_accepted_df.shape

(7035, 2)

In [176]:
answers_without_accepted_df = questions_without_accepted_df.merge(answers_not_accepted_df, on="pt_post_id", how="inner")

In [178]:
answers_without_accepted_df.shape

(7035, 10)

In [180]:
questions_with_accepted_df.shape

(7558, 10)

In [181]:
questions_with_accepted_df.columns

Index(['pt_post_id', 'pt_post_type_id', 'pt_accepted_answer_id',
       'pt_creation_date', 'pt_score', 'pt_title', 'pt_body', 'pt_tags',
       'pt_parent_id', 'pt_answer'],
      dtype='object')

In [182]:
answers_without_accepted_df.columns

Index(['pt_post_id', 'pt_post_type_id', 'pt_accepted_answer_id',
       'pt_creation_date', 'pt_score', 'pt_title', 'pt_body', 'pt_tags',
       'pt_parent_id', 'pt_answer'],
      dtype='object')

In [183]:
pt_question_answers_df = pd.concat([questions_with_accepted_df, answers_without_accepted_df], axis=0)

In [185]:
pt_question_answers_df.to_csv("pt_question_answers.csv")