In [72]:
import pandas as pd
import numpy as np

In [41]:
# Input files, named once so the read calls below stay short.
topics_csv = 'data_2015_to_2021_with_topics.csv'
context_csv = 'data_2015_to_2021_with_context.csv'

# Read both exports with pandas' default parsing options.
df = pd.read_csv(topics_csv)
wiki_df = pd.read_csv(context_csv)

In [42]:
# Combine the two frames on their index labels. pd.merge defaults to an inner
# join, so only labels present in both frames survive.
df = pd.merge(df, wiki_df, left_index=True, right_index=True)

In [43]:
# Keep only the two columns the rest of the notebook reads.
df = df.loc[:, ['comments', 'context']]

In [44]:
# Drop rows that have no usable context.
# pd.read_csv converts empty cells and the literal text "nan" into real NaN values
# by default, so comparing against '' / 'nan' alone never matches those rows.
# Remove true missing values first, then any placeholder strings that remain.
df = df.dropna(subset=['context'])
# .copy() gives later cells an independent frame to add/overwrite columns on.
df = df[~df['context'].isin(['', 'nan'])].copy()

In [45]:
import ast

# Each 'comments' cell is parsed with ast.literal_eval, turning the stored
# Python-literal text back into the object it spells out.
df['comments'] = df['comments'].map(ast.literal_eval)

In [46]:
# Row predicate: used to keep only rows whose first comment has a 'topic' entry.
def z(x):
    """Return True when the first comment of row ``x`` contains 'topic'.

    ``x`` is anything indexable by 'comments' (a DataFrame row in this notebook).
    A row whose comment list is empty returns False instead of raising
    IndexError on ``comments[0]``.
    """
    comments = x['comments']
    return len(comments) > 0 and 'topic' in comments[0]

# Evaluate the predicate once per row, then keep the rows it accepted.
keep_rows = df.apply(z, axis=1)
df = df[keep_rows]

In [89]:
# Expand every list in 'comments' so each comment gets a row of its own;
# exploded rows keep the index label of the row they came from.
df_new = df.explode(column='comments')

In [90]:
# Record each row's index label as a regular column so it travels with the row.
df_new = df_new.assign(context_index=df_new.index)

In [91]:
# Give every exploded row a binary label drawn uniformly at random
# (1 = keep the row's own context, 0 = context gets swapped for a wrong one).
import random

# Seed the RNG so the labels, and every later draw from `random`, are identical on
# each Restart & Run All; 42 matches the random_state used for the train/val splits.
random.seed(42)
df_new['label'] = [random.randint(0, 1) for _ in range(len(df_new))]

In [92]:
# if label is 1, do not change the context
# if label is 0, change the context to a random context taken from a *different* row of df
def change_context(row, contexts=None):
    """Swap in a wrong context for rows labelled 0; return label-1 rows untouched.

    Parameters
    ----------
    row : pandas.Series
        One exploded comment row. Reads 'label' and 'context_index'; for
        label 0 the returned copy carries a replaced 'context'.
    contexts : pandas.DataFrame, optional
        Frame with a 'context' column whose index labels are the values stored
        in 'context_index'. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.Series
        The original row when label != 0, otherwise a copy with 'context'
        taken from a row of ``contexts`` that has a different index label.

    Raises
    ------
    ValueError
        If ``contexts`` holds fewer than two rows, so no other context exists.
    """
    if row['label'] != 0:
        return row

    if contexts is None:
        contexts = df

    n_contexts = len(contexts)
    if n_contexts < 2:
        # With a single candidate the rejection loop below could never finish.
        raise ValueError('need at least two contexts to draw an incorrect one')

    own_label = row['context_index']
    # 'context_index' is an index *label*, while the random draw is a *position*.
    # After filtering, labels and positions no longer coincide, so translate the
    # drawn position back to its label before deciding whether it is a different
    # row (assumes the index labels of `contexts` are unique).
    while True:
        position = random.randrange(n_contexts)
        if contexts.index[position] != own_label:
            break

    # Work on a copy: mutating the object handed in by DataFrame.apply is not
    # supported by pandas.
    row = row.copy()
    row['context'] = contexts['context'].iloc[position]
    return row

# Run change_context over every row (row-wise apply) and keep the returned rows.
df_new = df_new.apply(change_context, axis='columns')

In [93]:
# Keep only the rows whose context is not missing.
context_present = df_new['context'].notna()
df_new = df_new[context_present]

In [94]:
# Discard comments that carry no 'topic' entry.
def _mentions_topic(comment):
    """Return True when 'topic' is contained in the comment."""
    return 'topic' in comment


topic_mask = df_new['comments'].apply(_mentions_topic)
df_new = df_new[topic_mask]

In [95]:
# The comment's 'body' becomes the question text.
df_new['question'] = df_new['comments'].map(lambda comment: comment['body'])

# The comment's 'topic' is stored with surrounding whitespace stripped.
df_new['topic'] = df_new['comments'].map(lambda comment: comment['topic'].strip())


In [97]:
# Ranking dataset: one (question, context, label) triple per row, renumbered from 0.
question_ranking_data = df_new[['question', 'context', 'label']].reset_index(drop=True)

# Seeded 50/50 split: half the rows are sampled into train, the rest form val.
train = question_ranking_data.sample(frac=0.5, random_state=42)
in_train = question_ranking_data.index.isin(train.index)
val = question_ranking_data[~in_train]

# Write both halves to CSV without the index column.
for split, csv_path in (
    (train, 'question_ranking_data_train.csv'),
    (val, 'question_ranking_data_val.csv'),
):
    split.to_csv(csv_path, index=False)

In [38]:
# Build the generator input from each question's *own* context.
# By this point change_context has replaced 'context' with a random, unrelated
# context on every label-0 row, so reading df_new['context'] here would pair about
# half of the questions with the wrong passage. Look the true context up again in
# `df` through the 'context_index' label (requires df's index labels to be unique).
true_context = df_new['context_index'].map(df['context'])

# Rows whose own context is missing cannot produce a meaningful prompt; drop them.
has_context = true_context.notna().to_numpy()
df_new = df_new[has_context].copy()
true_context = true_context[has_context]

df_new['text'] = [
    f'<topic> {topic} <context> {context}'
    for topic, context in zip(df_new['topic'], true_context)
]

In [39]:
# Generator dataset: one (question, text) pair per row, renumbered from 0.
question_generator = df_new[['question', 'text']].reset_index(drop=True)

# Seeded 50/50 split: half the rows are sampled into train, the rest form val.
train = question_generator.sample(frac=0.5, random_state=42)
in_train = question_generator.index.isin(train.index)
val = question_generator[~in_train]

# Write both halves to CSV without the index column.
for split, csv_path in (
    (train, 'question_generator_train.csv'),
    (val, 'question_generator_val.csv'),
):
    split.to_csv(csv_path, index=False)