# Dataset Preparation
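This notebook loads the raw training data from `data/data_train.json` and flattens it into two tables, a claims dataset and an answers dataset, which are saved as CSV files under `data/postprocessed/`.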

## Libraries

In [1]:
## Libraries
import json
import os

import pandas as pd

### Load the dataset

In [2]:
# Read the raw training split into `dataset` (iterated claim by claim further down).
# The encoding is pinned to UTF-8 so that non-ASCII text in the file (e.g. '…', '’')
# is decoded the same way on every platform instead of with the locale default.
with open('data//data_train.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

### Claims Dataset
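This dataset contains one row per claim, with the following columns:
- claim: the text of the claim made by the speaker
- label: the verdict assigned to the claim (e.g. `Supported`, `Refuted`)
- justification: a short explanation of why the claim received its label
- claim_date: the date the claim was made (day-month-year in the raw data, e.g. `25-8-2020`)
- speaker: who made the claim
- reporting_source: where the claim was reported (e.g. a speech, YouTube, Facebook)
- location_ISO_code: the ISO country code of the location where the claim was made (e.g. `US`, `KE`)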

In [3]:
# Build the claims' dataset: keep only the claim-level fields of every record.
# The field names are listed once so the selection cannot drift between key and lookup.
CLAIM_FIELDS = [
    'claim',
    'label',
    'justification',
    'claim_date',
    'speaker',
    'reporting_source',
    'location_ISO_code',
]
claims = [{field: claim[field] for field in CLAIM_FIELDS} for claim in dataset]
# Show a short preview instead of dumping every record into the cell output
claims[:3]

[{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.',
  'label': 'Supported',
  'justification': 'No former experience stated.',
  'claim_date': '25-8-2020',
  'speaker': 'Pam Bondi',
  'reporting_source': 'Speech at The Republican National Convention',
  'location_ISO_code': 'US'},
 {'claim': 'Donald Trump delivered the largest tax cuts in American history.',
  'label': 'Refuted',
  'justification': 'Three tax bills have been larger than that of Donald Trump',
  'claim_date': '25-8-2020',
  'speaker': 'Eric Trump',
  'reporting_source': 'Speech at The Republican National Convention',
  'location_ISO_code': 'US'},
 {'claim': 'In Nigeria … in terms of revenue share, 20% goes to the local government.',
  'label': 'Supported',
  'justification': 'The answer and source shows that the claim is supported as the percentage to local government is the same.',
  'claim_date': '25-8-2020',
  'speaker': 'Raila Odinga',
  'reporting_sou

In [4]:
# Tabulate the claim records: one row per claim, one column per field
claims_df = pd.DataFrame(data=claims)

In [5]:
# Preview the first five claims
claims_df.head(n=5)

Unnamed: 0,claim,label,justification,claim_date,speaker,reporting_source,location_ISO_code
0,Hunter Biden had no experience in Ukraine or i...,Supported,No former experience stated.,25-8-2020,Pam Bondi,Speech at The Republican National Convention,US
1,Donald Trump delivered the largest tax cuts in...,Refuted,Three tax bills have been larger than that of ...,25-8-2020,Eric Trump,Speech at The Republican National Convention,US
2,"In Nigeria … in terms of revenue share, 20% go...",Supported,The answer and source shows that the claim is ...,25-8-2020,Raila Odinga,YouTube,KE
3,Biden has pledged to stop border wall construc...,Supported,This claim should have been split into two par...,25-8-2020,Eric Trump,Speech at The Republican National Convention,US
4,"After the police shooting of Jacob Blake, Gov....",Refuted,Governor Evers did call for peace in a video s...,25-8-2020,Senator Howard Marklein,Facebook,US


In [7]:
# Parse the day-month-year strings of 'claim_date' into pandas datetimes,
# then store the parsed values back in the same column
parsed_claim_dates = pd.to_datetime(claims_df['claim_date'], format='%d-%m-%Y')
claims_df['claim_date'] = parsed_claim_dates

In [8]:
# Make sure the output folder exists first: to_csv does not create missing directories
os.makedirs('./data/postprocessed', exist_ok=True)
# index=True writes the row index as the first CSV column
claims_df.to_csv('./data/postprocessed/claims.csv', index=True)

### Answers Dataset
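This dataset contains one row per answer, with the following columns:
- answer: the text of the answer to the question (for answers of type `Boolean`, the accompanying explanation is used instead)
- source_medium: the medium where the answer was found (e.g. `Web text`, `Web table`)
- source_url: the URL of the source
- question: the question the answer responds to
- claim_id: the zero-based position of the parent claim in the raw dataset, which is also that claim's row index in the claims dataset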

In [9]:
# Flatten claim -> question -> answer into one record per answer.
# `claim_id` is the zero-based position of the claim in `dataset`.
answers = []
for claim_id, claim in enumerate(dataset):
    for question in claim['questions']:
        for answer in question['answers']:
            # Boolean answers contribute their explanation text; every other
            # answer type contributes its 'answer' field.
            if answer['answer_type'] == 'Boolean':
                answer_text = answer['boolean_explanation']
            else:
                answer_text = answer['answer']
            answers.append({
                'answer': answer_text,
                'source_medium': answer['source_medium'],
                'source_url': answer['source_url'],
                'question': question['question'],
                'claim_id': claim_id,
            })
# Show a short preview instead of dumping every record into the cell output
answers[:3]

[{'answer': "Hunter bidens previous career history does not include work for energy company's.",
  'source_medium': 'Web text',
  'source_url': 'https://en.wikipedia.org/wiki/Hunter_Biden',
  'question': 'Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014',
  'claim_id': 0},
 {'answer': "Hunter Bidens previous career history does not include working with Ukrainian company's.",
  'source_medium': 'Web text',
  'source_url': 'https://en.wikipedia.org/wiki/Hunter_Biden',
  'question': 'Did Hunter Biden have any experience in Ukraine at the time he joined the board of the  Burisma energy company in 2014',
  'claim_id': 0},
 {'answer': 'This tax cut is the 8th largest as a percent of Gross Domestic Product (GDP) since 1918 and the 4th largest in inflation-adjusted dollars.',
  'source_medium': 'Web text',
  'source_url': 'https://www.crfb.org/blogs/president-trumps-tax-cut-largest-history-yet',
  'question': 'Did 

In [10]:
# Tabulate the answer records: one row per answer
answers_df = pd.DataFrame(data=answers)

In [11]:
# Preview the first five answers
answers_df.head(n=5)

Unnamed: 0,answer,source_medium,source_url,question,claim_id
0,Hunter bidens previous career history does not...,Web text,https://en.wikipedia.org/wiki/Hunter_Biden,Did Hunter Biden have any experience in the en...,0
1,Hunter Bidens previous career history does not...,Web text,https://en.wikipedia.org/wiki/Hunter_Biden,Did Hunter Biden have any experience in Ukrain...,0
2,This tax cut is the 8th largest as a percent o...,Web text,https://www.crfb.org/blogs/president-trumps-ta...,Did the 2017 tax bill deliver the largest tax ...,1
3,Three tax bills have been larger: American Tax...,Web table,https://www.treasury.gov/resource-center/tax-p...,Has there been larger tax bills than the 2017 ...,1
4,"After months of an often fractious stalemate, ...",Web text,https://web.archive.org/web/20210307003741/htt...,Kenya’s ex-prime minister Odinga mangles his n...,2


In [12]:
# Make sure the output folder exists first: to_csv does not create missing directories
os.makedirs('./data/postprocessed', exist_ok=True)
# index=True writes the row index as the first CSV column
answers_df.to_csv('./data/postprocessed/answers.csv', index=True)