In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

## Load the datasets
The dataset has two parts. `*_stances.csv` contains the individual records (a headline and its stance), each pointing to an article body through `Body ID`; there are fewer bodies than records. `*_bodies.csv` contains the article bodies, each of which can be referenced by many records in the stances file (a one-to-many relationship).

In [3]:
# Both CSV paths are relative to the directory the notebook is run from.
bodies_path = "./dataset/train_bodies.csv"
stances_path = "./dataset/train_stances.csv"

# Article bodies keyed by "Body ID", and the headline/stance records that point to them.
bodies_df = pd.read_csv(bodies_path)
records_df = pd.read_csv(stances_path)

View the head of each loaded dataset

In [4]:
bodies_df.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [5]:
len(bodies_df)

1683

In [6]:
records_df.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [7]:
len(records_df)

49972

In [8]:
records_df["Body ID"].nunique()

1683

## Merge datasets
The two datasets can be merged on `Body ID` to attach the corresponding article body to each record.

In [9]:
# Left join keeps every stance record and attaches its article body.
# validate="many_to_one" makes pandas raise a MergeError if "Body ID" is not
# unique in bodies_df, instead of silently duplicating records.
df = pd.merge(
    records_df,
    bodies_df,
    how="left",
    on="Body ID",
    validate="many_to_one",
)

In [10]:
df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\r\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [11]:
df.tail()

Unnamed: 0,Headline,Body ID,Stance,articleBody
49967,Urgent: The Leader of ISIL 'Abu Bakr al-Baghda...,1681,unrelated,(CNN) -- Boko Haram laughed off Nigeria's anno...
49968,Brian Williams slams social media for speculat...,2419,unrelated,Along with unveiling the Apple Watch earlier t...
49969,Mexico Says Missing Students Not Found In Firs...,1156,agree,The bodies found in a mass grave were confirme...
49970,US Lawmaker: Ten ISIS Fighters Have Been Appre...,1012,discuss,Rep. Duncan Hunter (R-CA) told Greta Van Suste...
49971,Shots Heard In Alleged Brown Shooting Recordin...,2044,unrelated,A married TV actor who met a young woman and k...


In [12]:
len(df)

49972

In [13]:
df["Headline"].nunique()

1648

In [14]:
# Number of distinct article texts. In the recorded run this is 1669, fewer than
# the 1683 distinct Body IDs, so some texts appear under more than one Body ID.
# NOTE(review): a split by Body ID alone can therefore still put identical text
# on both sides of the split; confirm whether that matters downstream.
df["articleBody"].nunique()

1669

In [24]:
# Distinct body identifiers referenced by the merged records, as a plain list
# so they can be indexed by position when splitting.
unique_body_ids = df["Body ID"].unique()
body_ids = list(unique_body_ids)
len(body_ids)

1683

In [28]:
def split(ids, left_ratio, seed=None):
    """Randomly partition ``ids`` into two disjoint lists.

    Parameters
    ----------
    ids : iterable
        Items to partition. Materialised into a list, so any iterable works.
    left_ratio : float
        Fraction of the items (between 0 and 1 inclusive) placed in the left
        split. The left size is ``int(len(ids) * left_ratio)``, i.e. rounded
        down.
    seed : int, optional
        When given, a dedicated ``random.Random(seed)`` is used so the same
        partition is produced on every call. When omitted, the module-level
        ``random`` generator is used, so ``random.seed(...)`` still applies.

    Returns
    -------
    (left_split, right_split) : tuple of lists
        ``left_split`` holds the sampled items in sampled order;
        ``right_split`` holds all remaining items in their original order.

    Raises
    ------
    ValueError
        If ``left_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= left_ratio <= 1:
        raise ValueError(f"left_ratio must be between 0 and 1, got {left_ratio!r}")

    ids = list(ids)
    total = len(ids)
    left_split_size = int(total * left_ratio)

    # Fall back to the shared module generator when no seed is requested.
    rng = random if seed is None else random.Random(seed)
    left_indexes = rng.sample(range(total), left_split_size)
    left_split = [ids[i] for i in left_indexes]

    # Walk positions in order (rather than iterating a set) so the right split
    # has a well-defined, stable ordering.
    chosen = set(left_indexes)
    right_split = [ids[i] for i in range(total) if i not in chosen]

    return left_split, right_split

In [30]:
# Fraction of body IDs assigned to the train/validation side; the remainder is
# used for the test set.
TRAIN_VALID_RATIO = 0.8
# Fixed seed so that Restart & Run All reproduces the same partition.
SPLIT_SEED = 42

random.seed(SPLIT_SEED)
left_body_ids, right_body_ids = split(body_ids, TRAIN_VALID_RATIO)

In [32]:
# Sanity check: no body ID is repeated within the left split.
len(left_body_ids) == len(set(left_body_ids))

True

In [33]:
# Sanity check: no body ID is repeated within the right split.
len(right_body_ids) == len(set(right_body_ids))

True

In [42]:
# Sanity check: the two splits share no body ID.
len(set(left_body_ids) & set(right_body_ids)) == 0

True

In [38]:
# Records whose article body was assigned to the left (train/validation) split.
in_left_split = df["Body ID"].isin(left_body_ids)
train_valid_df = df.loc[in_left_split]

In [37]:
# Records whose article body was assigned to the right (test) split.
in_right_split = df["Body ID"].isin(right_body_ids)
test_df = df.loc[in_right_split]

In [39]:
len(train_valid_df)

39978

In [40]:
len(test_df)

9994

In [41]:
train_valid_df["Headline"].nunique()

1648

In [43]:
# Distinct headlines in the test split. In the recorded run the train/validation
# split already contains all 1648 distinct headlines, so every headline counted
# here also occurs there: the split separates article bodies, not headlines.
test_df["Headline"].nunique()

1588