In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Training split: the bodies table is joined to the stances table on "Body ID" further down.
train_bodies = pd.read_csv("./dataset/train_bodies.csv")
train_stances = pd.read_csv("./dataset/train_stances.csv")

# Competition test split, loaded as the same pair of tables.
test_bodies = pd.read_csv("./dataset/competition_test_bodies.csv")
test_stances = pd.read_csv("./dataset/competition_test_stances.csv")

In [3]:
# How many stance rows each distinct training headline has, most frequent first.
train_stances["Headline"].value_counts()

ISIL Beheads American Photojournalist in Iraq                                                      127
WHO says reports of suspected Ebola cases in Iraq are untrue                                       124
James Foley remembered as 'brave and tireless' journalist                                          121
Islamic Militants Post Video Claiming to Show Beheading of U.S. Journalist                         118
US officials: Video shows American's execution                                                     112
                                                                                                  ... 
Apple hopes to sell over 50 million watches in 2015                                                  2
Mom Calls 911 On Masturbating Teenage Son; Boy Arrested, Charged With New ‘Self-Rape’ State Law      2
Federal Judge: Enough With the Stupid Names                                                          2
Sushi lover's entire body left riddled with WORMS after eating contaminat

In [4]:
# Same per-headline row count for the test stances, most frequent first.
test_stances["Headline"].value_counts()

Source: Joan Rivers' doc did biopsy, selfie                                                                                                                  160
Joan Rivers Personal Doctor Allegedly Took A Selfie Before Her Biopsy And Cardiac Arrest                                                                     138
‘Three-boobed’ woman: They’re not fake                                                                                                                        92
Adopting Potential Werewolves Is Routine Business for Argentine Presidents                                                                                    92
Justin Bieber Basically Saves A Russian Man From A Bear                                                                                                       88
                                                                                                                                                            ... 
Madonna pledges oral sex for Clint

In [5]:
# Training rows whose headline text also occurs in the test stances (exact string match).
train_stances.loc[train_stances["Headline"].isin(test_stances["Headline"])]

Unnamed: 0,Headline,Body ID,Stance
173,Cheese addiction breaks Kim Jong-un's ankles,2210,unrelated
1415,WSJ: Apple cut watch health features due to er...,1917,discuss
1418,Cheese blamed for North Korean leader Kim Jong...,1689,unrelated
1479,Cheese blamed for North Korean leader Kim Jong...,186,unrelated
1503,Cheese addiction breaks Kim Jong-un's ankles,2329,unrelated
...,...,...,...
48270,Apple was forced to nix key health features fr...,407,discuss
48852,Cheese blamed for North Korean leader Kim Jong...,2042,unrelated
49279,Cheese addiction breaks Kim Jong-un's ankles,1854,discuss
49349,Cheese blamed for North Korean leader Kim Jong...,2344,unrelated


In [6]:
# Training rows whose "Body ID" also occurs in the test stances.
# NOTE(review): this compares IDs only, not body text — it assumes IDs are unique across both files.
train_stances.loc[train_stances["Body ID"].isin(test_stances["Body ID"])]

Unnamed: 0,Headline,Body ID,Stance


Some headlines appear in both the training and the test data, but no Body ID is shared between the two, i.e. the article bodies do not overlap. To mirror this, we should split the training set into training and validation sets such that the bodies of the articles are disjoint.

In [7]:
# Join every test stance row with the columns of its article body via the shared "Body ID" key.
test_df = pd.merge(test_stances, test_bodies, on="Body ID")

In [8]:
# Join every training stance row with the columns of its article body; split into train/val later.
train_and_val_df = pd.merge(train_stances, train_bodies, on="Body ID")

In [9]:
# Fraction of distinct body IDs (not of rows) held out for validation.
val_split_ratio: float = 0.2

In [10]:
def split_train_val(df, ratio, seed=None):
    """Split ``df`` into validation and training frames with disjoint body IDs.

    A random ``ratio`` fraction of the distinct "Body ID" values is assigned to
    validation; every row goes to the frame its body ID was assigned to, so no
    body ID appears in both results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Body ID" column.
    ratio : float
        Fraction of distinct body IDs to hold out for validation. The number
        of held-out IDs is ``int(ratio * n_distinct_ids)`` (truncated).
    seed : int or None, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        split is reproducible and the global generator is left untouched.
        When ``None`` (default) the module-level ``random`` generator is used,
        exactly as before, so a prior ``random.seed(...)`` is still honoured.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(val_df, train_df)`` — validation first, then training.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the requested validation count
        is negative or larger than the number of distinct body IDs.
    """
    all_ids = list(df["Body ID"].unique())
    val_count = int(ratio * df["Body ID"].nunique())

    # A dedicated generator keeps a seeded split independent of global random state.
    sample = random.sample if seed is None else random.Random(seed).sample
    val_body_ids = set(sample(all_ids, val_count))

    # One mask and its complement: every row lands in exactly one frame,
    # which makes the two frames disjoint on "Body ID" by construction.
    in_val = df["Body ID"].isin(val_body_ids)
    val_df = df.loc[in_val]
    train_df = df.loc[~in_val]

    return val_df, train_df

In [39]:
# Seed the global generator right before sampling so the train/validation split
# (and every number derived from it below) is the same on each Restart & Run All.
random.seed(42)
# split_train_val returns the validation frame first, then the training frame.
val_df, train_df = split_train_val(train_and_val_df, val_split_ratio)

In [40]:
# Number of distinct body IDs that ended up in the validation split.
val_df["Body ID"].nunique()

336

In [41]:
# Number of distinct body IDs in the test set, for comparison with the splits.
test_df["Body ID"].nunique()

904

In [42]:
# Number of distinct body IDs left in the training split.
train_df["Body ID"].nunique()

1347

In [43]:
def prepare_df(df):
    """Return a modelling-ready copy of a merged stances/bodies frame.

    The result has the "Body ID" column removed, a fresh 0..n-1 index, and a
    boolean "Related" column that is True wherever "Stance" is anything other
    than "unrelated". The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the "Body ID" and "Stance" columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the changes described above.

    Raises
    ------
    KeyError
        If "Body ID" or "Stance" is missing from ``df``.
    """
    # reset_index(drop=True) discards the old index directly; the previous
    # reset_index() + drop("index") pair broke on a named index or on a frame
    # that already had an "index" column.
    prepared = df.drop(columns="Body ID").reset_index(drop=True)
    prepared["Related"] = prepared["Stance"] != "unrelated"
    return prepared

In [44]:
# Apply the same clean-up to all three splits.
# NOTE: the names are rebound to the prepared frames, so this cell is not idempotent —
# re-running it without first re-running the merge/split cells raises KeyError,
# because "Body ID" has already been dropped.
val_df = prepare_df(val_df)
train_df = prepare_df(train_df)
test_df = prepare_df(test_df)

In [54]:
# Stance class balance of the validation split as fractions of rows
# (chain .plot(kind="bar") to view it as a bar chart).
val_df["Stance"].value_counts(normalize=True)

unrelated    0.732794
discuss      0.169002
agree        0.081595
disagree     0.016610
Name: Stance, dtype: float64

In [55]:
# Stance class balance of the training split as fractions of rows
# (chain .plot(kind="bar") to view it as a bar chart).
train_df["Stance"].value_counts(normalize=True)

unrelated    0.730955
discuss      0.180495
agree        0.071692
disagree     0.016857
Name: Stance, dtype: float64

In [58]:
# Stance class balance of the test set as fractions of rows
# (chain .plot(kind="bar") to view it as a bar chart).
test_df["Stance"].value_counts(normalize=True)

unrelated    0.722032
discuss      0.175658
agree        0.074883
disagree     0.027427
Name: Stance, dtype: float64

We now have the train/validation/test splits sorted out, with a similar stance distribution in the training and validation sets. (The test distribution doesn't matter, as we can't control it, but it is approximately equal anyway.)