# Setup Notebook

## Mount to Drive

In [1]:
# Colab only: mount Google Drive at /content/gdrive so files stored on Drive
# can be read through the regular filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Install dependencies

## Load Imports

In [9]:
import numpy as np
import pandas as pd

from pathlib import Path

# Configuration File

In [None]:
# Placeholder for run settings; currently empty and not read by any of the
# cells in this part of the notebook.
config = {}

# Train Model

## Load Data

In [13]:
def read_data(path: str, name: str):
    '''Load ``<path>/<name>.csv`` into a DataFrame.

    Args:
        path (str): directory that contains the csv file
        name (str): file name without the ``.csv`` extension
            (e.g. ``train`` or ``test``)

    Returns:
        pandas.core.frame.DataFrame with the file contents as parsed by
        ``pd.read_csv`` using its default options
    '''
    csv_location = '{}/{}.csv'.format(path, name)
    return pd.read_csv(csv_location)

In [14]:
def extract_columns(df: pd.core.frame.DataFrame, columns: list = None,
                    new_columns: list = None):
    '''Select a subset of columns and rename them positionally

    Args:
        df (pd.core.frame.DataFrame): Source DataFrame to take columns from
        columns (list): Column names to extract, in the desired order.
            Defaults to ['Headline', 'articleBody', 'Stance']
        new_columns (list): New names, matched to ``columns`` by position.
            Defaults to ['text_a', 'text_b', 'labels']

    Returns:
        New DataFrame holding only the selected columns under their new
        names; ``df`` itself is left unchanged
    '''
    # Defaults are resolved here instead of in the signature so that no
    # mutable list object is shared between calls.
    if columns is None:
        columns = ['Headline', 'articleBody', 'Stance']
    if new_columns is None:
        new_columns = ['text_a', 'text_b', 'labels']

    # Explicit copy: the relabelling below must never touch the caller's frame.
    processed_df = df[columns].copy()
    processed_df.columns = new_columns
    return processed_df

In [15]:
def encod_labels(df, label_dict: dict = None):
    '''Encode the string values of the ``labels`` column as ints

    Args:
        df (pd.core.frame.DataFrame): DataFrame with a ``labels`` column
        label_dict (dict, optional): Existing label -> int mapping to reuse,
            e.g. the one returned for the training split, so that several
            splits share the same ids. When omitted, a new mapping is built
            by numbering the distinct labels in order of first appearance
            in ``df``.

    Returns:
        tuple: (new DataFrame with encoded labels, label -> int mapping used)

    Raises:
        ValueError: if ``label_dict`` is given and ``df`` contains a label
            that is not one of its keys
    '''
    labels = df['labels'].unique()

    if label_dict is None:
        label_dict = {label: code for code, label in enumerate(labels)}
    else:
        # Fail loudly instead of silently producing missing values for
        # labels the supplied mapping does not know.
        unknown = [label for label in labels if label not in label_dict]
        if unknown:
            raise ValueError(f'labels missing from label_dict: {unknown}')

    # ``map`` produces the integer column directly; the dict form of
    # ``replace`` depends on object-dtype downcasting to get ints.
    encoded_df = df.assign(labels=df['labels'].map(label_dict))
    return encoded_df, label_dict

In [16]:
def load_data(path: str, name: str):
    '''Load a csv split and bring it into the encoded three-column format

    Args:
        path (str): directory that contains the csv file
        name (str): file name without extension (train or test)

    Returns:
        tuple: (DataFrame produced by ``encod_labels``, label -> int mapping
        built from that same DataFrame)
    '''
    raw_frame = read_data(path, name)
    return encod_labels(extract_columns(raw_frame))

In [17]:
# Dataset directory on the mounted Drive. The path is relative and Drive is
# mounted at /content/gdrive, so this assumes the working directory is /content.
path = Path('gdrive/MyDrive/Dataset/msci/project/')

In [18]:
# Sanity check: list the dataset directory to confirm the csv files are visible.
! ls {path}

test.csv  train.csv


In [21]:
# Build the label -> int mapping from the training split only.
train, l2e = load_data(path, 'train')

# Encode the test split with the *training* mapping. Calling load_data on the
# test file would number its labels by their order of appearance in that file,
# which is not guaranteed to match the ids used for train.
test_raw = extract_columns(read_data(path, 'test'))
test = test_raw.assign(labels=test_raw['labels'].map(l2e))
assert test['labels'].notna().all(), 'test split contains labels not seen in train'

print(l2e)

{'unrelated': 0, 'agree': 1, 'disagree': 2, 'discuss': 3}


In [22]:
# Preview the first rows of the encoded training split.
train.head()

Unnamed: 0,text_a,text_b,labels
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,0
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,1
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,0
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,0
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",2


In [23]:
# Preview the first rows of the encoded test split.
test.head()

Unnamed: 0,text_a,text_b,labels
0,Ferguson riots: Pregnant woman loses eye after...,A RESPECTED senior French police officer inves...,0
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,Dave Morin's social networking company Path is...,0
2,A Russian Guy Says His Justin Bieber Ringtone ...,A bereaved Afghan mother took revenge on the T...,0
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",Hewlett-Packard is officially splitting in two...,0
4,Argentina's President Adopts Boy to End Werewo...,An airline passenger headed to Dallas was remo...,0
