# Load the dataset

In [17]:
import pandas as pd
data = pd.read_csv('../data/feedback_prize/feedback_prize_input.csv')
data.head()

Unnamed: 0,text,cat_label
0,Modern humans today are always on their phone....,Lead
1,They are some really bad consequences when stu...,Position
2,Some certain areas in the United States ban ph...,Evidence
3,"When people have phones, they know about certa...",Evidence
4,Driving is one of the way how to get around. P...,Claim


In [18]:
data['label'] = data.cat_label.astype('category').cat.codes
data.head()

Unnamed: 0,text,cat_label,label
0,Modern humans today are always on their phone....,Lead,4
1,They are some really bad consequences when stu...,Position,5
2,Some certain areas in the United States ban ph...,Evidence,3
3,"When people have phones, they know about certa...",Evidence,3
4,Driving is one of the way how to get around. P...,Claim,0


**Import data to a csv file**

In [19]:
data[['text', 'label']].to_csv('../data/clean/feedback_prize_7_labels.csv')

# Combine claims and premises in 2 labels

In [20]:
def combine_label(row):
    """
    Put claims and counterclaims together under 'claim' and the remaining labels under 'premise'
    """
    if row['cat_label'].lower() == 'claim' or row['cat_label'] == 'counterclaim':
        return 'claim'
    else:
        return 'premise'

In [21]:
data['com_label'] = data.apply(lambda row: combine_label(row), axis=1)
data.head()

Unnamed: 0,text,cat_label,label,com_label
0,Modern humans today are always on their phone....,Lead,4,premise
1,They are some really bad consequences when stu...,Position,5,premise
2,Some certain areas in the United States ban ph...,Evidence,3,premise
3,"When people have phones, they know about certa...",Evidence,3,premise
4,Driving is one of the way how to get around. P...,Claim,0,claim


In [22]:
data['label'] = data.com_label.astype('category').cat.codes
data.head()

Unnamed: 0,text,cat_label,label,com_label
0,Modern humans today are always on their phone....,Lead,1,premise
1,They are some really bad consequences when stu...,Position,1,premise
2,Some certain areas in the United States ban ph...,Evidence,1,premise
3,"When people have phones, they know about certa...",Evidence,1,premise
4,Driving is one of the way how to get around. P...,Claim,0,claim


In [23]:
data.drop(['cat_label', 'com_label'], axis=1, inplace=True)
data.head()

Unnamed: 0,text,label
0,Modern humans today are always on their phone....,1
1,They are some really bad consequences when stu...,1
2,Some certain areas in the United States ban ph...,1
3,"When people have phones, they know about certa...",1
4,Driving is one of the way how to get around. P...,0


In [25]:
count_df = data.groupby('label').count()

In [27]:
count_df['portion'] = count_df['text'] / count_df['text'].sum()
count_df

Unnamed: 0_level_0,text,portion
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,50208,0.347959
1,94085,0.652041


>=> Our data is mildly imbalanced (size of minority class between 20% and 40%) [link](https://developers.google.com/machine-learning/data-prep/construct/sampling-splitting/imbalanced-data#:~:text=A%20classification%20data%20set%20with,smaller%20proportion%20are%20minority%20classes.)

**Import data to a csv file**

In [9]:
data.to_csv('../data/clean/feedback_prize_2_labels.csv', index=False)