In [1]:
# Load the raw annotations and clean them: keep only the first 466 rows,
# discard rows annotated as "bad example", then discard rows with missing values.
import pandas as pd
data = (
    pd.read_csv("data/belief_data_raw.csv", usecols=["text", "sentiment"])
    .iloc[:466]
    # NaN != "bad example" is True, so missing sentiments survive until dropna()
    .loc[lambda frame: frame["sentiment"] != "bad example"]
    .dropna()
)
data

Unnamed: 0,text,sentiment
0,Also use of chemicals and machinery on their p...,"NEGATIVE, NEGATIVE"
1,Sources believe that twenty-five to 30 percent...,UNDETERMINED
2,Although these government-led initiatives are ...,UNDETERMINED
4,"In other projects of this kind, it is not alwa...",POSITIVE
7,Many African policy makers evidently believe t...,POSITIVE
...,...,...
460,Similar initiatives have and can be initiated ...,POSITIVE
461,Collective sales account for an average of 64 ...,UNDETERMINED
462,Individual sales account for slightly lower vo...,"UNDETERMINED, UNDETERMINED, UNDETERMINED"
464,Modernization of the rice value chain The VC t...,UNDETERMINED


In [2]:
# class index -> short class name; the list position is the class index
labels = dict(enumerate(["neither", "neg", "pos", "both"]))

def sentiment_to_label(row):
    """Translate a row's sentiment annotation into an integer class index.

    The value stored under ``row['sentiment']`` is checked for the substrings
    "POSITIVE" and "NEGATIVE".

    Returns:
        3 when both substrings occur, 2 when only "POSITIVE" occurs,
        1 when only "NEGATIVE" occurs, and 0 when neither occurs.
    """
    annotation = row['sentiment']
    has_positive = "POSITIVE" in annotation
    has_negative = "NEGATIVE" in annotation
    if has_positive:
        return 3 if has_negative else 2
    return 1 if has_negative else 0
# compute the class index for every row (function is applied row-wise)
data['label'] = data.apply(func=sentiment_to_label, axis="columns")

In [3]:
# show class counts: one line per class index, in the order the indices appear in `labels`.
# value_counts() is computed once instead of once per class, and .get(..., 0)
# prints 0 for a class with no rows instead of raising KeyError.
class_counts = data['label'].value_counts()
for class_idx in labels:
    print(class_counts.get(class_idx, 0))
# renumber rows 0..n-1; the earlier filtering left gaps in the index
data = data.reset_index(drop=True)
data.head()

122
65
147
29


Unnamed: 0,text,sentiment,label
0,Also use of chemicals and machinery on their p...,"NEGATIVE, NEGATIVE",1
1,Sources believe that twenty-five to 30 percent...,UNDETERMINED,0
2,Although these government-led initiatives are ...,UNDETERMINED,0
3,"In other projects of this kind, it is not alwa...",POSITIVE,2
4,Many African policy makers evidently believe t...,POSITIVE,2


In [4]:
# write the labelled dataset to disk, leaving the row index out of the file
data.to_csv(path_or_buf="data/belief_benchmark.csv", index=False)

In [5]:
# Create static train and holdout sets: the train dataset is 60% of all data,
# the holdout is the other 40%. Sampling is stratified on the label column to
# account for the imbalanced classes.
from sklearn.model_selection import train_test_split
holdout_size = 0.4
all_train_df, holdout_df = train_test_split(
    data,
    test_size=holdout_size,
    random_state=1,  # fixed seed, so the split is the same on every run
    stratify=data[['label']],
)

# split sizes first, then the per-class counts of each split
print(len(all_train_df), len(holdout_df), sep="\n")
print(all_train_df['label'].value_counts(), holdout_df['label'].value_counts(), sep="\n")

# persist both splits without the row index
all_train_df.to_csv("data/belief_benchmark_all_train.csv", index=False)
holdout_df.to_csv("data/belief_benchmark_holdout.csv", index=False)

217
146
2    88
0    73
1    39
3    17
Name: label, dtype: int64
2    59
0    49
1    26
3    12
Name: label, dtype: int64
