# Majority voting
This notebook uses majority voting to improve results on the Kaggle dataset. It combines the predictions of three models:
BERTweet, XLNet and RoBERTa. BERTweet can be swapped for the BERTweet variant that leverages tweet hashtags.
To run this notebook, the submission files from these models must already be available in a `submissions` folder in the root of this repository.

## Imports

In [1]:

import os
import sys

import numpy as np
import pandas as pd

## Read data

In [2]:
# Folder (relative to this notebook) that holds the per-model submission CSVs.
SUBMISSIONS_DIR = '../../submissions'

# Input files: one submission CSV per model.
BERTWEET_SUBMISSION_FILE = f'{SUBMISSIONS_DIR}/bertweet_submission.csv'
# Alternative BERTweet submission (hashtag variant); not read by default.
BERTWEET_HASHTAGS_SUBMISSION_FILE = f'{SUBMISSIONS_DIR}/bertweet_hashtag_submission.csv'
XLNET_SUBMISSION_FILE = f'{SUBMISSIONS_DIR}/xlnet_submission.csv'
ROBERTA_SUBMISSION_FILE = f'{SUBMISSIONS_DIR}/roberta_submission.csv'

# Output file: written to the current working directory.
MAJORITY_VOTING_SUBMISSION_FILE = 'majority_voting_submission.csv'

In [3]:
# Load the XLNet submission, indexed by Id. The file's own header line is
# skipped (skiprows=1) and replaced by explicit column names so the prediction
# column carries a model-specific name.
# The dtype mapping must use the names assigned via `names`: the previous key
# 'Prediction' matched no column, so the integer dtype was never enforced on
# the prediction column.
xlnet_predictions_df = pd.read_csv(
    XLNET_SUBMISSION_FILE,
    names=['Id', 'xlnet_predictions'],
    dtype={'Id': int, 'xlnet_predictions': int},
    delimiter=',',
    header=None,
    skiprows=1,
    index_col='Id',
)
xlnet_predictions_df.head()

Unnamed: 0_level_0,xlnet_predictions
Id,Unnamed: 1_level_1
1,-1
2,-1
3,-1
4,1
5,-1


In [4]:
# Load the RoBERTa submission, indexed by Id. The file's own header line is
# skipped (skiprows=1) and replaced by explicit column names so the prediction
# column carries a model-specific name.
# The dtype mapping must use the names assigned via `names`: the previous key
# 'Prediction' matched no column, so the integer dtype was never enforced on
# the prediction column.
roberta_predictions_df = pd.read_csv(
    ROBERTA_SUBMISSION_FILE,
    names=['Id', 'roberta_predictions'],
    dtype={'Id': int, 'roberta_predictions': int},
    delimiter=',',
    header=None,
    skiprows=1,
    index_col='Id',
)
roberta_predictions_df.head()

Unnamed: 0_level_0,roberta_predictions
Id,Unnamed: 1_level_1
1,-1
2,-1
3,-1
4,1
5,-1


In [5]:
# Load the BERTweet submission, indexed by Id. The file's own header line is
# skipped (skiprows=1) and replaced by explicit column names so the prediction
# column carries a model-specific name.
# The dtype mapping must use the names assigned via `names`: the previous key
# 'Prediction' matched no column, so the integer dtype was never enforced on
# the prediction column.
bertweet_predictions_df = pd.read_csv(
    BERTWEET_SUBMISSION_FILE,
    names=['Id', 'bertweet_predictions'],
    dtype={'Id': int, 'bertweet_predictions': int},
    delimiter=',',
    header=None,
    skiprows=1,
    index_col='Id',
)
bertweet_predictions_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../../submissions/submission_bertweet.csv'

In [None]:
# Build one frame with a prediction column per model. Each step is a default
# (left) join keyed on 'Id', so the BERTweet frame determines which rows are kept.
bertweet_roberta_df = bertweet_predictions_df.join(roberta_predictions_df, on='Id')
predictions_df = bertweet_roberta_df.join(xlnet_predictions_df, on='Id')
predictions_df.head()

Unnamed: 0_level_0,bertweet_predictions,roberta_predictions,xlnet_predictions
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-1,-1,-1
2,-1,-1,-1
3,-1,-1,-1
4,1,1,1
5,-1,-1,-1


## Majority voting

In [None]:
# Add up the per-model votes for every Id. The built-in row-wise reduction
# replaces `apply(np.sum, axis=1)`: it yields the same values without the
# per-row apply machinery. Assuming each vote is -1 or 1 (as the previews
# above suggest), the sign of the sum is the majority decision.
sum_df = predictions_df.sum(axis=1)
sum_df.head()

Id
1   -3
2   -3
3   -3
4    3
5   -3
dtype: int64

In [None]:
def _vote_sum_to_label(vote_sum):
    """Map a summed vote to a label: 1 when strictly positive, otherwise -1."""
    if vote_sum > 0:
        return 1
    return -1


# Turn every summed vote into the final -1 / 1 prediction.
majority_voting_df = sum_df.apply(_vote_sum_to_label)
majority_voting_df.head()

Id
1   -1
2   -1
3   -1
4    1
5   -1
dtype: int64

In [None]:
# Write the final predictions as a comma-separated file. The index is written
# as the first column (pandas default) and the value column is labelled
# "Prediction".
majority_voting_df.to_csv(
    MAJORITY_VOTING_SUBMISSION_FILE,
    header=["Prediction"],
)