# Ensembles - Soft and Hard voting


## Best scores obtained with single models (LB = leaderboard score, CV = local validation score):

### LB
```python
# Logistic regression on TF-IDF features restricted to the `n_words` most common words.
# NOTE(review): `V` and `count_vect_pipeline_v2` are not defined in this notebook;
# `V` is presumably a token Counter (it is queried with `most_common`) — confirm.
n_words = 5000
top_n_words = [w for w, c in V.most_common(n_words)]
vect_top_n = TfidfVectorizer(vocabulary=top_n_words)
m = LogisticRegression()
m, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m, f'logistic_over_{n_words}_top_words_tfidf')

```

```
Train accuracy: 0.870
Val accuracy  : 0.795 (CV)
Leaderboard   : 0.78976 (LB)
```

### CV

```python
# Logistic regression on raw term counts restricted to the `n_words` most common words.
# NOTE(review): `V` and `count_vect_pipeline_v2` are not defined in this notebook;
# `V` is presumably a token Counter (it is queried with `most_common`) — confirm.
n_words = 5000
top_n_words = [w for w, c in V.most_common(n_words)]
vect_top_n = CountVectorizer(vocabulary=top_n_words)
m = LogisticRegression()
m, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m, f'logistic_over_{n_words}_top_words')

```

```
Train accuracy: 0.921
Val accuracy  : 0.776
Leaderboard   : 0.79190 (LB)
```


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords

def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one list.

    Only one level of nesting is removed and item order is preserved.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

def load_dfs(test_size=0.1, shuffle=False, verbose=True):
    """Load the competition CSVs and split a validation set off the train data.

    Reads ``data/train.csv``, ``data/test.csv`` and
    ``data/sample_submission.csv`` relative to the working directory. The
    ``keyword`` and ``location`` columns are dropped from the train and test
    frames.

    Args:
        test_size: Forwarded to ``train_test_split`` as the validation share.
        shuffle: Forwarded to ``train_test_split``; the default ``False``
            splits the rows without shuffling them.
        verbose: When true, print the shapes of the train, validation and
            test frames.

    Returns:
        Tuple ``(df_train, df_val, df_test, df_sub)``.
    """
    unused_columns = ['keyword', 'location']

    df_full = pd.read_csv("data/train.csv").drop(unused_columns, axis=1)
    df_test = pd.read_csv("data/test.csv").drop(unused_columns, axis=1)
    df_sub = pd.read_csv("data/sample_submission.csv")

    df_train, df_val = train_test_split(
        df_full, test_size=test_size, shuffle=shuffle
    )

    if not verbose:
        return df_train, df_val, df_test, df_sub

    print(f"train shape: {df_train.shape}")
    print(f"val shape  : {df_val.shape}")
    print(f"test shape : {df_test.shape}")
    return df_train, df_val, df_test, df_sub


def get_vocab(df, drop_stopwords=True, keep_only_alpha=True):
    """Count the lower-cased, whitespace-separated tokens of ``df['text']``.

    Args:
        df: DataFrame with a ``text`` column of strings. Rows whose text is
            missing are skipped.
        drop_stopwords: When true, tokens found in NLTK's English stop-word
            list are excluded.
        keep_only_alpha: When true, only tokens for which ``str.isalpha()``
            holds are kept. This filter is applied independently of
            ``drop_stopwords`` (previously it was ignored unless
            ``drop_stopwords`` was also true).

    Returns:
        ``collections.Counter`` mapping each kept token to its frequency.
    """
    # dropna() after split() removes rows where the text was missing, which
    # would otherwise surface as non-iterable NaN values.
    token_lists = df['text'].str.lower().str.split().dropna()
    tokens = (token for words in token_lists for token in words)

    if drop_stopwords:
        # Build the set once so each membership test is O(1).
        stop_set = frozenset(stopwords.words('english'))
        tokens = (token for token in tokens if token not in stop_set)
    if keep_only_alpha:
        tokens = (token for token in tokens if token.isalpha())
    return Counter(tokens)

In [2]:
# Load the data with load_dfs' defaults: 10% validation share, no shuffling,
# and the frame shapes printed.
df_train, df_val, df_test, df_sub = load_dfs()

train shape: (6851, 3)
val shape  : (762, 3)
test shape : (3263, 2)


In [3]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import warnings; warnings.simplefilter('ignore')

def run(model, df_train, df_val):
    """Fit ``model`` on the training texts and score it on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            given the ``text`` column directly as ``X``.
        df_train: DataFrame with ``text`` and ``target`` columns used to fit.
        df_val: DataFrame with ``text`` and ``target`` columns used to validate.

    Returns:
        Tuple ``(model, train_acc, val_acc)``: the same model object after
        fitting, and its accuracy on the training and validation frames.
    """
    model.fit(df_train['text'], df_train['target'])

    predictions = [model.predict(frame['text']) for frame in (df_train, df_val)]
    train_acc, val_acc = (
        accuracy_score(frame['target'], y_pred)
        for frame, y_pred in zip((df_train, df_val), predictions)
    )
    return model, train_acc, val_acc


def run_models(models, df_train, df_val):
    """Fit and score every ``(name, model)`` pair and show a results table.

    Args:
        models: Iterable of ``(name, model)`` tuples; each model is passed to
            ``run``.
        df_train: Training DataFrame forwarded to ``run``.
        df_val: Validation DataFrame forwarded to ``run``.

    Returns:
        Tuple ``(df_res, trained_models)``. ``df_res`` has the columns
        ``model``, ``train acc`` and ``val acc`` and is sorted by ``val acc``
        in descending order. ``trained_models`` is a list of
        ``(name, fitted_model)`` tuples in the ORIGINAL input order, so its
        positions do not line up with the rows of ``df_res``.
    """
    scores = []
    fitted = []
    for label, estimator in models:
        estimator, acc_train, acc_val = run(estimator, df_train, df_val)
        scores.append((label, acc_train, acc_val))
        fitted.append((label, estimator))

    table = pd.DataFrame(scores, columns=['model', 'train acc', 'val acc'])
    table = table.sort_values("val acc", ascending=False)
    # `display` is not imported in this file; presumably the notebook builtin.
    display(table)
    return table, fitted

In [14]:
# Base classifiers: each pipeline vectorises the raw text (term counts or
# TF-IDF, vocabulary capped via max_features) and feeds the result to a model.
# SVC is created with probability=True because later cells call
# predict_proba on every fitted model.
models = [
    ('naive-bayes', Pipeline([('vect', CountVectorizer(max_features=1000)), ('model', MultinomialNB())])),
    ('naive-bayes-2', Pipeline([('vect', CountVectorizer(max_features=3000)), ('model', MultinomialNB())])),
    ('lr-1000', Pipeline([('vect', CountVectorizer(max_features=1000)), ('model', LogisticRegression(max_iter=200))])),
    ('lr-idf-1000', Pipeline([('vect', TfidfVectorizer(max_features=1000)), ('model', LogisticRegression(max_iter=200))])),
    ('lr-id-2000', Pipeline([('vect', TfidfVectorizer(max_features=2000)), ('model', LogisticRegression(max_iter=200))])),
    ('svm', Pipeline([('vect', CountVectorizer(max_features=1000)), ('model', SVC( probability=True))])),
    ]
# trained_models keeps the input order above; df_res is sorted by val acc.
df_res, trained_models = run_models(models, df_train, df_val)

Unnamed: 0,model,train acc,val acc
3,lr-idf-1000,0.834185,0.804462
1,naive-bayes-2,0.838856,0.795276
4,lr-id-2000,0.853598,0.795276
5,svm,0.905269,0.786089
2,lr-1000,0.849803,0.784777
0,naive-bayes,0.800905,0.782152


# Manual average

In [15]:
# Simple (unweighted) average of the per-model probabilities on the
# validation set.
y_proba_disaster = np.zeros(len(df_val))
for name, model in trained_models:
    # Column 1 of predict_proba — presumably the probability of target == 1
    # (assumes 0/1 labels; confirm against model.classes_).
    one_proba = model.predict_proba(df_val['text'])[:, 1]
    y_proba_disaster += one_proba

# Turn the running sum into a mean over all models.
y_proba_disaster /= len(trained_models)

In [16]:
# Threshold the averaged probability at 0.5 to get 0/1 predictions, then
# score them against the validation targets.
y_pred_avg = (y_proba_disaster > 0.5).astype(int)
accuracy_score(df_val['target'], y_pred_avg)

0.8083989501312336

# Hard and soft voting

In [18]:
# Wrap the same base pipelines in scikit-learn's VotingClassifier: once with
# its default voting mode (labelled 'hard' here) and once with voting='soft'.
ensembles = [('hard', VotingClassifier(models)), 
             ('soft', VotingClassifier(models, voting='soft'))
            ]
# The trailing semicolon keeps the notebook from echoing the returned tuple.
run_models(ensembles, df_train, df_val);

Unnamed: 0,model,train acc,val acc
1,soft,0.850387,0.808399
0,hard,0.85316,0.805774


# Weighted average

In [21]:
# Weighted average of the per-model probabilities on the validation set:
# each model's column-1 probability is scaled by its weight and the sum is
# normalised by the total weight.
weights = [1, 2, 1, 2, 1, 2]
if len(weights) != len(trained_models):
    # Too few weights would raise IndexError mid-loop; too many would be
    # silently included in the normalisation below. Fail early instead.
    raise ValueError(
        f"expected {len(trained_models)} weights, got {len(weights)}"
    )

y_proba_disaster = np.zeros(len(df_val))
for weight, (name, model) in zip(weights, trained_models):
    one_proba = model.predict_proba(df_val['text'])[:, 1]
    y_proba_disaster += weight * one_proba

y_proba_disaster /= np.sum(weights)
# Threshold at 0.5 to get 0/1 predictions and score them.
y_pred_avg = (y_proba_disaster > 0.5).astype(int)
accuracy_score(df_val['target'], y_pred_avg)

0.8123359580052494

In [22]:
# Cross-check: a soft VotingClassifier given the same `weights` should
# reproduce the manual weighted average computed in the previous cell.
run_models([('soft', VotingClassifier(models, voting='soft', weights=weights))], df_train, df_val);

Unnamed: 0,model,train acc,val acc
0,soft,0.85462,0.812336


# Manual search of best validation score

In [25]:
# Larger pool of base pipelines for the final ensemble: naive Bayes, SVC
# (probability=True so it can take part in soft voting), logistic regression
# and CatBoost, over count or TF-IDF features of varying vocabulary size.
models2 = [
    ('nb', Pipeline([('vect', CountVectorizer(max_features=1000)), ('model', MultinomialNB())])),
    ('nb2', Pipeline([('vect', CountVectorizer(max_features=3000)), ('model', MultinomialNB())])),
    ('nb3', Pipeline([('vect', TfidfVectorizer(max_features=3000)), ('model', MultinomialNB())])),
    ('svm', Pipeline([('vect', CountVectorizer(max_features=1000)), ('model', SVC( probability=True))])),
    ('svm-2', Pipeline([('vect', CountVectorizer(max_features=1500)), ('model', SVC( probability=True))])),
    ('lr-1000', Pipeline([('vect', CountVectorizer(max_features=1000)), ('model', LogisticRegression(max_iter=200))])),
    ('lr-3000', Pipeline([('vect', CountVectorizer(max_features=3000)), ('model', LogisticRegression(max_iter=200))])),
    ('lr-idf-1000', Pipeline([('vect', TfidfVectorizer(max_features=1000)), ('model', LogisticRegression(max_iter=200))])),
    ('lr-id-2000', Pipeline([('vect', TfidfVectorizer(max_features=2000)), ('model', LogisticRegression(max_iter=200))])),
    ('catb', Pipeline([('vect', TfidfVectorizer(max_features=1000)), ('model', CatBoostClassifier(verbose=0))])),
    ]


# Hand-picked weight vectors; each has one weight per entry of models2, in the
# same order (e.g. the 5th weight applies to 'svm-2').
ensembles = [ 
             ('w1', VotingClassifier(models2, voting='soft',
                                      weights=[1, 1, 1, 1, 2, 1, 1, 1, 1, 1])),
             ('w2', VotingClassifier(models2, voting='soft',
                                      weights=[1, 1, 1, 1, 2, 2, 1, 1, 1, 1])),
             ('w3', VotingClassifier(models2, voting='soft',
                                      weights=[1, 1.5, 1, 1, 1.5, 1.5, 1, 1, 1, 1]))
    
            ]
# NOTE: this rebinds trained_models to the fitted ensembles in input order
# (w1, w2, w3); `res` is the table sorted by validation accuracy.
res, trained_models = run_models(ensembles, df_train, df_val);

Unnamed: 0,model,train acc,val acc
0,w1,0.875347,0.82021
2,w3,0.869946,0.818898
1,w2,0.875931,0.817585


# Submission

In [26]:
# `trained_models` keeps the order in which the ensembles were passed to
# run_models, while `res` is sorted by validation accuracy (best first).
# Look the winner up by name instead of assuming it sits at index 0.
best_name = res.iloc[0]['model']
final_model = dict(trained_models)[best_name]

# Predict on the test texts and write the id/target submission file.
y_pred_test = final_model.predict(df_test['text'])
df_test['target'] = y_pred_test
df_test[['id', 'target']].to_csv("ensemble.csv", index=False)
df_test.head()

Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
