### Naive Bayes as Baseline

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, hamming_loss, f1_score, multilabel_confusion_matrix

In [3]:
def clean_text(text):
    """Normalize a piece of text before bag-of-words vectorization.

    The text is lower-cased, a fixed set of English contractions is expanded,
    every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str
        Raw text (e.g. a headline).

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # "'scuse" must be handled before the generic "'s" rule below; otherwise
    # that rule strips the "'s" first and "'scuse" can never match.
    text = re.sub(r"'scuse", " excuse ", text)
    text = re.sub(r"'s", " ", text)
    text = re.sub(r"'ve", " have ", text)
    # "can't" must be expanded before the generic "n't" rule.
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"'re", " are ", text)
    text = re.sub(r"'d", " would ", text)
    text = re.sub(r"'ll", " will ", text)
    # Raw strings keep "\W" / "\s" as regex escapes instead of (invalid)
    # Python string escapes.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [4]:
# Names of the label columns: the nine frame ids "1" .. "9".
frames = [str(frame_id) for frame_id in range(1, 10)]

# Shared bag-of-words vectorizer. Tokens must contain at least one ASCII
# letter (matched case-insensitively), accents are stripped to ASCII and
# scikit-learn's built-in English stop-word list is removed.
cv = CountVectorizer(
    lowercase=True,
    strip_accents='ascii',
    stop_words='english',
    token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
)

In [19]:
def run_nb(verbose = False, n_folds = 5, data_dir = 'dataset'):
    """Cross-validate a per-frame Multinomial Naive Bayes baseline.

    For every fold ``k`` in ``range(n_folds)`` the files
    ``<data_dir>/<k>/train.tsv`` and ``<data_dir>/<k>/test_labeled.tsv`` are
    read (tab-separated, no header row; columns are ImageID, Headline, text_b
    followed by one label column per entry of the module-level ``frames``).
    Headlines are cleaned with ``clean_text`` and vectorized with the
    module-level ``cv``, which is re-fitted on each fold's training headlines.
    One independent ``MultinomialNB`` is trained per frame, and the stacked
    predictions are scored as a multi-label problem.

    Label columns are assumed to hold binary 0/1 indicators (an article counts
    as "multi-frame" when its label row sums to more than 1) -- TODO confirm
    against the dataset.

    Parameters
    ----------
    verbose : bool, default False
        When true, additionally print the per-frame metric table and the
        aggregate scores of every fold.
    n_folds : int, default 5
        Number of fold directories (named ``0 .. n_folds-1``) to evaluate.
    data_dir : str, default 'dataset'
        Directory that contains the fold sub-directories.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(6, n_folds)``. Rows are, in order: macro F1,
        weighted F1, micro F1, Hamming loss, exact-match accuracy over all
        test articles, and exact-match accuracy restricted to multi-frame
        articles (NaN for a fold without any). Every value is rounded to
        three decimals.
    """
    rd = 3  # number of decimals kept for every reported score
    columns = ["ImageID", "Headline", "text_b"] + list(frames)
    f1_macro, f1_weighted, f1_micro, hamming, stracc, multiacc = [[] for _ in range(6)]

    for fold in range(n_folds):
        fold_dir = data_dir + '/' + str(fold)
        train = pd.read_csv(fold_dir + '/train.tsv', sep='\t', header=None)
        train.columns = columns
        test = pd.read_csv(fold_dir + '/test_labeled.tsv', sep='\t', header=None)
        test.columns = columns

        # Fit the vocabulary on the training headlines only, then reuse it
        # for the test headlines so no test vocabulary leaks into training.
        X_train_cv = cv.fit_transform(train['Headline'].map(clean_text))
        X_test_cv = cv.transform(test['Headline'].map(clean_text))

        y_true = test[frames]
        per_frame_preds = {}
        per_frame_scores = {}
        for f in frames:
            # Binary relevance: one independent classifier per frame.
            naive_bayes = MultinomialNB()
            naive_bayes.fit(X_train_cv, train[f])
            predictions = naive_bayes.predict(X_test_cv)
            per_frame_preds[f] = predictions
            if verbose:
                # Per-frame diagnostics are only needed for display.
                y_test = test[f]
                per_frame_scores[f] = [accuracy_score(y_test, predictions),
                                       precision_score(y_test, predictions),
                                       recall_score(y_test, predictions),
                                       f1_score(y_test, predictions, average="macro"),
                                       f1_score(y_test, predictions, average="micro")]
        preds_df = pd.DataFrame(per_frame_preds, columns=frames)

        # Multi-label scores over all frames at once.
        f1_macro.append(np.round(f1_score(y_true, preds_df, average='macro'), rd))
        f1_weighted.append(np.round(f1_score(y_true, preds_df, average='weighted'), rd))
        f1_micro.append(np.round(f1_score(y_true, preds_df, average='micro'), rd))
        hamming.append(np.round(hamming_loss(y_true, preds_df), rd))

        # An article is an exact match when every frame label is predicted correctly.
        exact_match = (y_true.values == preds_df.values).all(axis=1)
        stracc.append(np.round(exact_match.mean(), rd))

        # Exact-match accuracy restricted to articles carrying more than one frame.
        multi_mask = (y_true.sum(axis=1) > 1).values
        number_multiple = int(multi_mask.sum())
        if number_multiple:
            match_multiple = exact_match[multi_mask].mean()
        else:
            # No multi-frame article in this fold: report NaN rather than dividing by zero.
            match_multiple = np.nan

        print("Number of articles with multiple frames: ", number_multiple)
        multiacc.append(np.round(match_multiple, rd))

        if verbose:
            results = pd.DataFrame(
                per_frame_scores,
                columns=frames,
                index=pd.Index(["Accuracy", "Precision", "Recall", "F1_Macro", "F1_Micro"],
                               name="Metric"),
            )
            print("Fold: ", fold)
            print(results.round(rd))
            print("F1-Macro: ", f1_macro[-1])
            print("F1-Weighted: ", f1_weighted[-1])
            print("F1-Micro: ", f1_micro[-1])
            print("Hamming loss: ", hamming[-1])
            print("Exact match: ", stracc[-1])
            print("Exact match among those: ", multiacc[-1])

    return np.array([f1_macro, f1_weighted, f1_micro, hamming, stracc, multiacc])

### Run the baseline on all folds

In [20]:
# Evaluate the Naive Bayes baseline on every fold. `r` is a 2-D array with one
# row per metric (f1_macro, f1_weighted, f1_micro, hamming, stracc, multiacc)
# and one column per fold.
r = run_nb()

Number of articles with multiple frames:  59


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Number of articles with multiple frames:  64


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Number of articles with multiple frames:  70


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Number of articles with multiple frames:  66
Number of articles with multiple frames:  60


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [26]:
# Transpose so that each row is a fold and each column a metric, then average
# every metric over the folds and round for display.
(
    pd.DataFrame(
        r.T,
        columns=["f1_macro", "f1_weighted", "f1_micro", "hamming", "stracc", "multiacc"],
    )
    .mean(axis=0)
    .round(3)
)

f1_macro       0.537
f1_weighted    0.667
f1_micro       0.698
hamming        0.073
stracc         0.502
multiacc       0.291
dtype: float64