In [8]:
from __future__ import annotations

import string
from ast import literal_eval
import sys; sys.path.append("../utils")
import _confusion_matrix

from IPython.display import display as ipy_display
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
import enchant

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

%config InlineBackend.figure_format = 'svg'

In [9]:
def encode_labels(df: pd.DataFrame, y_column: str) -> tuple[pd.DataFrame, dict]:
    """Encode labels to be used by the classifier."""
    enc = LabelEncoder()
    enc.fit(df[y_column])
    encoded_labels = dict(zip(enc.classes_, enc.transform(enc.classes_))) # will use that in the cm later
    df['label'] = enc.transform(df['label'])
    return df, encoded_labels

In [10]:
RAW_DATA_FILES = "../../data/reddit_ell_eng_raw_pos_punct{}.csv"
RAW_DATA_MAXLEN = ['none', '300', '200']

In [11]:
class MisspellingVectorizer(TfidfVectorizer):
    """A custom vectorizer that only considers misspelled words."""

    def get_misspellings(self, X):
        spellcheck = enchant.Dict('en_UK')
        X_miss = [[word for word in x.split()
                  if (word.isalpha())
                  and (not spellcheck.check(word))
                  and (not spellcheck.check(word.capitalize()))]  # tokes are lowercased -> capitalize to check
                 for x in X]
        print(X_miss, end="\n\n")
        X_miss = [" ".join(text) for text in X_miss]
        return X_miss

    def transform(self, X, y=None):
        X = self.get_misspellings(X)
        return super(MisspellingVectorizer, self).transform(X)

    def fit(self, X, y=None):
        X = self.get_misspellings(X)
        return super(MisspellingVectorizer, self).fit(X, y)

In [12]:
models_df = pd.DataFrame({
    'model': [
        'LSVM', 
        'LogReg'
    ],
    'base': [
        SGDClassifier(loss='hinge'),
        SGDClassifier(loss='log_loss'), # 'log' is deprecated in 1.1
    ],
    'char_params': [
        {
            'cxf__chars__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), 
                                        (1, 6), (1, 7), (1, 8), (1, 9), (1, 10)],
        },
        {
            'cxf__chars__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), 
                                        (1, 6), (1, 7), (1, 8), (1, 9), (1, 10)],
        },
    ],
    'misspell_params': [
        {
            'cxf__misspell__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
                                           (1, 6), (1, 7), (1, 8), (1, 9), (1, 10)],
        },
        {
            'cxf__misspell__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
                                           (1, 6), (1, 7), (1, 8), (1, 9), (1, 10)],
        },
    ],
})

In [13]:
all_results = pd.DataFrame()

In [14]:
for maxlen in RAW_DATA_MAXLEN:
    df = pd.read_csv(RAW_DATA_FILES.format(f"_{maxlen}" if maxlen != 'none' else ''),
                                           encoding='utf-8')

    df, labels = encode_labels(df, 'label')

    # punctuation column names are the marks themselves
    punct_cols = [col for col in df.columns if col in string.punctuation]

    X = df[['text', *punct_cols]]
    y = df['label']

    gss = GroupShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    train_idx, test_idx = next(gss.split(X, y, groups=df['user_id']))
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, end="\n\n")


    char_vect = TfidfVectorizer(analyzer='char', ngram_range=(1, 1), binary=True)
    mspl_vect = MisspellingVectorizer(analyzer='char', ngram_range=(1, 1), binary=True)
    scaler = StandardScaler(with_mean=False)

    for i, row in models_df.iterrows():
        model = row['model']
        base = row['base']
        char_params = row['char_params']
        misspell_params = row['misspell_params']
        char_ngram_range = char_params['cxf__chars__ngram_range']
        misspell_ngram_range = misspell_params['cxf__misspell__ngram_range']

        cxf = ColumnTransformer([
            ('chars', char_vect, 'text'),
            ('misspell', mspl_vect, 'text'),
            ('punct', scaler, punct_cols),
        ], remainder='drop')

        pipe = Pipeline([
            ('cxf', cxf),
            ('clf', base),
        ])

        print(f"{model:=^80}")
        print(f"With {maxlen=}".center(80, "-"))
        gs_grid = GridSearchCV(pipe, {**char_params, **misspell_params},
                               cv=10, scoring='f1_macro', n_jobs=-1, verbose=5)
        gs_grid.fit(X_train, y_train)
        print("Best hyperparameters:", gs_grid.best_params_, sep="\n", end="\n\n")
        y_pred = gs_grid.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        print(classification_report(y_test, y_pred), end="\n\n")
        cm = confusion_matrix(y_test, y_pred)
        _confusion_matrix.plot_cm(cm, pipe=pipe, report=report, labels=labels, colorbar=False, export=False, comment=f'1-10_grams_punct_misspell_{maxlen}')
        gs_results = pd.DataFrame(gs_grid.cv_results_)
        gs_results['model'] = model
        gs_results['maxlen'] = maxlen
        all_results = pd.concat([all_results, gs_results])

        print(f"{model} done", end="\n\n")

(7895, 32) (2029, 32) (7895,) (2029,)

-------------------------------With maxlen='none'-------------------------------
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best hyperparameters:
{'cxf__chars__ngram_range': (1, 6), 'cxf__misspell__ngram_range': (1, 9)}

[['tbh', 'unfuck', 'Ehm'], ['befor', 'Tsipras'], [], [], ['Erdogan'], ['Tsipras'], ['reddit'], ['Germoneys'], ['Romania', 'Romania'], ['Haris', 'Theoharis', 'ASPIS', 'offense', 'Potami', 'Bobolas'], ['miza', 'PASOK'], ['eurozone'], ['favor'], ['favor', 'FYR', 'antieuropean'], [], [], ['Downvote'], ['favor'], ['FYR', 'favor', 'suppoort'], [], ['ayy', 'electic'], [], ['Uk'], ['MPs', 'Panagiotis', 'Lafazanis', 'Valavani', 'Dimitris', 'Stratoulis', 'Valavani'], [], [], ['Antonis', 'Samaras', 'intimidanting', 'extorsions'], ['thats', 'guaranting'], [], [], [], ['pHD'], [], ['Dmitry', 'Rybolovlev', 'Lefkada'], ['neighboring'], ['HUNDRENDS', 'Pirelli', 'Sindos'], ['Althought', 'Georgiadis', 'prefered'], ['JW'], [], 

In [None]:
# sort values in params column
all_results['params'] = all_results['params'].apply(lambda x: str(x))
all_results = all_results.sort_values(by='params')
all_results

In [None]:
all_results = all_results.reset_index(drop=True)
all_results.to_csv('all_results_lsvm_logreg_ngrams_1-10_grams_punct_misspell.csv', index=False, encoding='utf-8')

In [None]:
with pd.option_context('display.max_rows', None):
    all_res = all_results[['model', 'maxlen', 'params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    all_res['params'] = all_res['params'].apply(lambda x: literal_eval(x))
    all_res['n_grams'] = all_res['params'].apply(lambda x: x['cxf__chars__ngram_range']).astype(str)
    all_res['n_grams'] = all_res['n_grams'].apply(lambda x: literal_eval(x)) # convert n_grams to tuple
    all_res = all_res.rename(columns={'mean_test_score': 'F1-score', 'std_test_score': 'std'})
    all_res = all_res.sort_values(by=['model', 'maxlen', 'n_grams'])
    all_res = all_res.groupby(['model', 'maxlen', 'n_grams']).agg({'F1-score': 'mean', 'std': 'mean'})
    all_res = all_res.sort_index(level=2, key=lambda x: x.map(lambda y: y[1])) # sort n_grams naturally
    all_res = all_res.unstack('model')  # move model to columns and group by model
    all_res = all_res.swaplevel(axis=1).sort_index(axis=1)  # swap F1-score and std columns with model

    ipy_display(all_res)

# all_res.to_latex('all_results_lsvm_logreg_1-10_grams_punct_misspell.tex', encoding='utf-8')

In [None]:
import re
from matplotlib.pyplot import Line2D

markers = ['o', 's', 'v', 'p', 'x', 'd', 'h', '8', 'P', '*']
colors = ['#404080', '#a52040', '#7d7dfa', '#d7d7d7', '#ff7f0e',
          '#2ca02c', '#1f77b4', '#9467bd', '#8c564b', '#e377c2']


fig, ax = plt.subplots(figsize=(8, 5))

for i, (model, group) in enumerate(all_res.groupby(level=0, axis=1)):
    group = group.droplevel(0, axis=1)  # drop F1-score and std
    group = group.sort_index(level=1, key=lambda x: x.map(lambda y: y[1]))  # sort n_grams naturally
    group = group.reset_index()
    group['n_grams'] = group['n_grams'].astype(str)
    group = group.set_index('n_grams')
    group.plot(y='F1-score', ax=ax, marker=markers[i], color=colors[i], label=model)
    # for x, y, yerr in zip(group.index, group['F1-score'], group['std']):
    #     ax.errorbar(x, y, yerr=yerr, fmt='none', color=colors[i], alpha=0.5, capsize=3, capthick=1, zorder=1)
    ax.fill_between(group.index, group['F1-score'] - group['std'], group['F1-score'] + group['std'],
                    color=colors[i], alpha=0.1, zorder=1)

ax.set_title('F1-score for different $n$-grams and models (including misspelled words)', pad=17)
ax.set_xlabel('$n$-gram range')
ax.set_ylabel('F1-score')
ax.set_ylim(0.55, 0.85)
ax.set_xticks(range(0, 10))
ax.set_xticklabels([f"(1, {i})" for i in range(1, 11)])
ax.grid(alpha=0.25, zorder=0)
ax.legend(loc='lower right')
plt.show();