<a href="https://www.kaggle.com/derekpisner/twitterdisasterclassification?scriptVersionId=89215140" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import string
import statistics
import nltk
import spacy
import matplotlib
import os
import string
import warnings
import numpy as np
import pandas as pd
from scipy import stats
from joblib import parallel_backend
from sklearn import linear_model, decomposition
from collections import OrderedDict
from operator import itemgetter
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, StratifiedKFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif, VarianceThreshold, SelectFwe
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone, RegressorMixin
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.preprocessing import FunctionTransformer

try:
    from sklearn.utils._testing import ignore_warnings
except:
    from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
class ReduceVIF(BaseEstimator, TransformerMixin):
    """Transformer that iteratively drops columns with a high VIF.

    Parameters
    ----------
    thresh : float
        Columns are removed, worst first, until no variance inflation
        factor exceeds this value. Default is 10.0.
    """

    def __init__(self, thresh=10.0):
        self.thresh = thresh

    def fit(self, X, y=None):
        """Record the inputs and return ``self``; nothing is estimated."""
        self.X = X
        self.y = y
        return self

    def transform(self, X):
        """Apply `calculate_vif` to ``X`` using the configured threshold.

        Returns
        -------
        tuple
            ``(reduced_X, dropped_columns)``. Note that this is a tuple, not
            a bare frame, so callers take element ``[0]`` for the data.
        """
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        """Drop the highest-VIF column until every VIF is within ``thresh``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix; columns are the candidate variables.
        thresh : float
            Largest VIF that is tolerated. Default is 10.0.

        Returns
        -------
        X : pandas.DataFrame
            The input with the offending columns removed.
        vif_cols : list
            Labels of the removed columns, in removal order.
        """
        from statsmodels.stats.outliers_influence import \
            variance_inflation_factor

        vif_cols = []
        # Stop as soon as no columns are left: max() of an empty VIF list
        # would otherwise raise ValueError.
        while X.shape[1] > 0:
            # The design matrix is the same for every column of this pass,
            # so extract it once instead of once per column.
            values = X.values
            vif = []
            for col_idx in range(values.shape[1]):
                new_vif = variance_inflation_factor(values, col_idx)
                vif.append(new_vif)
                # An infinite VIF cannot be exceeded; skip the remaining
                # columns of this pass.
                if np.isinf(new_vif):
                    break
            max_vif = max(vif)
            # Written as "not >" so a NaN maximum also ends the loop.
            if not max_vif > thresh:
                break
            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
            worst = X.columns.tolist()[maxloc]
            vif_cols.append(worst)
            X = X.drop([worst], axis=1)
        return X, vif_cols

def preprocess_x_y(X, nuisance_cols=[], nodrop_columns=[],
                   var_thr=0.80, remove_multi=True,
                   standardize=True,
                   std_dev=3, vif_thr=5, missingness_thr=0.50,
                   zero_thr=0.50):
    """Clean a feature matrix before modelling.

    The steps run in this order: zero out near-zero values and drop sparse
    columns, drop columns with too many missing values, mean-impute,
    optionally standardize, drop low-variance columns and optionally drop
    multicollinear columns. Progress and failures are reported with
    ``print``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are samples).
    nuisance_cols : list
        Currently unused.
    nodrop_columns : list
        Column labels that the sparsity and low-variance filters must keep.
        The default list is never mutated here.
    var_thr : float
        The variance threshold is ``var_thr * (1 - var_thr)``.
    remove_multi : bool
        If exactly ``True``, run `ReduceVIF` with ``vif_thr``.
    standardize : bool
        If exactly ``True``, apply `StandardScaler`.
    std_dev : int
        Currently unused.
    vif_thr : float
        VIF threshold handed to `ReduceVIF`.
    missingness_thr : float
        Maximum tolerated fraction of missing values per column.
    zero_thr : float
        Columns whose share of zeros reaches this fraction are dropped;
        a value <= 0 disables the step.

    Returns
    -------
    pandas.DataFrame
        The processed features. Several early exits return the frame as it
        stands (possibly empty or with fewer than 5 columns) after printing
        a diagnostic, so callers should check the result.
    """
    from colorama import Fore, Style

    # Replace all near-zero with zeros
    # Drop excessively sparse columns with >zero_thr zeros
    if zero_thr > 0:
        X = X.apply(lambda x: np.where(np.abs(x) < 0.000001, 0, x))
        # Select columns with a boolean mask instead of transposing the
        # whole frame twice.
        dense_enough = (X == 0).sum() < (float(zero_thr)) * X.shape[0]
        X_tmp = X.loc[:, dense_enough]

        if len(nodrop_columns) > 0:
            X = pd.concat([X_tmp, X[[i for i in X.columns if i in
                                     nodrop_columns and i not in
                                     X_tmp.columns]]], axis=1)
        else:
            X = X_tmp
        del X_tmp

        if X.empty or len(X.columns) < 5:
            print(f"\n\n{Fore.RED}Empty feature-space (Zero Columns): "
                  f"{X}{Style.RESET_ALL}\n\n")
            return X

    # Remove columns with excessive missing values
    X = X.dropna(thresh=len(X) * (1 - missingness_thr), axis=1)
    if X.empty:
        print(f"\n\n{Fore.RED}Empty feature-space (missingness): "
              f"{X}{Style.RESET_ALL}\n\n")
        return X

    # Apply a simple imputer (note that this assumes extreme cases of
    # missingness have already been addressed). The SimpleImputer is better
    # for smaller datasets, whereas the IterativeImputer performs best on
    # larger sets.
    # NOTE: the frame is rebuilt from the imputer's array without passing
    # the original index, so X carries a fresh default index from here on.
    imp1 = SimpleImputer()
    X = pd.DataFrame(imp1.fit_transform(X.astype('float32')),
                     columns=X.columns)

    # Standardize X
    if standardize is True:
        scaler = StandardScaler()
        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # Remove low-variance columns
    sel = VarianceThreshold(threshold=(var_thr * (1 - var_thr)))
    sel.fit(X)
    keep_idx = [int(i) for i in sel.get_support(indices=True)]
    if len(nodrop_columns) > 0:
        # Append protected columns only when the filter rejected them.
        # Appending all of them would duplicate every protected column that
        # already passed, and an empty float array would break indexing.
        already_kept = set(keep_idx)
        for c in nodrop_columns:
            if c in X:
                loc = X.columns.get_loc(c)
                if loc not in already_kept:
                    keep_idx.append(loc)
                    already_kept.add(loc)
    good_var_cols = X.columns[np.array(keep_idx, dtype=int)]

    kept_labels = set(good_var_cols)
    low_var_cols = [i for i in X.columns if i not in kept_labels]
    if len(low_var_cols) > 0:
        print(f"Dropping {low_var_cols} for low variance...")
    X = X[good_var_cols]

    if X.empty:
        print(f"\n\n{Fore.RED}Empty feature-space (low-variance): "
              f"{X}{Style.RESET_ALL}\n\n")
        return X

    # Remove multicollinear columns
    if remove_multi is True:
        try:
            rvif = ReduceVIF(thresh=vif_thr)
            # ReduceVIF.transform returns (frame, dropped_columns)
            X = rvif.fit_transform(X)[0]
            if X.empty or len(X.columns) < 5:
                print(f"\n\n{Fore.RED}Empty feature-space "
                      f"(multicollinearity): "
                      f"{X}{Style.RESET_ALL}\n\n")
                return X
        except Exception:
            # Best-effort step: on failure return the features as they were
            # before VIF reduction. Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            print(f"\n\n{Fore.RED}Empty feature-space (multicollinearity): "
                  f"{X}{Style.RESET_ALL}\n\n")
            return X

    print(f"\nX: {X}\n")
    print(f"Features: {list(X.columns)}\n")
    return X


class Razors(object):
    """
    Razors is a callable refit option for `GridSearchCV` whose aim is to
    balance model complexity and cross-validated score in the spirit of the
    "one standard error" rule of Breiman et al. (1984), which showed that
    the tuning hyperparameter associated with the best performing model may be
    prone to overfit. To help mitigate this risk, we can instead instruct
    gridsearch to refit the highest performing 'parsimonious' model, as defined
    using simple statistical rules (e.g. standard error (`sigma`),
    percentile (`eta`), or significance level (`alpha`)) to compare
    distributions of model performance across folds. Importantly, this
    strategy assumes that the grid of multiple cross-validated models
    can be principally ordered from simplest to most complex with respect to
    some target hyperparameter of interest. To use the razors suite, supply
    the `simplify` function partial of the `Razors` class as a callable
    directly to the `refit` argument of `GridSearchCV`.

    Parameters
    ----------
    cv_results_ : dict of numpy(masked) ndarrays
        See attribute cv_results_ of `GridSearchCV`. It is only read, never
        modified.
    param : str
        Parameter whose complexity will be optimized.
    scoring : str
        Refit scoring metric.
    rule : str
        Rule for balancing model complexity with performance.
        Options are 'se', 'percentile', and 'ranksum'. Required by the
        constructor; `simplify` defaults it to 'se'.
    sigma : int
        Number of standard errors tolerance in the case that a standard error
        threshold is used to filter outlying scores across folds. Required if
        `rule`=='se'. Default is 1.
    eta : float
        Percentile tolerance in the case that a percentile threshold
        is used to filter outlier scores across folds. Required if
        `rule`=='percentile'. Default is 0.95 here (`simplify` passes 0.68
        unless told otherwise).
    alpha : float
        An alpha significance level in the case that wilcoxon rank sum
        hypothesis testing is used to filter outlying scores across folds.
        Required if `rule`=='ranksum'. Default is 0.01.

    References
    ----------
    Breiman, Friedman, Olshen, and Stone. (1984) Classification and Regression
    Trees. Wadsworth.

    Notes
    -----
    Here, 'simplest' is defined by the complexity of the model as influenced by
    some user-defined target parameter (e.g. number of components, number of
    estimators, polynomial degree, cost, scale, number hidden units, weight
    decay, number of nearest neighbors, L1/L2 penalty, etc.).

    The callable API accordingly assumes that the `params` attribute of
    `cv_results_` 1) contains the indicated hyperparameter (`param`) of
    interest, and 2) contains a sequence of values (numeric, boolean, or
    categorical) that are ordered from least to most complex.
    """
    __slots__ = ('cv_results', 'param', 'param_complexity', 'scoring',
                 'rule', 'greater_is_better',
                 '_scoring_funcs', '_scoring_dict',
                 '_n_folds', '_splits', '_score_grid',
                 '_cv_means', '_sigma', '_eta', '_alpha')

    def __init__(
            self,
            cv_results_,
            param,
            scoring,
            rule,
            sigma=1,
            eta=0.95,
            alpha=0.01,
    ):
        import sklearn.metrics

        self.cv_results = cv_results_
        self.param = param
        self.scoring = scoring
        self.rule = rule
        self._scoring_funcs = [
            met
            for met in sklearn.metrics.__all__
            if met.endswith(("_score", "_error"))
        ]
        # Set _score metrics to True and _error metrics to False
        self._scoring_dict = {
            met: met.endswith("_score") for met in self._scoring_funcs
        }
        self.greater_is_better = self._check_scorer()
        # Distinct 'splitK' prefixes give the number of folds
        self._n_folds = len({key.split('_')[0] for key in self.cv_results
                             if key.startswith('split')})
        # Extract subgrid corresponding to the scoring metric of interest
        self._splits = [key for key in self.cv_results if
                        key.endswith(f"test_{self.scoring}") and
                        key.startswith('split')]
        # Rows are grid candidates, columns are folds
        self._score_grid = np.vstack([self.cv_results[cv] for cv in
                                      self._splits]).T
        self._cv_means = np.array(np.nanmean(self._score_grid, axis=1))
        self._sigma = sigma
        self._eta = eta
        self._alpha = alpha

    def _check_scorer(self):
        """
        Check whether the target refit scorer is negated. If so, adjust
        greater_is_better accordingly.

        Raises
        ------
        NotImplementedError
            If `scoring` neither matches a known metric name nor starts
            with ``neg_``.
        """

        known = self._scoring_dict
        if (
                self.scoring not in known
                and f"{self.scoring}_score" not in known
        ):
            if self.scoring.startswith("neg_"):
                self.greater_is_better = True
            else:
                raise NotImplementedError(f"Scoring metric {self.scoring} not "
                                          f"recognized.")
        else:
            # First known metric whose name contains `scoring`
            self.greater_is_better = [
                value for key, value in known.items() if
                self.scoring in key][0]
        return self.greater_is_better

    def _best_low_complexity(self):
        """
        Balance model complexity with cross-validated score.

        Return
        ------
        int
            Index of the best-ranked candidate among those whose mean test
            score lies within the cutoffs of the selected rule.

        Raises
        ------
        KeyError
            If no grid parameter name ends with `param`.
        ValueError
            If the rule's tolerance is null, or no candidate lies within
            the cutoffs.
        NotImplementedError
            If `rule` is not one of 'se', 'percentile', 'ranksum'.

        Notes
        -----
        NOTE(review): for the 'se' rule the best-scoring candidate always
        lies inside its own interval, so if `rank_test_<scoring>` ranks it
        first this returns that candidate rather than a simpler one.
        Confirm the intended selection rule.
        """

        # Check parameter(s) whose complexity we seek to restrict
        matches = [name for name in self.cv_results["params"][0].keys()
                   if name.endswith(self.param)]
        if not matches:
            raise KeyError(f"Parameter {self.param} not found in cv grid.")
        hyperparam = matches[0]

        # Select low complexity threshold based on specified evaluation rule
        if self.rule == "se":
            if not self._sigma:
                raise ValueError(
                    "For `se` rule, the tolerance "
                    "(i.e. `_sigma`) parameter cannot be null."
                )
            l_cutoff, h_cutoff = self.call_standard_error()
        elif self.rule == "percentile":
            if not self._eta:
                raise ValueError(
                    "For `percentile` rule, the tolerance "
                    "(i.e. `_eta`) parameter cannot be null."
                )
            l_cutoff, h_cutoff = self.call_percentile()
        elif self.rule == "ranksum":
            if not self._alpha:
                raise ValueError(
                    "For `ranksum` rule, the alpha-level "
                    "(i.e. `_alpha`) parameter cannot be null."
                )
            l_cutoff, h_cutoff = self.call_rank_sum_test()
        else:
            raise NotImplementedError(f"{self.rule} is not a valid "
                                      f"rule of RazorCV.")

        # Use a local selector: writing to the `.mask` of the masked array
        # inside cv_results would corrupt the caller's `cv_results_`.
        within = ((self._cv_means >= float(l_cutoff)) &
                  (self._cv_means <= float(h_cutoff)))

        if not within.any():
            print(f"\nLow: {l_cutoff}")
            print(f"High: {h_cutoff}")
            print(f"{self._cv_means}")
            print(f"hyperparam: {hyperparam}\n")
            raise ValueError("No valid grid columns remain within the "
                             "boundaries of the specified razor")

        ranks = np.asarray(self.cv_results[f"rank_test_{self.scoring}"])
        highest_surviving_rank = np.nanmin(ranks[within])

        return np.flatnonzero(np.isin(ranks, highest_surviving_rank))[0]

    def call_standard_error(self):
        """
        Returns the ``(low, high)`` cutoffs spanning `sigma` standard errors
        around the mean score of the best performing model.
        """

        # Estimate the standard error across folds for each column of the grid
        cv_se = np.array(np.nanstd(self._score_grid, axis=1) /
                         np.sqrt(self._n_folds))

        if self.greater_is_better:
            best_score_idx = np.nanargmax(self._cv_means)
        else:
            best_score_idx = np.nanargmin(self._cv_means)

        # The interval is symmetric around the best mean whichever direction
        # is "better"; only the choice of the best candidate differs.
        tolerance = self._sigma * cv_se[best_score_idx]
        l_cutoff = self._cv_means[best_score_idx] - tolerance
        h_cutoff = self._cv_means[best_score_idx] + tolerance

        return l_cutoff, h_cutoff

    def call_rank_sum_test(self):
        """
        Returns the ``(low, high)`` mean-score cutoffs spanned by the models
        whose paired performance across folds is insignificantly different
        from the average highest performing, at a predefined `alpha` level
        of significance. If every comparison is significant, both cutoffs
        equal the best model's mean.
        """

        from scipy.stats import wilcoxon
        import itertools

        if self.greater_is_better:
            best_score_idx = int(np.nanargmax(self._cv_means))
        else:
            best_score_idx = int(np.nanargmin(self._cv_means))

        # Perform signed Wilcoxon rank sum test for each pair combination of
        # columns against the best average score column
        tests = [pair for pair in itertools.combinations(range(
            self._score_grid.shape[0]), 2) if best_score_idx in pair]

        # Keep the candidates of every non-significant comparison
        survivors = set()
        for pair in tests:
            p_value = wilcoxon(self._score_grid[pair[0], :],
                               self._score_grid[pair[1], :])[1]
            if p_value > self._alpha:
                survivors.update(pair)
        survivors.discard(best_score_idx)
        survivors = np.array(sorted(survivors), dtype=int)

        if survivors.size == 0:
            best_mean = self._cv_means[best_score_idx]
            return best_mean, best_mean

        # argmin/argmax below index the survivor subset, so map them back
        # through `survivors` before looking up the full means array.
        ranks = np.asarray(
            self.cv_results[f"rank_test_{self.scoring}"])[survivors]
        top_mean = self._cv_means[survivors[np.nanargmin(ranks)]]
        bottom_mean = self._cv_means[survivors[np.nanargmax(ranks)]]
        l_cutoff = min(top_mean, bottom_mean)
        h_cutoff = max(top_mean, bottom_mean)

        return l_cutoff, h_cutoff

    def call_percentile(self):
        """
        Returns the ``(low, high)`` cutoffs given by the `eta` percentile,
        and its complement, of the best performing model's fold scores.
        """

        # Estimate the indicated percentile, and its inverse, across folds for
        # each column of the grid
        perc_cutoff = np.nanpercentile(self._score_grid,
                                       [100 * self._eta,
                                        100 - 100 * self._eta], axis=1)

        if self.greater_is_better:
            best_score_idx = np.nanargmax(self._cv_means)
        else:
            best_score_idx = np.nanargmin(self._cv_means)

        # Order the pair so that eta < 0.5 cannot produce inverted bounds
        bounds = perc_cutoff[:, best_score_idx]
        l_cutoff = np.min(bounds)
        h_cutoff = np.max(bounds)

        return l_cutoff, h_cutoff

    @staticmethod
    def simplify(param, scoring, rule='se', sigma=1, eta=0.68, alpha=0.01):
        """
        Callable to be run as `refit` argument of `GridsearchCV`.

        Parameters
        ----------
        param : str
            Parameter with the largest influence on model complexity.
        scoring : str
            Refit scoring metric.
        rule : str
            One of 'se', 'percentile' or 'ranksum'. Default is 'se'.
        sigma : int
            Number of standard errors tolerance in the case that a standard
            error threshold is used to filter outlying scores across folds.
            Only applicable if `rule`=='se'. Default is 1.
        eta : float
            Acceptable percent tolerance in the case that a percentile
            threshold is used. Only applicable if `rule`=='percentile'.
            Default is 0.68.
        alpha : float
            Alpha-level to use for signed wilcoxon rank sum testing.
            Only applicable if `rule`=='ranksum'. Default is 0.01.

        Returns
        -------
        functools.partial
            Callable taking `cv_results_` and returning the index selected
            by `_best_low_complexity`.
        """
        from functools import partial

        def razor_pass(
                cv_results_, param, scoring, rule, sigma, alpha, eta
        ):
            rcv = Razors(cv_results_, param, scoring, rule=rule,
                         sigma=sigma, alpha=alpha, eta=eta)
            return rcv._best_low_complexity()

        return partial(
            razor_pass,
            param=param,
            scoring=scoring,
            rule=rule,
            sigma=sigma,
            alpha=alpha,
            eta=eta,
        )

# Load data

In [3]:
# Directory holding the competition CSV files. The trailing slash matters:
# file names are appended to it directly below.
working_dir = '/kaggle/input/nlp-getting-started/'
# Alternative path for running outside Kaggle:
#working_dir = '/home/dpys/Documents/Kaggle_Competitions/nlp-getting-started/'
df_train = pd.read_csv(f"{working_dir}train.csv")
df_test = pd.read_csv(f"{working_dir}test.csv")
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
# Show full cell contents (tweet text) instead of truncating wide columns
pd.set_option('display.max_colwidth', None)
# Last expression of the cell: renders the first rows as the cell output
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


# Exploratory Data Analysis

In [4]:
# Split the training frame by label to inspect class balance
df_train_pos = df_train.loc[df_train["target"] == 1]
df_train_neg = df_train.loc[df_train["target"] == 0]
print(f"No. of positive training examples = {len(df_train_pos)}")
print(f"No. of negative training examples = {len(df_train_neg)}")
# Distinct keyword values as returned by Series.unique()
train_keywords_unique = df_train["keyword"].unique()
print(f"No. of unique keywords = {len(train_keywords_unique)}")
# Rows that actually carry a keyword
df_train_notnull_keywords = df_train.loc[df_train["keyword"].notnull()]
print(f"No of train examples with keyword not null = {len(df_train_notnull_keywords)}")

No. of positive training examples = 3271
No. of negative training examples = 4342
No. of unique keywords = 222
No of train examples with keyword not null = 7552


# Feature Engineering
## NLP libraries and functions

In [5]:
import re
import nltk
import gensim
import sys
import spacy
import emoji
import warnings
import en_core_web_sm
from spacy import displacy
from spacy.tokenizer import Tokenizer
from tqdm import tqdm
import seaborn as sns
import string
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.util import compile_infix_regex

# Alias so code written against the name `unicode` keeps working
# (presumably a Python 2-era leftover -- confirm it is still needed).
unicode = str

# ASCII punctuation characters
special = string.punctuation
# Silences every warning for the rest of the session
warnings.filterwarnings("ignore")
# Fetch the NLTK corpora used by the stop-word list and the lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): 'punkt' is not among the downloads above, so this load
# presumably relies on the resource already being installed -- confirm.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = SnowballStemmer("english")
nlp = spacy.load('en_core_web_lg')

# Tweet symbols
# Retrieve the default token-matching regex pattern
# (`_get_regex_pattern` is a private spaCy helper, per its leading underscore)
re_token_match = spacy.tokenizer._get_regex_pattern(nlp.Defaults.token_match)
# Add #hashtag pattern
re_token_match = f"({re_token_match}|#\\w+)"
# Install the extended pattern on this pipeline's tokenizer
nlp.tokenizer.token_match = re.compile(re_token_match).match

# Punctuations I want to remove, including the empty token
puncts = ['\u200d','?', '....','..','...','','@','#', ',', '.', '"', ':', ')', '(', '-', '!', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '*', '+', '\\', 
    '•', '~', '£', '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', 
    '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
    '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 
    'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', 
    '¹', '≤', '‡', '√', '!','🅰','🅱']

EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

# Found a dictionary of common contractions and colloquial language
contraction_colloq_dict = {"btw": "by the way", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will 
have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have"}

# Initializing the lemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()

# Optionally attach a language detector to the spaCy pipeline configured
# earlier. The pipeline is deliberately NOT reloaded here: calling
# spacy.load() again would rebind `nlp` to a fresh pipeline and silently
# drop the custom #hashtag token_match installed on its tokenizer.
try:
    language_detector = LanguageDetector()
    nlp.add_pipe(language_detector)
except Exception as err:
    # Best-effort step: `LanguageDetector` may be undefined or rejected by
    # the installed spaCy version. Report it rather than hiding it, and
    # let KeyboardInterrupt/SystemExit propagate.
    print(f"Language detector not added to the spaCy pipeline: {err!r}")

# NOTE(review): not populated in this cell; presumably filled in by later
# code -- confirm.
token_dict = {}

def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
def concat_df(train_data, test_data):
    """Stack ``train_data`` on top of ``test_data`` and renumber the rows.

    The frames are concatenated with ``sort=True`` and the resulting index is
    replaced by a fresh 0..n-1 range.
    """
    stacked = pd.concat([train_data, test_data], sort=True)
    return stacked.reset_index(drop=True)

def divide_df(df_all, train_len):
    """Split a combined frame back into its train and test parts.

    Rows are selected by index *label* (``.loc``): labels up to and including
    ``train_len - 1`` form the train part, labels from ``train_len`` onwards
    form the test part, which is returned without its ``target`` column.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    train_part = df_all.loc[:train_len - 1]
    test_part = df_all.loc[train_len:].drop(columns='target')
    return train_part, test_part

def List_of_words(df):
    """Return every whitespace-separated word of ``df['text']`` as one flat list.

    Iteration over the tweets is wrapped in ``tqdm``.
    """
    words = []
    for tweet in tqdm(df['text']):
        words.extend(tweet.split())
    return words

def List_of_tweets(df):
    """Return the values of ``df['text']`` as a list, iterating through ``tqdm``."""
    return list(tqdm(df['text']))

def mislabeled_tweets(df) :
    """Return the tweet texts that appear with more than one distinct target.

    Rows are grouped by ``text``; a text is reported when its group holds more
    than one unique ``target`` value. Texts are ordered by that count,
    descending.
    """
    per_text = df.groupby(['text']).nunique()
    per_text = per_text.sort_values(by='target', ascending=False)
    conflicting = per_text.loc[per_text['target'] > 1, 'target']
    return conflicting.index.tolist()

# Correct mislabeled tweets
def correcting_labels(df) :
    """Overwrite the target of a fixed set of tweets with hand-picked labels.

    ``df`` is modified in place and also returned. Every row whose ``text``
    equals one of the tweets listed below receives the paired label; all other
    rows keep their original ``target``.

    The corrected column replaces ``target`` through a rename, so the result
    no longer depends on the order (or number) of the other columns. For a
    frame laid out as ``id, keyword, location, text, target`` the output is
    identical to the previous positional relabelling.

    Args:
        df: DataFrame with at least ``text`` and ``target`` columns.

    Returns:
        The same DataFrame object with corrected ``target`` values.
    """
    # (exact tweet text, corrected label) pairs.
    relabels = [
        ('like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 0),
        ('Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 0),
        ('To fight bioterrorism sir.', 0),
        ('.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 1),
        ('CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 1),
        ('#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 0),
        ('In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 0),
        ('Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 1),
        ('RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 1),
        ("Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 0),
        ("wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 0),
        ("He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 0),
        ("Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 0),
        ("The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 0),
        ("Caution: breathing may be hazardous to your health.", 1),
        ("I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 0),
        ("#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 0),
        ("that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 0),
    ]

    df['target_relabeled'] = df['target'].copy()
    for tweet_text, label in relabels:
        df.loc[df['text'] == tweet_text, 'target_relabeled'] = label

    # Swap the corrected column in under the original name. Renaming (rather
    # than assigning a hard-coded column list) cannot mislabel other columns.
    df.drop('target', axis=1, inplace=True)
    df.rename(columns={'target_relabeled': 'target'}, inplace=True)
    return df

def location_binging(df):
    """Collapse common free-text ``location`` values into coarse region labels.

    Exact matches of the keys below are replaced by the mapped label; any
    other value (including missing values) is left unchanged. ``df`` is
    modified in place and also returned.

    The replaced column is assigned back to ``df`` explicitly. Calling
    ``replace(..., inplace=True)`` on ``df['location']`` is a chained in-place
    update, which pandas warns about and which has no effect on ``df`` under
    Copy-on-Write.
    """
    region_by_location = {
        'United States': 'USA',
        'New York': 'USA',
        "London": 'UK',
        "Los Angeles, CA": 'USA',
        "Washington, D.C.": 'USA',
        "California": 'USA',
        "Chicago, IL": 'USA',
        "Chicago": 'USA',
        "New York, NY": 'USA',
        "California, USA": 'USA',
        "FLorida": 'USA',
        "Nigeria": 'Africa',
        "Kenya": 'Africa',
        "Everywhere": 'Worldwide',
        "San Francisco": 'USA',
        "Florida": 'USA',
        "United Kingdom": 'UK',
        "Los Angeles": 'USA',
        "Toronto": 'Canada',
        "San Francisco, CA": 'USA',
        "NYC": 'USA',
        "Seattle": 'USA',
        "Earth": 'Worldwide',
        "Ireland": 'UK',
        "London, England": 'UK',
        "New York City": 'USA',
        "Texas": 'USA',
        "London, UK": 'UK',
        "Atlanta, GA": 'USA',
        "Mumbai": "India",
    }
    df['location'] = df['location'].replace(region_by_location)
    return df

def convert_emoticons(text):
    """Replace emoticons in ``text`` with underscore-joined descriptions.

    Each key of ``EMOTICONS`` is spliced unescaped into a regex group; each
    match is replaced by the mapped description with commas removed and its
    words joined by ``_``.
    """
    # NOTE(review): presumably the EMOTICONS keys are already regex patterns,
    # since they are used without escaping -- verify against its definition.
    for pattern, description in EMOTICONS.items():
        label = "_".join(description.replace(",", "").split())
        text = re.sub(u'(' + pattern + ')', label, text)
    return text

def convert_emojis(text):
    """Replace each key of ``emoji.EMOJI_UNICODE_ENGLISH`` found in ``text``.

    Every occurrence of a key is replaced by its mapped value with commas and
    colons removed and the remaining words joined by ``_``.

    Keys are matched literally. They were previously spliced unescaped into a
    regular expression, so a key containing a metacharacter (``*``, ``+``,
    parentheses, ...) matched the wrong text or could fail to compile.
    """
    # NOTE(review): in some `emoji` releases EMOJI_UNICODE_ENGLISH maps
    # ":name:" -> emoji character, which would make this substitution run in
    # the opposite direction to what the function name suggests -- confirm
    # against the installed version.
    UNICODE_EMO = emoji.EMOJI_UNICODE_ENGLISH
    for emot, mapped in UNICODE_EMO.items():
        replacement = "_".join(mapped.replace(",", "").replace(":", "").split())
        text = text.replace(emot, replacement)
    return text

def clean_text(text):
    """Normalise a raw tweet into lower-cased, whitespace-collapsed text.

    Steps, in order: blank out unwanted characters, tidy whitespace and
    hyphenated line breaks, drop apostrophes, remove tokens containing ``@``,
    remove URLs and HTML tags, convert emoticons/emoji to words, strip numbers
    that occur in space-separated runs, and tidy stray brackets/punctuation.

    Changes from the previous version:
      * The "remove emails" pattern is applied to the whole string. It used
        to be applied to one character at a time, which only deleted the
        ``@`` characters themselves.
      * ``[A-z]`` (which also matches ``[ \\ ] ^ _ ` ``) is now ``[A-Za-z]``.
      * The backslash-period cleanup matches a literal period instead of any
        character following a backslash.
      * A duplicated apostrophe removal was dropped (it was a no-op).

    Args:
        text: Any value; it is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    # Blank out control characters, the \x7f-\xff range and assorted symbols.
    # NOTE(review): `\x0c6` parses as a form feed followed by a literal "6",
    # so every digit 6 is blanked here -- presumably unintended; confirm.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', str(text))
    # Blank out anything outside the allowed letters, digits and punctuation.
    text = re.sub(r'[^A-Za-z0-9\\n+^A-Za-z0-9\)\(\.\,\;\\\'\/\?\&\%\@\!\+\:\-]', ' ', text)
    # For text with very many double-space separated chunks, glue together the
    # characters of chunks that themselves contain single spaces.
    if len(text.split("  ")) > 1000:
        text = " ".join(["".join(w.split(" ")) if len(w.split(' '))>1 else w for w in text.split("  ")])
    text = re.sub(r'\s', ' ', text)
    # Re-join words broken as "exam- ple".
    text = re.sub(r"([A-Za-z])\- ([A-Za-z])", r"\1\2", text)
    text = text.replace('\'', '')
    text = text.replace('. .', '.')
    text = re.sub(r"\s+", " ", text)

    # Remove emails (and any other token containing "@")
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Remove URL's
    text_nourl = re.sub(r'\w+:\/{2}[\d+\w-]+(\.[\d+\w-]+)*(?:(?:\/[^\s+/]*))*', '', text)
    url = re.compile(r'https?://\S+|www\.\S+')
    text_nourl = url.sub(r'', text_nourl)

    # Remove html
    # NOTE(review): "<" and ">" were already blanked by the first substitution,
    # so this looks like it can no longer match -- confirm before relying on it.
    html = re.compile(r'<.*?>')
    text = html.sub(r'', text_nourl)

    # Convert emoticons/emoji
    text = convert_emoticons(text)
    text = convert_emojis(text)

    # Remove any chunks of consecutive numbers: for every "<digits> <digits>"
    # pair, delete each of its numbers everywhere in the text.
    # NOTE(review): `int` drops leading zeros and `str.replace` also hits the
    # same digits inside longer numbers -- kept as-is, confirm this is wanted.
    for pair in re.findall(r'\d+[ \t]\d+', text):
        for number in pair.split():
            text = text.replace(str(int(number)), '')

    # Tidy brackets and punctuation left dangling by the removals above.
    texter_filt = re.sub(r"\( ", "", text)
    texter_filt = re.sub(r" \)", "", texter_filt)
    texter_filt = re.sub(r"\(\)", "", texter_filt)
    texter_filt = re.sub(r"\(\s+\)", "", texter_filt)
    texter_filt = re.sub(r" \.", ".", texter_filt)
    texter_filt = re.sub(r"\\\.", ".", texter_filt)
    texter_filt = re.sub(r" \,", ",", texter_filt)
    texter_filt = re.sub(r"\\,", ",", texter_filt)
    texter_filt = re.sub(r"\s+\s+", " ", texter_filt)
    texter_filt = re.sub(' +', ' ', texter_filt)
    return texter_filt.lower()

def word_count(df_all) :
    """Add a ``word_count`` column: whitespace-separated tokens per ``text``.

    ``df_all`` is modified in place and returned.
    """
    def _n_words(value):
        return len(str(value).split())

    df_all['word_count'] = df_all['text'].apply(_n_words)
    return df_all

def unique_word_count(df_all) :
    """Add a ``unique_word_count`` column: distinct tokens per ``text``.

    Tokens are whitespace-separated and compared case-sensitively.
    ``df_all`` is modified in place and returned.
    """
    def _n_distinct_words(value):
        distinct = set(str(value).split())
        return len(distinct)

    df_all['unique_word_count'] = df_all['text'].apply(_n_distinct_words)
    return df_all

def stop_word_count(df_all) :
    """Add a ``stop_word_count`` column: English stop words per ``text``.

    Each ``text`` is lower-cased and split on whitespace; tokens found in
    NLTK's English stop-word list are counted. ``df_all`` is modified in place
    and returned.
    """
    # Build the lookup once. The list used to be re-read from the corpus for
    # every single word and then scanned linearly.
    english_stopwords = set(stopwords.words('english'))

    def _n_stop_words(value):
        return sum(w in english_stopwords for w in str(value).lower().split())

    df_all['stop_word_count'] = df_all['text'].apply(_n_stop_words)
    return df_all

def url_count(df_all) :
    """Add a ``url_count`` column: tokens of ``text`` that contain ``http``.

    Matching is done on the lower-cased, whitespace-split text.
    ``df_all`` is modified in place and returned.
    """
    def _n_urls(value):
        tokens = str(value).lower().split()
        # Any token containing 'https' also contains 'http', so one test
        # covers both.
        return len([tok for tok in tokens if 'http' in tok])

    df_all['url_count'] = df_all['text'].apply(_n_urls)
    return df_all

def mean_word_length(df_all) :
    """Add a ``mean_word_length`` column: average token length per ``text``.

    Tokens are whitespace-separated; the average is taken with ``np.mean``.
    ``df_all`` is modified in place and returned.
    """
    def _avg_token_length(value):
        lengths = [len(token) for token in str(value).split()]
        return np.mean(lengths)

    df_all['mean_word_length'] = df_all['text'].apply(_avg_token_length)
    return df_all

def char_count(df_all) :
    """Add a ``char_count`` column: length of ``str(text)`` for every row.

    ``df_all`` is modified in place and returned.
    """
    def _n_chars(value):
        return len(str(value))

    df_all['char_count'] = df_all['text'].apply(_n_chars)
    return df_all

def punctuation_count(df_all) :
    """Add a ``punctuation_count`` column: characters in ``string.punctuation``.

    ``df_all`` is modified in place and returned.
    """
    def _n_punctuation(value):
        marks = [ch for ch in str(value) if ch in string.punctuation]
        return len(marks)

    df_all['punctuation_count'] = df_all['text'].apply(_n_punctuation)
    return df_all

def hashtag_count(df_all) :
    """Add a ``hashtag_count`` column: number of ``#`` characters per ``text``.

    ``df_all`` is modified in place and returned.
    """
    def _n_hashes(value):
        return str(value).count('#')

    df_all['hashtag_count'] = df_all['text'].apply(_n_hashes)
    return df_all

def mention_count(df_all) :
    """Add a ``mention_count`` column: number of ``@`` characters per ``text``.

    ``df_all`` is modified in place and returned.
    """
    def _n_ats(value):
        return str(value).count('@')

    df_all['mention_count'] = df_all['text'].apply(_n_ats)
    return df_all

def stemming(text) :
    """Porter-stem every whitespace-separated word of ``text``.

    Returns the stems joined by single spaces.
    """
    porter = nltk.stem.PorterStemmer()
    stems = (porter.stem(token) for token in text.split())
    return " ".join(stems)

def flatten(l):
    """
    Flatten arbitrarily nested iterables into a single stream of items.

    Strings and bytes are treated as atoms and yielded whole rather than being
    split into characters.

    Args:
        l: An iterable whose elements may themselves be iterables.

    Yields:
        The leaf elements in depth-first, left-to-right order.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

# Infix patterns handed to the custom tokenizer: the ellipsis and icon lists
# followed by hand-written rules for arithmetic signs between digits, a period
# or comma between letters, and ":<>=/" between an alphanumeric and a letter,
# plus "-" and "~" on their own.
infixes = [
    *LIST_ELLIPSES,
    *LIST_ICONS,
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # The rule that splits on hyphens between letters is deliberately left out:
    # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    r'''[-~]''',
]

def tokenize(sentence, infixes):
    """Install the custom tokenizer on ``nlp`` and return non-stop-word tokens.

    A ``Tokenizer`` built from ``nlp``'s default prefix/suffix rules and the
    given ``infixes`` is assigned to the global ``nlp.tokenizer``. The texts of
    the tokens in ``sentence`` whose ``is_stop`` flag is false are returned.

    Args:
        sentence: Iterable of tokens exposing ``.text`` and ``.is_stop``.
        infixes: Infix regex patterns for ``compile_infix_regex``.

    Returns:
        List of token texts with stop words removed.
    """
    # NOTE(review): `sentence` is iterated as-is, so it looks like it was
    # tokenized before this call; the tokenizer installed here would then only
    # affect later `nlp(...)` calls. It is also rebuilt on every call --
    # confirm both are intended.
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = compile_infix_regex(infixes)

    # Adds support to use `-` as the delimiter for tokenization
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=None,
    )
    return [token.text for token in sentence if not token.is_stop]

def replace_from_dict(x, dic):
    """Expand tokens of ``x`` according to ``dic``, in place.

    For each ``key -> replacement`` entry, taken in dictionary order, every
    token equal to ``key`` is replaced by the whitespace-separated words of
    ``replacement``. Entries are applied one after another, so words produced
    by an earlier entry can still be matched by a later one.

    The list is rebuilt for each matching entry instead of being edited while
    it is enumerated. The old approach skipped the following token when a
    replacement was empty and never terminated when a replacement contained
    its own key.

    Args:
        x: List of tokens; modified in place.
        dic: Mapping of token to replacement string.

    Returns:
        The same list object ``x``.
    """
    for key, replacement in dic.items():
        if key not in x:
            continue
        expansion = replacement.split()
        rebuilt = []
        for token in x:
            if token == key:
                rebuilt.extend(expansion)
            else:
                rebuilt.append(token)
        x[:] = rebuilt
    return x

def lemmatize_list(x):
    """Lemmatize a list of tokens and return the lemmas as a new list.

    The tokens are joined with spaces and re-tokenized with
    ``nltk.word_tokenize``; each resulting word is lemmatized using the
    part of speech reported by ``get_wordnet_pos``.
    """
    joined = " ".join(x)
    lemmas = []
    for word in nltk.word_tokenize(joined):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
    return lemmas

def remove_from_list(x, stuff_to_remove) -> list:
    """Remove from ``x``, in place, every token equal to an unwanted item.

    The list is filtered in one pass. Deleting entries while enumerating the
    same list (the previous approach) skipped the element after each
    deletion, so runs of consecutive matches were only partly removed.

    Args:
        x: List of tokens; modified in place.
        stuff_to_remove: Iterable of items to drop (compared with ``==``).

    Returns:
        The same list object ``x``.
    """
    # Materialise once so generators work and a plain string is treated as a
    # collection of characters rather than searched by substring.
    unwanted = tuple(stuff_to_remove)
    x[:] = [token for token in x if token not in unwanted]
    return x

def get_wordnet_pos(word):
    """Map ``word``'s NLTK POS tag to a WordNet part-of-speech constant.

    Only the first letter of the tag is used: J, N, V and R map to adjective,
    noun, verb and adverb respectively; anything else falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    pos_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return pos_by_letter.get(first_letter, wordnet.NOUN)

def sent_to_topics(text):
    """Turn raw ``text`` into a flat list of processed tokens.

    Pipeline: run ``nlp`` and split into sentences, tokenize each sentence
    with ``tokenize`` and gensim's ``simple_preprocess`` (accents removed),
    flatten, expand entries of ``contraction_colloq_dict``, lemmatize, and
    finally drop tokens listed in ``puncts``.
    """
    doc = nlp(text)
    sentences = list(doc.sents)

    # Tokenize: one list of words per sentence, then flattened.
    words_per_sentence = [
        list(
            gensim.utils.simple_preprocess(
                ' '.join(tokenize(sentence, infixes)), deacc=True
            )
        )
        for sentence in sentences
    ]
    topics = list(flatten(words_per_sentence))

    # Map contractions
    topics = replace_from_dict(topics, contraction_colloq_dict)

    # Lemmatize
    topics = lemmatize_list(topics)

    # Remove punctuation
    topics = remove_from_list(topics, puncts)

    return topics

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## NLP Munging

In [None]:
# mislabeled_tweets(df_train)

In [None]:
# df_train = correcting_labels(df_train)
# df_all = concat_df(df_train, df_test)

In [None]:
# df_all = stop_word_count(punctuation_count(hashtag_count(url_count(location_binging(df_all))))) 

# df_all['text'] = df_all['text'].apply(lambda x : clean_text(x))
# df_all['text'] = df_all['text'].apply(lambda x : sent_to_topics(x))
# df_all['text'] = df_all['text'].apply(lambda x : ' '.join(x))

# df_all = mention_count(char_count(mean_word_length(unique_word_count(word_count(df_all)))))
# df_all.to_csv('df_nlp.csv', index=False)

In [6]:
# df_all = pd.read_csv(f"../input/twitterdisasterclassoutput/df_nlp.csv")
# train, test = divide_df(df_all, df_train.shape[0])

## Feature embedding

In [None]:
# train_tweet_vectors = None
# test_tweet_vectors = None
# with nlp.disable_pipes():
#     train_tweet_vectors = np.array([nlp(str(row.text)).vector for id, row in pd.DataFrame(train[['id', 'text']]).reset_index(drop=True).iterrows()])
#     test_tweet_vectors = np.array([nlp(str(row.text)).vector for id, row in pd.DataFrame(test[['id', 'text']]).reset_index(drop=True).iterrows()])

# Prepare target label vectors

In [7]:
# train_targets = df_train["target"].values
# #print(train_tweet_vectors.mean(axis=0).shape, train_tweet_vectors.std(axis=0).shape)

# Preprocess

In [None]:
# other_columns = ['url_count', 'hashtag_count', 'punctuation_count', 'stop_word_count', 'word_count', 'unique_word_count', 'mean_word_length', 'char_count', 'mention_count']
# train_other = train[other_columns].reset_index(drop=True)
# X_train=pd.concat([pd.DataFrame(train_tweet_vectors), train_other[other_columns]], axis=1)
# test_other = test[other_columns].reset_index(drop=True)
# X_test=pd.concat([pd.DataFrame(test_tweet_vectors), test_other], axis=1)
# y_train=train_targets
# X_all = concat_df(X_train, X_test)

# preprocess = FunctionTransformer(preprocess_x_y)
# cleaned = preprocess.fit_transform(X=X_all)

# X_train_clean = cleaned.head(X_train.shape[0])
# X_test_clean = cleaned.tail(X_test.shape[0])

# X_train_clean = X_train_clean.reset_index(drop=True)
# X_test_clean = X_test_clean.reset_index(drop=True)

# X_train_clean.to_csv('/kaggle/working/df_train_preprocessed.csv', index=False)
# X_test_clean.to_csv('/kaggle/working/df_test_preprocessed.csv', index=False)

In [41]:
# Seed reused for every random_state below.
seed=42
# Load the feature matrices that the (commented-out) preprocessing cell wrote.
X_train_clean = pd.read_csv('../input/twitterdisasterclassoutput/df_train_preprocessed.csv')
X_test_clean = pd.read_csv('../input/twitterdisasterclassoutput/df_test_preprocessed.csv')

# Hold out part of the labelled data for out-of-sample scoring.
# NOTE(review): the only visible assignment to `train_targets` is commented out
# in the "Prepare target label vectors" cell -- confirm it is defined before
# this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X_train_clean, train_targets, random_state=seed)

# Tuning

In [42]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Short names of the candidate models; `estimators` and `params` are aligned
# with this list by position / key.
models = [
#             'GBC',
            'SVC',
            'EN'
         ]

# `class_weight='balanced'` replaces the former 'auto', which is not a value
# LogisticRegression accepts (only a dict, 'balanced' or None are), and makes
# the elastic-net model consistent with the SVC.
estimators = [
#         GradientBoostingClassifier(max_features='sqrt', loss='deviance', criterion='friedman_mse', random_state=seed),
        SVC(random_state=seed, kernel='rbf', class_weight='balanced'),
        LogisticRegression(penalty='elasticnet', fit_intercept=True,
                                                  solver='saga',
                                                  class_weight='balanced',
                                                  random_state=seed,
                                                  warm_start=True)
]

# Hyperparameter grids, keyed by model name.
params = {
#             models[0]:{'max_depth': [2, 3, 4], 'min_samples_leaf': [0.1, 0.2, 0.3], 'min_samples_split': [0.1, 0.2, 0.3], 'n_estimators': [500, 1000, 2000],  'learning_rate':[0.01, 0.05, 0.1, 0.15], 'subsample': [0.7, 0.8, 0.9]},
            models[0]: {'C':[1, 0.5, 0.1], 'tol': [0.01, 0.001]},
            models[1]: {'l1_ratio': [0.85, 0.9, 0.95], 'C':[1, 0.1], 'tol': [0.01, 0.001]}
         }
    
def get_pca_range(X):
    """Return candidate PCA component counts for ``X`` in steps of 10.

    PCA is fitted twice, keeping 95% and then 99% of the variance. The two
    resulting component counts are rounded to the nearest ten and used as the
    start and (exclusive) stop of an ``np.arange`` with step 10.
    """
    from sklearn.decomposition import PCA

    def _components_kept(variance_fraction):
        pca = PCA(n_components = variance_fraction)
        pca.fit(X)
        return pca.components_.shape[0]

    min_comps = _components_kept(0.95)
    max_comps = _components_kept(0.99)

    lower = np.round(min_comps, -1)
    upper = np.round(max_comps, -1)
    return np.arange(start=lower, stop=upper, step=10)

# Candidate values for the PCA `n_components` grid, computed on the full
# preprocessed training matrix; the bare name displays the list in the notebook.
n_comps_range = list(get_pca_range(X_train_clean))
n_comps_range

[200, 210, 220, 230, 240, 250]

# Model Selection with Grid Search 

## Take a subset of the data to limit computation

In [43]:
# Renumber each split from zero and keep only its first 500 rows; the label
# vectors are wrapped in DataFrames so they can be sliced the same way.
X_train = X_train.reset_index(drop=True).head(500)
y_train = pd.DataFrame(y_train).reset_index(drop=True).head(500)

X_test = X_test.reset_index(drop=True).head(500)
y_test = pd.DataFrame(y_test).reset_index(drop=True).head(500)

In [None]:
# Per-model results: out-of-sample scores, best parameters, best estimator.
model_factory = {}

inner_scoring = "f1"

# Features are the union of PCA components and ANOVA-selected raw columns.
feature_select = FeatureUnion([('pca', decomposition.PCA(random_state=seed)),
                               ("anova", SelectFwe(f_classif, alpha=0.01))])

for name, estimator in zip(models, estimators):
    print(name)
    model_factory[name] = {}

    # Pipeline feature selection (PCA + ANOVA) with model fitting
    pipe = Pipeline(
        [
             ('feature_select', feature_select),
             (name, estimator),
        ]
    )

    # Prefix each hyperparameter with its pipeline step name.
    model_params = {
        f"{name}__{hyperparam}": values
        for hyperparam, values in params[name].items()
    }
    model_params["feature_select__pca__n_components"] = n_comps_range

    pipe_grid_cv = GridSearchCV(pipe, model_params, scoring=[inner_scoring],
                       refit=Razors.simplify(param='feature_select__pca__n_components',
                                             scoring=inner_scoring, rule="se", sigma=1),
                       cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=seed), n_jobs=-1)
    pipe_grid_cv.fit(X_train, y_train.values.ravel())

    # One accuracy score per outer fold, from re-running the search on the
    # held-out split.
    model_factory[name]['oos_score'] = cross_val_score(pipe_grid_cv, X_test, y_test.values.ravel(),
                                                       scoring='accuracy',
                                                       cv=StratifiedKFold(n_splits=10,
                                                                          shuffle=True,
                                                                          random_state=seed + 1))
    model_factory[name]['best_params'] = pipe_grid_cv.best_params_
    model_factory[name]['best_estimator'] = pipe_grid_cv.best_estimator_

# Pick the model with the highest mean out-of-sample score. `oos_score` is an
# array of fold scores, so it has to be reduced before comparison, and `max`
# over the dict yields the model *name*, which is then used to look up the
# fitted pipeline (previously `.fit` was called on the name string itself).
best_model_name = max(model_factory,
                      key=lambda v: np.mean(model_factory[v]['oos_score']))
best_estimator = model_factory[best_model_name]['best_estimator']

best_estimator.fit(X_train_clean, train_targets)

final_est = best_estimator

## Fit best model to full training data

In [60]:
from sklearn import ensemble
# Meta-classifier that combines the base models' predictions.
meta_clf = GaussianNB()

# base_models = [('GBC', model_factory['GBC']['best_estimator']['GBC']), ('SVC', model_factory['SVC']['best_estimator']['SVC']), ('EN', model_factory['EN']['best_estimator']['EN'])]

# Base models: the tuned classifier step taken out of each grid-search winner.
base_models = [('SVC', model_factory['SVC']['best_estimator']['SVC']), ('EN', model_factory['EN']['best_estimator']['EN'])]

# CV splitter used by the stacking classifier.
outer_stacked = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed + 1)

ec = ensemble.StackingClassifier(estimators=base_models, final_estimator=meta_clf, passthrough=False, cv=outer_stacked)

# Pipeline feature selection (PCA) with model fitting
# The feature-selection step is the one from the SVC grid-search winner, so
# both base models receive features chosen for the SVC.
pipe = Pipeline(
    [
         ('feature_select', model_factory['SVC']['best_estimator']['feature_select']),
         ('vc', ec),
    ]
)

# Fit on the full preprocessed training set.
pipe.fit(X_train_clean, train_targets)

# This overrides any `final_est` set by an earlier cell.
final_est = pipe

# Submission

In [None]:
# Predict on the preprocessed test features and write the predictions into the
# `target` column of the sample submission.
y_test_pred = final_est.predict(X_test_clean)
df_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
# NOTE(review): assumes the rows of X_test_clean are in the same order as the
# sample submission -- confirm.
df_submission["target"] = y_test_pred
df_submission.to_csv('/kaggle/working/submission.csv',index=False)
# Bare name so the notebook displays the frame.
df_submission