In [1]:
import pandas as pd
import nltk
# Fetch every NLTK resource this notebook relies on, not only the stop words:
# WordNetLemmatizer needs the WordNet corpus and nltk.pos_tag needs the
# averaged-perceptron tagger model. The tagger is published under two ids
# depending on the NLTK release, so both are requested; nltk.download()
# reports (rather than raises on) an id it cannot find.
for _nltk_resource in (
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

In [1]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import re
import ftfy
import scrubadub
# import string
from tabulate import tabulate
import demoji
# import emoji
from nltk.corpus import stopwords
import contractions
from textblob import TextBlob


# --- Shared NLP resources ---------------------------------------------------
# English stop words plus the domain specific ones.
stop = stopwords.words('english') + ['panasonic']
# Scrubber restricted to e-mail detection only.
emailDetector = scrubadub.Scrubber(
    detector_list=[scrubadub.detectors.EmailDetector])
wnl = WordNetLemmatizer()

# --- Label encoding -----------------------------------------------------------
label_codes = {'No': 0, 'Yes': 1}

# --- Patterns used by the replace_* helpers -----------------------------------
t_handle_regex = r'(^|[^@\w])@(\w{1,15})\b'
t_hashtag_regex = r"#(\w+)"
t_url_regex = r"https?://\S+|www\.\S+"
t_markup_regex = r"<(\"[^\"]*\"|'[^']*'|[^'\">])*>"

# --- Placeholders substituted for the matches of the patterns above -----------
# The handle pattern also consumes the character in front of the '@', which
# is why its placeholder starts with a space.
t_handle_placeholder = ' {{HANDLE}}'
t_hashtag_placeholder = ' {{HASHTAG}}'
t_url_placeholder = '{{URL}}'
t_markup_placeholder = '{{MARKUP}}'
emoji_placeholder = '{{EMOJI}}'

# table = str.maketrans("", "")


def read_file(path):
    """Load a CSV and keep only the 'text' and 'Complaint' columns.

    Args:
        path: Anything accepted by ``pd.read_csv``.

    Returns:
        DataFrame with exactly the columns 'text' and 'Complaint', in that
        order.

    Raises:
        KeyError: If either column is missing from the CSV.
    """
    wanted_columns = ['text', 'Complaint']
    return pd.read_csv(path)[wanted_columns]

def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Args:
        tag: Penn Treebank tag such as 'NN', 'VBD' or 'JJ'.

    Returns:
        ``wn.ADJ``, ``wn.NOUN``, ``wn.ADV`` or ``wn.VERB`` for adjective, noun,
        adverb and verb tags respectively; ``None`` for any other tag.
    """
    if tag in ('JJ', 'JJR', 'JJS'):
        return wn.ADJ
    if tag in ('NN', 'NNS', 'NNP', 'NNPS'):
        return wn.NOUN
    if tag in ('RB', 'RBR', 'RBS'):
        return wn.ADV
    if tag in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
        return wn.VERB
    # Anything else has no WordNet counterpart.
    return None


def to_lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def fix_unicode(text):
    """Return ``text`` after passing it through ``ftfy.fix_text``."""
    repaired = ftfy.fix_text(text)
    return repaired


def replace_email(text):
    """Return ``text`` cleaned by the module-level e-mail scrubber.

    The scrubber (``emailDetector``) is configured with the e-mail detector
    only.
    """
    scrubbed = emailDetector.clean(text)
    return scrubbed


def remove_stop_words(text):
    """Drop every whitespace-separated token that appears in ``stop``.

    The comparison is an exact, case-sensitive match against the module-level
    stop word list. The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop)
    return ' '.join(kept_tokens)


def convert_emoji_to_text(text):
    """Placeholder step: currently returns ``text`` unchanged."""
    return text


def replace_user_name(text):
    """Substitute every match of ``t_handle_regex`` with ``t_handle_placeholder``.

    The pattern also matches the single character preceding the '@' (if any),
    so that character is replaced together with the handle.
    """
    return re.sub(pattern=t_handle_regex, repl=t_handle_placeholder, string=text)


def replace_hashtags(text):
    """Substitute every match of ``t_hashtag_regex`` with ``t_hashtag_placeholder``."""
    return re.sub(pattern=t_hashtag_regex, repl=t_hashtag_placeholder, string=text)


def replace_url(text):
    """Substitute every match of ``t_url_regex`` with ``t_url_placeholder``."""
    return re.sub(pattern=t_url_regex, repl=t_url_placeholder, string=text)


def replace_markup(text):
    """Substitute every match of ``t_markup_regex`` with ``t_markup_placeholder``."""
    return re.sub(pattern=t_markup_regex, repl=t_markup_placeholder, string=text)


def remove_punctuations(text):
    """Delete every character that is neither a word character nor whitespace.

    Underscores count as word characters and are therefore kept.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)


def replace_emoji_with_code(text):
    """Replace each emoji in ``text`` with its textual description.

    Delegates to ``demoji.replace_with_desc``. A previous extra call to
    ``demoji.replace`` was removed: its return value was discarded, so it only
    cost time and never influenced the result.

    Args:
        text: Input string.

    Returns:
        The value returned by ``demoji.replace_with_desc(text)``.
    """
    return demoji.replace_with_desc(text)


def get_stats(step, df):
    """Build one statistics row for the current state of ``df['text']``.

    All texts are joined with a single space and then split on the single
    space character, so consecutive spaces yield empty-string "words" that
    are included in both counts.

    Args:
        step: Label of the processing step the row describes.
        df: Object whose ``'text'`` entry is an iterable of strings.

    Returns:
        ``[step, total_word_count, unique_word_count]``.
    """
    words = " ".join(df['text']).split(' ')
    return [step, len(words), len(set(words))]

def lemmatize(text):
    """Lemmatize every space-separated token of ``text`` using its POS tag.

    Tokens are obtained with ``text.split(' ')``, tagged with
    ``nltk.pos_tag`` and lemmatized with the module-level WordNet lemmatizer.
    Tokens whose tag has no WordNet equivalent are lemmatized as nouns.

    Returns:
        The lemmas joined with single spaces.
    """
    words = text.split(' ')
    lemmatized = []
    for word, (_, penn_tag) in zip(words, nltk.pos_tag(words)):
        # Fall back to the noun POS when the tag cannot be mapped.
        wordnet_tag = penn_to_wn(penn_tag) or 'n'
        lemmatized.append(wnl.lemmatize(word, wordnet_tag))
    return ' '.join(lemmatized)

def fix_contractions(text):
    """Return ``text`` processed by ``contractions.fix`` with slang handling off."""
    expanded = contractions.fix(text, slang=False)
    return expanded

def trim_excessive_space(text):
    """Collapse every whitespace run to one space and strip both ends."""
    pieces = text.split()
    return " ".join(pieces)

# TODO: https://www.geeksforgeeks.org/spelling-checker-in-python/
def spell_correct(text):
    """Return a spell-corrected copy of ``text`` as a plain string.

    ``TextBlob.correct()`` yields a ``TextBlob`` object, not a ``str``. The
    result is converted explicitly so that callers can keep using string-only
    operations (``re.sub``, ``str.join``, ...) on the returned value.

    Args:
        text: Input string.

    Returns:
        str: The corrected text.
    """
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

In [2]:
"""
Preprocesses the data.
"""
def process_data(df, **kwargs):
    """Run the configurable text preprocessing pipeline over ``df['text']``.

    The steps always run in the fixed order listed below; each one is enabled
    by passing the corresponding keyword argument with a truthy value. After
    every enabled step a ``get_stats`` row is recorded; skipped steps get a
    row filled with ``'xxxxxx'``.

    Args:
        df: DataFrame with a ``'text'`` column of strings. The caller's frame
            is not modified; a recoded copy is worked on.
        **kwargs: Step switches, all off by default:
            ``handle_retweet`` (drop rows whose text starts with 'RT'),
            ``handle_case``, ``handle_contractions``, ``handle_lemmatization``,
            ``handle_unicode``, ``handle_emoji``, ``handle_stopwords``,
            ``handle_email``, ``handle_username``, ``handle_hashtags``,
            ``handle_url``, ``handle_markup``, ``handle_spelling``,
            ``handle_punctuation``; plus ``print_stats`` to print the
            statistics table.

    Returns:
        The processed DataFrame, with the untouched input text preserved in
        ``'orig_text'`` and whitespace in ``'text'`` normalised.
    """
    skipped = 'xxxxxx'
    stats = [['Step', 'Total words', 'Unique words']]
    stats.append(get_stats('Start', df))

    # NOTE(review): replace() is applied to every column, so a row whose whole
    # text is exactly 'Yes' or 'No' is recoded too - confirm this is acceptable.
    df = df.replace(label_codes)
    df['orig_text'] = df['text']

    if kwargs.get('handle_retweet'):
        # .copy() detaches the filtered rows from the parent frame so the
        # column assignments below do not raise SettingWithCopyWarning.
        df = df[~df['text'].str.startswith('RT')].copy()
        stats.append(get_stats('Remove Retweet', df))
    else:
        stats.append(['Remove Retweet', skipped, skipped])

    # (switch name, label shown in the stats table, per-text transform),
    # listed in execution order.
    text_steps = [
        ('handle_case', 'Lower', to_lower_case),
        # This step used to be reported under the label 'Remove Retweet'.
        ('handle_contractions', 'Fix Contractions', fix_contractions),
        ('handle_lemmatization', 'Lemmatize', lemmatize),
        ('handle_unicode', 'Unicode Fix', fix_unicode),
        ('handle_emoji', 'Replace emoji', replace_emoji_with_code),
        ('handle_stopwords', 'Stop words', remove_stop_words),
        ('handle_email', 'Email Replace', replace_email),
        ('handle_username', 'UserName replace', replace_user_name),
        ('handle_hashtags', 'HashTags Replace', replace_hashtags),
        ('handle_url', 'URL Replace', replace_url),
        ('handle_markup', 'MARKUP Replace', replace_markup),
        # TextBlob.correct() yields a TextBlob; the later steps and
        # get_stats need plain strings, so coerce the result here.
        ('handle_spelling', 'Spell Correct',
         lambda text: str(spell_correct(text))),
        ('handle_punctuation', 'Remove punctuation', remove_punctuations),
    ]
    for switch, label, transform in text_steps:
        if kwargs.get(switch):
            df['text'] = df['text'].apply(transform)
            stats.append(get_stats(label, df))
        else:
            stats.append([label, skipped, skipped])

    # Whitespace normalisation always runs, regardless of the switches.
    df['text'] = df['text'].apply(trim_excessive_space)
    if kwargs.get('print_stats', False):
        print(tabulate(stats))
    return df


In [None]:
from datetime import datetime
import os
import pickle
import jsonpickle
from collections import OrderedDict
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, fbeta_score


# Base directory under which experiments are stored (see save_experiment /
# load_model). Set the BRAND_ML_ROOT environment variable to point somewhere
# else; without it the original location is used.
root = os.environ.get(
    'BRAND_ML_ROOT',
    '/home/gaurav.gupta/projects/PoCs/brandMention/brand_ml')

def get_model_info(model, model_name):
    """Describe one model as a small dictionary.

    Args:
        model: Object exposing ``get_params()``.
        model_name: Key under which the parameters are stored.

    Returns:
        dict with ``'name'`` set to ``str(model.__class__)`` and
        ``model_name`` set to ``model.get_params()``. If ``model_name`` is
        ``'name'`` the parameters take precedence.
    """
    description = {'name': str(model.__class__)}
    description[model_name] = model.get_params()
    return description


def get_experiment_info(model_dict, tags):
    """Collect metadata describing an experiment.

    Args:
        model_dict: Mapping of model name to model object.
        tags: Stored as-is under ``'tags'``.

    Returns:
        dict with ``'models_info'`` (one ``get_model_info`` entry per model,
        in mapping order), ``'tags'`` and ``'timestamp'`` (ISO-formatted
        ``datetime.now()``).
    """
    models_info = []
    for model_name, model in model_dict.items():
        models_info.append(get_model_info(model, model_name))
    return {
        'models_info': models_info,
        'tags': tags,
        'timestamp': datetime.now().isoformat(),
    }


# UTIL to serialize classifier to disk along with important info.
def save_experiment(experiment_name, model_dict, tags, results=None, preprocess_details=None):
    """Persist the models of an experiment together with a JSON description.

    Creates ``<root>/experiments/<experiment_name>/``, writes one
    ``<name>.pickle`` per entry of ``model_dict`` and an ``artifacts.json``
    holding the experiment metadata.

    Args:
        experiment_name: Name of the experiment; also the folder name.
        model_dict: Mapping of model name to model object.
        tags: Stored in the metadata under ``'tags'``.
        results: Stored under ``'results'``; an empty dict when omitted.
        preprocess_details: Stored under ``'preprocess_config'``; an empty
            dict when omitted.

    Raises:
        FileExistsError: If the experiment folder already exists
            (``os.makedirs`` is called without ``exist_ok``), so an existing
            experiment is never overwritten.
    """
    folder = os.path.join(root, 'experiments', experiment_name)
    readme_file_path = os.path.join(folder, 'artifacts.json')

    experiment_info = get_experiment_info(model_dict, tags)
    experiment_info['name'] = experiment_name
    # None defaults avoid sharing one mutable dict between calls.
    experiment_info['results'] = {} if results is None else results
    experiment_info['preprocess_config'] = (
        {} if preprocess_details is None else preprocess_details)

    # Write models.
    os.makedirs(folder)
    for name, model in model_dict.items():
        model_path = os.path.join(folder, f'{name}.pickle')
        # Context manager guarantees the file is flushed and closed.
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    frozen = jsonpickle.encode(experiment_info, indent=4)
    with open(readme_file_path, "w") as outfile:
        outfile.write(frozen)
    return

In [None]:
# UTIL to load classifier from disk
def load_model(experiment_name):
    """Load the pickled vectorizer and classifier of an experiment.

    Reads ``vectorizer.pickle`` and ``classifier.pickle`` from
    ``<root>/experiments/<experiment_name>/``.

    Args:
        experiment_name: Name of the experiment folder.

    Returns:
        dict with the keys ``'classifier'`` and ``'vectorizer'``.

    Raises:
        FileNotFoundError: If either pickle file is missing.
    """
    experiment_path = os.path.join(root, 'experiments', experiment_name)

    # WARNING: pickle.load can execute arbitrary code - only load experiment
    # folders that come from a trusted source.
    with open(os.path.join(experiment_path, 'vectorizer.pickle'), 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open(os.path.join(experiment_path, 'classifier.pickle'), 'rb') as classifier_file:
        clf = pickle.load(classifier_file)

    return {'classifier': clf, 'vectorizer': vectorizer}


# LOAD THE EXPERIMENT ARTIFACTS
# model_dict = load_model(experiment_name)
# sample_vector = model_dict['vectorizer'].transform(['This raise complaint'])
# sample_predic = model_dict['classifier'].predict(sample_vector)
# sample_predic
