In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,recall_score,precision_score, fbeta_score
import xgboost as xgb
import nltk
from tabulate import tabulate
import os
import matplotlib.pyplot as plt

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gaurav.gupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
%run ./_preprocess.ipynb

ds_root = '/home/gaurav.gupta/projects/PoCs/brandMention/brand_datasets/'
r_path = os.path.join(ds_root, 'ds_complaints', 'panasonic_random_sample_predicted.csv')
r_df = read_file(r_path)

m_path = os.path.join(ds_root, 'ds_complaints', 'panasonic_v1_g.csv')
m_df = read_file(m_path)

raw_df = pd.concat([r_df, m_df])
raw_df.reset_index(drop=True, inplace=True)
options = {
    'handle_unicode': True,
    'handle_emoji': True,
    'handle_email': True,
    'handle_username': True,
    'handle_hashtags': True,
    'handle_url': True,
    'handle_markup': True,
    'handle_retweet': True,
    'handle_case': True,
    'handle_lemmatization': True,
    'handle_stopwords': True,
    'handle_punctuation': True,
    'handle_contractions': True,
    'print_stats': True
}
df = process_data(raw_df, **options)

# replace placeholders.
df.text = df.text.str.replace('HASHTAG', '', )
df.text = df.text.str.replace('HANDLE', '')
df.text = df.text.str.replace('URL', '')
df.text = df.text.str.replace('rt', '')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gaurav.gupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


------------------  -----------  ------------
Step                Total words  Unique words
Start               77684        20305
Remove Retweet      66365        18473
Lower               66365        16581
Remove Retweet      66649        16532
Lemmatize           66649        15779
Unicode Fix         66656        15752
Replace emoji       67043        15848
Stop words          50235        15273
Email Replace       50235        15266
UserName replace    50921        14248
HashTags Replace    64615        10461
URL Replace         64615        8398
MARKUP Replace      64615        8397
Remove punctuation  64615        6668
------------------  -----------  ------------


In [3]:
from sklearn.metrics import plot_confusion_matrix
from collections import OrderedDict


def test_model(model):
    pred_test = model.predict(x_test)
    model_acc_score = accuracy_score(y_test, pred_test)
    model_f1_score = f1_score(y_test, pred_test)
    model_prec_score = precision_score(y_test, pred_test)
    model_recall_score = recall_score(y_test, pred_test)
    model_f2_score = fbeta_score(y_test, pred_test, beta=2)
    results = []
    results.append(('Metric',   'Score'))
    results.append(('Accuracy', model_acc_score*100))
    results.append(('F1 Score', model_f1_score*100))
    results.append(('F-Beta Score', model_f2_score*100))
    results.append(('Precision Score', model_prec_score*100))
    results.append(('Recall Score', model_recall_score*100))
    print(tabulate(results))
    return pred_test


def plot_important_features(clf, tfidf, axs):
    # Display Important feature
    n = 20
    f_imp = [list(i) for i in list(
        zip(clf.feature_importances_, tfidf.get_feature_names()))]
    f_imp_df = pd.DataFrame(f_imp, columns=['Importance', 'Feature'])
    top_n_features = f_imp_df.sort_values(
        by=['Importance'], ascending=False).head(n).sort_values(by=['Importance'])
    axs[1].barh(top_n_features['Feature'], top_n_features['Importance'])
    axs[1].set_title('MOST Important feature as per RF')
    return


In [27]:

from sklearn.model_selection import StratifiedKFold
labelEncoder = OrderedDict({'Not-A-Complaint': 0, 'Is-A-Complaint': 1, })
display_labels = list(labelEncoder.keys())
labels = list(labelEncoder.values())

X = df['text']
y = df['Complaint']
tfidf = TfidfVectorizer(lowercase=False, max_df=.8, min_df=0.01, ngram_range=(1, 2))
vectors = tfidf.fit_transform(X).toarray()
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['Complaint'], random_state=42, test_size=0.30)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


for train_index, test_index in skf.split(x_train, y_train):

    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    X_train_fold, X_test_fold = x_train[train_index], x_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    print(vectors.shape)

    model_x = xgb.XGBClassifier(use_label_encoder=False,                            
                                # helps in avoding the overfitting problem and increase in learning rate affects TP and FN
                                learning_rate=0.40,
                                n_estimators=150,  # number of trees to be built and increase in n_estimators affects TP and FP
                                max_depth=4,  # represents the depth of each tree and increase in max_depth affects TN and FN
                                subsample=0.8,  # for each tree the % of rows taken to build the tree and increase in subsample affects TN and FN
                                min_child_weight=.9,  # Defines the minimum sum of weights of all observations required in a child and increase in min_child_weight affects TN and FN
                                reg_alpha=0.6,  # penalizes the features which increase cost function and high value of reg_alpha affects TN and FN rate
                                # encourages the weights to be small and increase in reg_lambda affects TN and FN
                                reg_lambda=0.3,
                                gamma=0.01,
                                seed=42).fit(x_train, y_train)

    pred_test = test_model(model_x)

test_vector = tfidf.transform(x_test).toarray()
pred_test = test_model(model_x)
fig, axs = plt.subplots(1, 2, figsize=(25, 10), gridspec_kw={'width_ratios': [1, 4]})
plot_confusion_matrix(model_x, x_test, y_test, labels=labels, display_labels=display_labels, ax=axs[0])
plot_important_features(model_x, tfidf, axs)
plt.show()


TRAIN: (1524,) TEST: (170,)


KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([   2,    3,    4,   11,   12,\n            ...\n            1675, 1680, 1681, 1683, 1692],\n           dtype='int64', length=863). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [31]:
import shutil
%run ./_preprocess.ipynb

root = '/home/gaurav.gupta/projects/PoCs/brandMention/brand_ml'
model_dict = OrderedDict({'classifier': model_x, 'vectorizer': tfidf})
# experiment_name = "<Algo_name>_<MetricName>_<NGrams>_<MetricValue>_<date>"
experiment_name = "Xgb_FBeta_Biigram_8538_Jan04"
tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
experiment_results = {
    "tn": float(tn), "fp": float(fp), "fn": float(fn), "tp": float(tp),
    "model_acc_score": float(accuracy_score(y_test, pred_test)),
    "model_f1_score": float(f1_score(y_test, pred_test)),
    "model_prec_score": float(precision_score(y_test, pred_test)),
    "model_recall_score": float(recall_score(y_test, pred_test)),
    "model_f2_score": float(fbeta_score(y_test, pred_test, beta=2)),
}

notes = {'details': ['Uni Model', 'XgBoost']}

# SAVE THE EXPERIMENT ARTIFACTS
shutil.rmtree(os.path.join(root, 'experiments',
              experiment_name), ignore_errors=True)
save_experiment(experiment_name, model_dict, tags=notes,
                results=experiment_results, preprocess_details=options)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gaurav.gupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
# model_dict = load_model(experiment_name)
# sample_vector = model_dict['vectorizer'].transform(['This raise complaint'])
# sample_predic = model_dict['classifier'].predict(sample_vector)
# sample_predic
