In [0]:
!pip install zeugma

Collecting zeugma
  Downloading https://files.pythonhosted.org/packages/37/06/987f0c591e4f46fc31446d541d17d63981394f099c455955680dfe7bd980/zeugma-0.46.tar.gz
Building wheels for collected packages: zeugma
  Building wheel for zeugma (setup.py) ... [?25l[?25hdone
  Created wheel for zeugma: filename=zeugma-0.46-cp36-none-any.whl size=8612 sha256=49def7ae26ad658c4c79be9d58f4b0e74277f953db6e93f1a9a7422ec69bbf03
  Stored in directory: /root/.cache/pip/wheels/49/ce/d3/22bc15de9112558b220d9dba3bfcd7d9ad0d8cc4d44d3e7813
Successfully built zeugma
Installing collected packages: zeugma
Successfully installed zeugma-0.46


In [0]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from zeugma.embeddings import EmbeddingTransformer

In [0]:
nltk.download('stopwords')
nltk.download('punkt')
def preprocess(data):
    '''                                                                         
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution   
    '''
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°\
£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = clean_special_chars(str(data), punct)
    data = data.split()
    stop_words = set(stopwords.words('english'))
    cleaned = [word for word in data if word not in stop_words]
    return " ".join(cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
train_file = "/content/drive/My Drive/MachineLearning/toxic/train.csv"
df = pd.read_csv(train_file)


In [0]:
#Clean data
df['clean_text'] = df['comment_text'].apply(lambda x: preprocess(x))

In [0]:
#Split into training and test data
train, test = train_test_split(df, test_size=0.2)
x_train = train['clean_text']
x_test = test['clean_text']
y_train = np.where(train['target'] >= 0.5, 1, 0)
y_test = np.where(test['target'] >= 0.5, 1, 0)
#y_cat_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
#y_cat_test = test[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

In [0]:
y_cat_train.head()

Unnamed: 0,target,severe_toxicity,obscene,identity_attack,insult,threat
1491557,0.0,0.0,0.0,0.0,0.0,0.0
1744262,0.0,0.0,0.0,0.0,0.0,0.0
213320,0.0,0.0,0.0,0.0,0.0,0.0
751548,0.0,0.0,0.0,0.0,0.0,0.0
344134,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
#Encode training data with glove vectors
glove = EmbeddingTransformer('glove')
x_train = glove.transform(x_train)

Using TensorFlow backend.




  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
#Fit LR model
target_model = LogisticRegression(C=5, random_state=42, solver='sag', max_iter=1000, n_jobs=-1)
target_model.fit(x_train, y_train)


LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='sag', tol=0.0001, verbose=0, warm_start=False)

In [0]:
#Validate model on test data
x_test_glove = glove.transform(x_test)
predictions = target_model.predict_proba(x_test_glove)[:,1]


In [0]:
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': predictions
})

In [0]:
submission

Unnamed: 0,id,prediction
380532,708537,0
1089178,5447745,0
955143,5287055,0
1099517,5460127,0
1068116,5421929,0
...,...,...
1759729,6278512,0
15882,261688,0
161388,439259,0
398668,730690,0


In [0]:
# From baseline kernel
from sklearn import metrics
def calculate_overall_auc(df, model_name):
    true_labels = df[TOXICITY_COLUMN]>0.5
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)



SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, model_name):
    subgroup_examples = df[df[subgroup]>0.5]
    return compute_auc((subgroup_examples[label]>0.5), subgroup_examples[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[(df[subgroup]>0.5) & (df[label]<=0.5)]
    non_subgroup_positive_examples = df[(df[subgroup]<=0.5) & (df[label]>0.5)]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label]>0.5, examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[(df[subgroup]>0.5) & (df[label]>0.5)]
    non_subgroup_negative_examples = df[(df[subgroup]<=0.5) & (df[label]<=0.5)]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label]>0.5, examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]>0.5])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)


In [0]:
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
MODEL_NAME = 'model1'
test[MODEL_NAME]= submission["prediction"]
TOXICITY_COLUMN = 'target'
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, MODEL_NAME, 'target')
bias_metrics_df
get_final_metric(bias_metrics_df, calculate_overall_auc(test, MODEL_NAME))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


0.7612232051135233