## 0.84+ score by ensemble of simple TF-Idf and Ridge regression

### Ensemble of TfIdf - Ridge models using data from 
- Toxic competition
- Toxic CLEANED competition
- Ruddit toxic data
- Toxic multilingual competition

### Analysis of bad predictions


#### Some cool starter notebooks:
- https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768
- https://www.kaggle.com/steubk/jrsotc-ridgeregression-ensemble-of-3

# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

import re 
import scipy
from scipy import sparse
import gc 

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100


In [None]:
def timer(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    The wrapped function's arguments and return value are passed through
    unchanged; only a ``Time taken is ... mins`` line is printed after the call.
    """
    import functools  # local import keeps this cell self-contained

    # wraps() preserves func.__name__ / __doc__ so decorated functions stay
    # recognisable in tracebacks and help().
    @functools.wraps(func)
    def wrapper(*args, **kws):
        # perf_counter is monotonic, so the interval cannot go negative if the
        # system clock is adjusted while func runs.
        st = time.perf_counter()
        res = func(*args, **kws)
        et = time.perf_counter()
        tt = (et-st)/60
        print(f'Time taken is {tt:.2f} mins')
        return res
    return wrapper


# Training data 

## Convert the label to SUM of all toxic labels (This might help with maintaining toxicity order of comments)

In [None]:
# Test split of the original Toxic competition; its labels were published
# separately and are joined on `id` so the rows can be used as extra training data.
df_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
df_test_l = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")
print(df_test.shape)

# In test_labels.csv a value of -1 marks a comment that was not used for
# scoring, i.e. it has no real label. Mapping -1 to 0 would present those
# comments to the model as "clean" examples, so they are dropped instead.
df_test_l = df_test_l[(df_test_l.drop(columns='id') != -1).all(axis=1)]
df_test = pd.merge(df_test, df_test_l, how="inner", on = "id")
df_test.shape

In [None]:
# Toxic competition training comments, extended with the labelled test rows.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)
df = pd.concat((df, df_test))
print(df.shape)
del df_test

# Eyeball five random positive comments for each of the six labels.
for col in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    print(f'****** {col} *******')
    display(df[df[col] == 1][['comment_text', col]].sample(5))

In [None]:

# Severe toxicity counts twice in the target.
df['severe_toxic'] = 2 * df['severe_toxic']

# Target = sum of the six label columns, divided by the largest sum observed.
df['y'] = df[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].sum(axis=1).astype(int)
df['y'] = df['y'].div(df['y'].max())

# Keep only the comment text (renamed to `text`) and the target.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.sample(5)

In [None]:
# How many comments fall on each value of the normalised target.
df['y'].value_counts()

# Load validation data & filter for overlapping sentences

In [None]:
# Validation data: one row per annotated pair (worker, less_toxic, more_toxic)

df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)


# Find cases already present in toxic data.
# Each left merge adds a `text` column that is non-null only where that side
# of the pair matches a training comment exactly; after the second merge the
# two columns are named `text_x` (less_toxic match) and `text_y` (more_toxic match).
# NOTE(review): a left merge emits one output row per matching row of `df`, so a
# comment that occurs several times in `df` repeats the pair - confirm this is acceptable.

df_val = pd.merge(df_val, df.loc[:,['text']], 
                  left_on = 'less_toxic', 
                  right_on = 'text', how='left')

df_val = pd.merge(df_val, df.loc[:,['text']], 
                  left_on = 'more_toxic', 
                  right_on = 'text', how='left')

# NOTE(review): this comment originally read "Removing those cases", but the mask
# below KEEPS the pairs where at least one side matched a training comment and
# drops the pairs with no match. If overlapping pairs are meant to be removed,
# the condition would be `text_x.isna() & text_y.isna()` - confirm the intent.
df_val = df_val[(~df_val.text_x.isna()) | (~df_val.text_y.isna())][['worker', 'less_toxic', 'more_toxic']]
df_val.shape

## Create `n_folds` resampled versions of the TOXIC data

In [None]:
# Number of resampled training subsets ("folds") written per dataset;
# one model is fitted on each subset.
n_folds = 2

# Fraction of rows sampled into each subset (of the toxic rows where the data
# is split into toxic / clean, of all rows otherwise).
frac_1 = 0.7
# Clean (y == 0) rows drawn per subset = (#toxic rows) * frac_1 * frac_1_factor.
frac_1_factor = 1.3


In [None]:
@timer
def create_folds(data=None, prefix='df', out_dir='/kaggle/working'):
    """Write ``n_folds`` down-sampled training subsets to CSV.

    Each subset holds a ``frac_1`` sample of the toxic rows (``y > 0``) plus
    ``len(toxic) * frac_1 * frac_1_factor`` clean rows (``y == 0``), shuffled.
    Fold ``k`` uses the seed ``10 * (k + 1)`` for every sampling step, so the
    files are reproducible.

    Parameters
    ----------
    data : DataFrame with a ``y`` column, optional
        Source frame; defaults to the module-level ``df``.
    prefix : str
        File name prefix; files are named ``{prefix}_fld{k}.csv``.
    out_dir : str
        Directory the CSV files are written to.

    Reads the module-level ``n_folds``, ``frac_1`` and ``frac_1_factor``.
    """
    source = df if data is None else data

    # The toxic / clean split does not depend on the fold, so do it once.
    toxic_rows = source[source.y > 0]
    clean_rows = source[source.y == 0]

    # sample(n=...) raises when asked for more rows than exist; cap the request.
    n_clean = min(int(len(toxic_rows) * frac_1 * frac_1_factor), len(clean_rows))

    for fld in range(n_folds):
        seed = 10 * (fld + 1)
        print(f'Fold: {fld}')
        tmp_df = pd.concat([toxic_rows.sample(frac=frac_1, random_state=seed),
                            clean_rows.sample(n=n_clean, random_state=seed)],
                           axis=0).sample(frac=1, random_state=seed)

        tmp_df.to_csv(f'{out_dir}/{prefix}_fld{fld}.csv', index=False)
        print(tmp_df.shape)
        print(tmp_df['y'].value_counts())


create_folds()

# Create `n_folds` versions of __clean__ TOXIC data

In [None]:
@timer
def clean(data, col):
    """Normalise noisy comment text in ``data[col]``.

    The column is overwritten in place and the same DataFrame is returned.

    Every pattern is passed with an explicit ``regex=`` flag: since pandas 2.0
    ``Series.str.replace`` treats the pattern as a literal string by default,
    which would silently turn all the regex steps below into no-ops.

    Parameters
    ----------
    data : DataFrame
        Frame holding the text column.
    col : str
        Name of the string column to clean.

    Returns
    -------
    DataFrame
        ``data`` itself, with ``data[col]`` cleaned.
    """
    text = data[col]

    # Clean some punctuations: put spaces around every newline
    text = text.str.replace('\n', ' \n ', regex=False)
    # Remove ip address (three or more dot-separated groups of digits)
    text = text.str.replace(r'(([0-9]+\.){2,}[0-9]+)', ' ', regex=True)

    # Split words glued together by one of / ! ? .  ("how/are" -> "how / are")
    text = text.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3', regex=True)
    # Replace repeating characters more than 3 times to length of 3
    text = text.str.replace(r'([*!?\'])\1\1{2,}', r'\1\1\1', regex=True)
    # patterns with repeating characters:
    # 3+ repeats of a letter at the end of a word -> 2 ...
    text = text.str.replace(r'([a-zA-Z])\1{2,}\b', r'\1\1', regex=True)
    # ... and 4+ repeats inside a word -> 3
    text = text.str.replace(r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1', regex=True)
    # Collapse runs of spaces and trim both ends
    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()
    # Add space around repeating characters
    text = text.str.replace(r'([*!?\']+)', r' \1 ', regex=True)

    data[col] = text
    return data

In [None]:
# Smoke-test clean() on a handful of hand-written noisy strings:
# show the raw frame first, then the cleaned result.
test_clean_df = pd.DataFrame(
    [
        "heyy\n\nkkdsfj",
        "hi   how/are/you ???",
        "hey?????",
        "hey????? 18.98.333.20 18.98.",
        "noooo!!!!!!!!!   comeone !! ",
        "cooooooooool     brooooooooooo  coool brooo",
        "naaaahhhhhhh",
    ],
    columns=["text"],
)
display(test_clean_df)
clean(test_clean_df, 'text')

In [None]:
# Clean the Toxic training text; clean() rewrites the column and returns the frame.
df = clean(df,'text')

In [None]:

# Same down-sampling as for the raw text, applied to the cleaned frame:
# a frac_1 sample of the toxic rows plus a proportional number of clean rows,
# shuffled, one CSV per fold.
for fld in range(n_folds):
    seed = 10 * (fld + 1)
    print(f'Fold: {fld}')
    tmp_df = pd.concat(
        [
            df[df.y > 0].sample(frac=frac_1, random_state=seed),
            df[df.y == 0].sample(n=int(len(df[df.y > 0]) * frac_1 * frac_1_factor),
                                 random_state=seed),
        ],
        axis=0,
    ).sample(frac=1, random_state=seed)

    tmp_df.to_csv(f'/kaggle/working/df_clean_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

In [None]:
# The folds are on disk now; drop the in-memory frames and reclaim the memory.
del df,tmp_df
gc.collect()

## Read toxic Ruddit data

In [None]:
# Ruddit: comments with a continuous offensiveness score.
df_ = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(df_.shape)

# Keep the text and the score under the common column names `text` / `y`.
df_ = df_.rename(columns={'txt': 'text', 'offensiveness_score': 'y'})[['text', 'y']]

# Min-max scale the score to [0, 1], then plot its distribution.
df_['y'] = df_['y'].sub(df_['y'].min()).div(df_['y'].max() - df_['y'].min())
df_['y'].hist()

# Create `n_folds` versions of RUDDIT data

In [None]:

# Every Ruddit fold is a plain frac_1 random sample, seeded per fold.
for fld in range(n_folds):
    seed = 10 * (fld + 1)
    print(f'Fold: {fld}')
    tmp_df = df_.sample(frac=frac_1, random_state=seed)
    tmp_df.to_csv(f'/kaggle/working/df2_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

In [None]:
# Ruddit folds are written; release the frames.
del tmp_df, df_; 
gc.collect()

## Read Jigsaw multilingual data CLEANED

In [None]:
# Training file of the multilingual Toxic competition; the text is cleaned first.
dfm = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
print(dfm.shape)

dfm = clean(dfm, 'comment_text')

# Eyeball five random positive comments for each of the six labels.
for col in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    print(f'****** {col} *******')
    display(dfm[dfm[col] == 1][['comment_text', col]].sample(5))


# Severe toxicity counts twice in the target.
dfm['severe_toxic'] = 2 * dfm['severe_toxic']
# Target = sum of the six label columns, divided by the largest sum observed.
dfm['y'] = dfm[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].sum(axis=1).astype(int)
dfm['y'] = dfm['y'].div(dfm['y'].max())

# Keep only the comment text (renamed to `text`) and the target.
dfm = dfm.rename(columns={'comment_text': 'text'})[['text', 'y']]
dfm['y'].value_counts()

# Create `n_folds` versions of Multilingual data

In [None]:

# Down-sample the multilingual data like the Toxic data: a frac_1 sample of
# the toxic rows plus a proportional number of clean rows, shuffled.
for fld in range(n_folds):
    seed = 10 * (fld + 1)
    print(f'Fold: {fld}')
    tmp_df = pd.concat(
        [
            dfm[dfm.y > 0].sample(frac=frac_1, random_state=seed),
            dfm[dfm.y == 0].sample(n=int(len(dfm[dfm.y > 0]) * frac_1 * frac_1_factor),
                                   random_state=seed),
        ],
        axis=0,
    ).sample(frac=1, random_state=seed)

    tmp_df.to_csv(f'/kaggle/working/dfm_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

## Load Test data  


In [None]:
# Validation data 

# df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# df_val.shape

### Remove contradicting cases from validation data
- cases where contradictory evaluation is in minority (< 50%)

In [None]:
# gp1=df_val.copy()
# gp1['pair'] = gp1.apply(lambda x:" ".join(sorted((x['less_toxic'],
#                                                   x['more_toxic']))),axis=1)
# gp1['pair_hash'] = gp1.pair.apply(lambda x: str(abs(hash(x)) % (10 ** 8)))
# del gp1['pair']
# print(len(gp1), len(gp1.pair_hash.drop_duplicates()))

# gp1['cnt']=gp1.groupby(['pair_hash', 
#                         'less_toxic',
#                         'more_toxic']).transform(lambda x: x.count())
# print(gp1[['pair_hash', 'less_toxic', 'more_toxic','cnt']].drop_duplicates().cnt.value_counts())

# #gp1.head(10)
# majority_cases = gp1.groupby('pair_hash')\
#                     .agg({'cnt':['count','max']})\
#                     .reset_index()\
#                     .set_axis(['pair_hash','count','max'], 
#                               axis='columns')\
#                     .assign(pct=lambda x: x['max']/x['count'])\
#                     .query('pct>=0.5')\
#                     .rename(columns={'max':'cnt'})\
#                     [['pair_hash','cnt']]

# df_val = pd.merge(gp1,majority_cases,
#                  how="inner",
#                  on = ['pair_hash','cnt'])
# #gp1.groupby('pair_hash').apply(lambda x: x[['less_toxic','more_toxic','cnt']].sort_values('cnt', ascending=False))
# df_val.shape

In [None]:
# Test data: the comments that have to be scored for the submission file

df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub.shape

# Create Sklearn Pipeline with 
-  TFIDF - Take 'char_wb' as analyzer to capture subwords well
-  Ridge - Ridge is a simple regression algorithm that will reduce overfitting 

In [None]:

class LengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one feature: the scaled text length.

    Each text ``x`` becomes ``(len(x) - 360) / 550``.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return an ``(n_texts, 1)`` sparse column of scaled lengths."""
        lengths = np.fromiter((len(x) for x in X), dtype=float)
        # reshape keeps the (n, 1) shape even when X is empty
        return sparse.csr_matrix(((lengths - 360) / 550).reshape(-1, 1))

    def get_feature_names(self):
        """Feature name, for the older scikit-learn API."""
        return ["lngth"]

    def get_feature_names_out(self, input_features=None):
        """Feature name, for scikit-learn >= 1.0 (called by FeatureUnion)."""
        return np.asarray(self.get_feature_names(), dtype=object)

class LengthUpperTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one 0/1 feature: "mostly upper case".

    The flag is 1 when more than 75% of a text's characters are upper case.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return an ``(n_texts, 1)`` sparse column of 0/1 flags."""
        flags = [
            # An empty text has no upper-case share; flag it 0 instead of
            # dividing by zero.
            int(bool(x) and sum(1 for ch in x if ch.isupper()) / len(x) > 0.75)
            for x in X
        ]
        # reshape keeps the (n, 1) shape even when X is empty
        return sparse.csr_matrix(np.array(flags, dtype=int).reshape(-1, 1))

    def get_feature_names(self):
        """Feature name, for the older scikit-learn API."""
        return ["lngth_uppercase"]

    def get_feature_names_out(self, input_features=None):
        """Feature name, for scikit-learn >= 1.0 (called by FeatureUnion)."""
        return np.asarray(self.get_feature_names(), dtype=object)

### Does the share of uppercase characters have an effect on toxicity?


In [None]:

# df_val['upper_1'] = np.array(LengthUpperTransformer().transform(df_val['less_toxic']).todense()).reshape(-1,1)
# df_val['upper_2'] = np.array(LengthUpperTransformer().transform(df_val['more_toxic']).todense()).reshape(-1,1)

# print(df_val['upper_1'].mean(), df_val['upper_1'].std())
# print(df_val['upper_2'].mean(), df_val['upper_2'].std())

# df_val['upper_1'].hist(bins=100)
# df_val['upper_2'].hist(bins=100)

## Train pipeline

- Load folds data
- train pipeline
- Predict on validation data
- Predict on test data

# Training function

In [None]:
@timer
def train_pipeline(pipeline, data_path_name, n_folds, clean_prm = False):
    """Fit ``pipeline`` on every saved fold and score validation and test text.

    For fold ``k`` the file ``/kaggle/working/{data_path_name}_fld{k}.csv``
    (columns ``text`` and ``y``) is loaded, the pipeline is refitted on it, the
    50 largest feature weights are displayed, and predictions are stored for
    both sides of every validation pair and for the comments to score.

    Parameters
    ----------
    pipeline : sklearn Pipeline
        Must have a ``"features"`` step and a ``"clf"`` step exposing ``coef_``.
    data_path_name : str
        Prefix of the fold files.
    n_folds : int
        Number of fold files to train on.
    clean_prm : bool
        When True, validation and test text go through ``clean()`` before
        prediction (use this for models trained on cleaned text).

    Returns
    -------
    tuple of three arrays
        Predictions for ``less_toxic``, ``more_toxic`` (both
        ``(len(df_val), n_folds)``) and the test comments
        (``(len(df_sub), n_folds)``); column ``k`` belongs to fold ``k``.

    Reads the module-level ``df_val`` and ``df_sub`` without modifying them.
    """
    val_preds_arr1_tmp = np.zeros((df_val.shape[0], n_folds))
    val_preds_arr2_tmp = np.zeros((df_val.shape[0], n_folds))
    test_preds_arr_tmp = np.zeros((df_sub.shape[0], n_folds))

    # Prepare the prediction inputs once. clean() rewrites its input, so it is
    # given copies: the shared frames keep their raw text for later
    # clean_prm=False runs, and the text is cleaned exactly once instead of
    # once more on every fold.
    if clean_prm:
        val_text = df_val[['less_toxic', 'more_toxic']].copy()
        val_text = clean(val_text, 'less_toxic')
        val_text = clean(val_text, 'more_toxic')
        test_text = clean(df_sub[['text']].copy(), 'text')['text']
    else:
        val_text = df_val
        test_text = df_sub['text']
    less_toxic_text = val_text['less_toxic']
    more_toxic_text = val_text['more_toxic']

    for fld in range(n_folds):
        print("\n\n")
        print(f' ****************************** FOLD: {fld} ******************************')
        fold_df = pd.read_csv(f'/kaggle/working/{data_path_name}_fld{fld}.csv')
        print(fold_df.shape)

        print("\nTrain:")
        # Train the pipeline
        pipeline.fit(fold_df['text'], fold_df['y'])

        # What are the important features for toxicity
        feature_step = pipeline['features']
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        if hasattr(feature_step, 'get_feature_names_out'):
            feature_names = feature_step.get_feature_names_out()
        else:
            feature_names = feature_step.get_feature_names()

        print('\nTotal number of features:', len(feature_names) )

        feature_wts = sorted(list(zip(feature_names,
                                      np.round(pipeline['clf'].coef_,2) )),
                             key = lambda x:x[1],
                             reverse=True)

        display(pd.DataFrame(feature_wts[:50], columns = ['feat','val']).T)

        print("\npredict validation data ")
        val_preds_arr1_tmp[:,fld] = pipeline.predict(less_toxic_text)
        val_preds_arr2_tmp[:,fld] = pipeline.predict(more_toxic_text)

        print("\npredict test data ")
        test_preds_arr_tmp[:,fld] = pipeline.predict(test_text)
    return val_preds_arr1_tmp, val_preds_arr2_tmp, test_preds_arr_tmp

# Toxic Training

In [None]:
# Model on the raw Toxic folds: character n-gram TF-IDF (3-5 characters,
# built inside word boundaries) feeding a Ridge regressor. The commented-out
# entries are alternatives that were tried and left disabled.
features = FeatureUnion(
    transformer_list=[
        #('vect1', LengthTransformer()),
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5),
                                  min_df=3, max_df=0.5)),
        #("vect4", TfidfVectorizer(min_df= 5, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{8,}\b')),
    ]
)
pipeline = Pipeline(
    steps=[
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
        #("clf",LinearRegression())
    ]
)

# Raw-text folds ("df_fld*.csv"): validation and test text are used as-is.
val_preds_arr1, val_preds_arr2, test_preds_arr = train_pipeline(
    pipeline,
    "df",
    n_folds,
    clean_prm=False,
)


# Toxic __clean__ Training

In [None]:
# Model on the cleaned Toxic folds: the same char n-gram TF-IDF + Ridge setup.
features = FeatureUnion(
    transformer_list=[
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5),
                                  min_df=3, max_df=0.5)),
    ]
)
pipeline = Pipeline(
    steps=[
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
    ]
)

# Cleaned folds ("df_clean_fld*.csv"): validation and test text are cleaned too.
val_preds_arr1c, val_preds_arr2c, test_preds_arrc = train_pipeline(
    pipeline,
    "df_clean",
    n_folds,
    clean_prm=True,
)


## Ruddit data Training

In [None]:
# Model on the Ruddit folds: the same char n-gram TF-IDF + Ridge setup.
features = FeatureUnion(
    transformer_list=[
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5),
                                  min_df=3, max_df=0.5)),
    ]
)
pipeline = Pipeline(
    steps=[
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
    ]
)

# Ruddit folds ("df2_fld*.csv"): validation and test text are used as-is.
val_preds_arr1_, val_preds_arr2_, test_preds_arr_ = train_pipeline(
    pipeline,
    "df2",
    n_folds,
    clean_prm=False,
)


## Multilingual data Training

In [None]:
# Model on the multilingual folds: the same char n-gram TF-IDF + Ridge setup.
# The commented-out entries are alternatives that were tried and left disabled.
features = FeatureUnion(
    transformer_list=[
        #('vect1', LengthTransformer()),
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5),
                                  min_df=3, max_df=0.5)),
        #("vect4", CountVectorizer(min_df= 5, max_df=0.3, analyzer = 'word', ngram_range = (2,3), token_pattern=r'(?u)\b\w{3,}\b', binary=True))
    ]
)
pipeline = Pipeline(
    steps=[
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
        #("clf",LinearRegression())
    ]
)

# Multilingual folds ("dfm_fld*.csv") hold cleaned text, so validation and
# test text are cleaned as well.
val_preds_arr1m, val_preds_arr2m, test_preds_arrm = train_pipeline(
    pipeline,
    "dfm",
    n_folds,
    clean_prm=True,
)


In [None]:
# del df, pipeline, feature_wts
# gc.collect()

# Validate the pipeline 

In [None]:
# For every model: average the fold predictions per comment, then report the
# share of validation pairs where the "less toxic" comment scored lower than
# the "more toxic" one.
print(" Toxic data ")
p1, p2 = val_preds_arr1.mean(axis=1), val_preds_arr2.mean(axis=1)
print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

print(" Ruddit data ")
p3, p4 = val_preds_arr1_.mean(axis=1), val_preds_arr2_.mean(axis=1)
print(f'Validation Accuracy is { np.round((p3 < p4).mean() * 100,2)}')

print(" Toxic CLEAN data ")
p5, p6 = val_preds_arr1c.mean(axis=1), val_preds_arr2c.mean(axis=1)
print(f'Validation Accuracy is { np.round((p5 < p6).mean() * 100,2)}')

print(" Toxic Mulitlingual data ")
p7, p8 = val_preds_arr1m.mean(axis=1), val_preds_arr2m.mean(axis=1)
print(f'Validation Accuracy is { np.round((p7 < p8).mean() * 100,2)}')


## Optimize the model weights for ensemble

In [None]:

@timer
def optimize_wts():
    """Grid-search the four ensemble weights on the validation pairs.

    The objective is the negated share of pairs whose weighted "less toxic"
    score is below the weighted "more toxic" score, so minimising it
    maximises pairwise accuracy. Weight order: Toxic, Ruddit, Toxic clean,
    multilingual (module-level ``p1``..``p8``).

    Returns the full output of ``scipy.optimize.brute``: best weights,
    objective value there, the evaluation grid and the objective on the grid.
    """
    def neg_pair_accuracy(x):
        less_side = x[0]*p1 + x[1]*p3 + x[2]*p5 + x[3]*p7
        more_side = x[0]*p2 + x[1]*p4 + x[2]*p6  + x[3]*p8
        return -1*((less_side < more_side).mean())

    # The Toxic model gets a higher weight range than the other three models.
    toxic_range = slice(0.20, 0.6, 0.015)
    other_range = slice(0.05, 0.5, 0.015)

    # finish=None: keep the best grid point, no local polishing afterwards.
    return optimize.brute(neg_pair_accuracy,
                          (toxic_range, other_range, other_range, other_range),
                          full_output=True,
                          finish=None)
resbrute = optimize_wts()

print(resbrute[0])  # weights at the best grid point
print(resbrute[1]*-1)  # pairwise validation accuracy at that point
    

In [None]:
# Best weights from the grid search, in the order:
# Toxic (raw), Ruddit, Toxic (cleaned), multilingual.
w1,w2,w3,w4 = resbrute[0]
#print(best_wts)

# Weighted ensemble score for the "less toxic" side (p1_wt) and the
# "more toxic" side (p2_wt) of every validation pair.
p1_wt = w1*p1 + w2*p3 + w3*p5 + w4*p7
p2_wt = w1*p2 + w2*p4 + w3*p6 + w4*p8


## Analyze bad predictions 
### Incorrect predictions with similar scores
### Incorrect predictions with different scores

In [None]:
# Attach the ensemble scores to the validation pairs for error analysis.
df_val['p1'] = p1_wt  # score of the "less toxic" comment
df_val['p2'] = p2_wt  # score of the "more toxic" comment
df_val['diff'] = np.abs(p2_wt - p1_wt)  # gap between the two scores

# 1 when the pair is ordered as the annotator ordered it, else 0.
df_val['correct'] = (p1_wt < p2_wt).astype('int')


In [None]:

### Incorrect predictions with similar scores
# Misordered pairs whose "less toxic" score is below half of the maximum p1,
# closest calls (smallest score gap) first.

df_val[(df_val.correct == 0) & (df_val.p1 < 0.5*df_val.p1.max())].sort_values('diff', ascending=True).head(20)

In [None]:
# Misordered pairs whose "less toxic" score is above half of the maximum p1,
# closest calls (smallest score gap) first.
df_val[(df_val.correct == 0) & (df_val.p1 > 0.5*df_val.p1.max())].sort_values('diff', ascending=True).head(20)

#### Some of these just look incorrectly tagged 


In [None]:
### Incorrect predictions with dis-similar scores
# Misordered pairs with the largest score gap first, i.e. the most confident mistakes.

df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)

In [None]:
# Same view, restricted to gaps below 40% of the largest gap so the most
# extreme mistakes do not fill the top of the list.
df_val[(df_val.correct == 0) & (df_val['diff'] < 0.4*df_val['diff'].max())].sort_values('diff', ascending=False).head(20)


# Predict on test data 

In [None]:
# Predict using pipeline

# Final score per comment: for each model the predictions are averaged over
# its folds, then the four models are combined with the weights w1..w4
# (Toxic, Ruddit, Toxic clean, multilingual).
df_sub['score'] = w1*test_preds_arr.mean(axis=1) + \
                  w2*test_preds_arr_.mean(axis=1) + \
                  w3*test_preds_arrc.mean(axis=1) + \
                  w4*test_preds_arrm.mean(axis=1)

In [None]:
#test_preds_arr

## Correct the rank ordering

In [None]:
# Cases with duplicate scores: number of non-null scores minus the number of
# distinct score values.

df_sub['score'].count() - df_sub['score'].nunique()

In [None]:
# Ten most frequent score values: column `index` holds the score value,
# column `score` holds how often it occurs.
# The column names are set explicitly because pandas 2.0 changed the names
# produced by value_counts().reset_index() (there is no `index` column any more).
same_score = df_sub['score'].value_counts().rename_axis('index').reset_index(name='score')[:10]
same_score

In [None]:
# Show the comments whose score is one of the values listed in same_score['index'].
df_sub[df_sub['score'].isin(same_score['index'].tolist())]

In [None]:
# Same comments have same score - which is ok 

In [None]:
# # Rank the predictions 

# df_sub['score']  = scipy.stats.rankdata(df_sub['score'], method='ordinal')

# print(df_sub['score'].rank().nunique())

In [None]:
# Write the submission file: one row per comment with its ensemble score.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)