In [1]:
import gc
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import torch
import transformers as ppb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tqdm import tqdm

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        candidates for downcasting. Other dtypes are left untouched.
    verbose : bool, default True
        When True, print the memory footprint after optimisation and the
        percentage saved. When False, nothing is printed.
    allow_float16 : bool, default True
        When True (historical behaviour) float columns whose range fits are
        stored as float16. float16 only keeps about three significant decimal
        digits, so pass False to use float32 as the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range for the narrow types, infinite, or all-NaN.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a zero-byte baseline cannot raise ZeroDivisionError.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Converting the sentences into tokens 

In [None]:
# Path of the training CSV, relative to the notebook's working directory.
TRAIN_CSV_PATH = "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Downcast the numeric columns of `train` to narrower dtypes, then ask the
# garbage collector to release whatever the conversion left unreferenced.
train = reduce_mem_usage(train)
gc.collect()

In [None]:
# Frequency of each distinct raw `target` score.
train['target'].value_counts()

In [None]:
# Positions (0-based row numbers, not index labels) of comments longer than
# MAX_COMMENT_CHARS characters. The length is measured in characters, not tokens.
MAX_COMMENT_CHARS = 512

comment_lengths = train['comment_text'].str.len()
# Missing comments have no length; treat them as "not too long" instead of
# crashing on len(NaN) as a per-row Python loop would.
is_too_long = (comment_lengths > MAX_COMMENT_CHARS).fillna(False).to_numpy(dtype=bool)
indexes = np.flatnonzero(is_too_long).tolist()
gc.collect()

In [None]:
# Rows at the positions collected in `indexes`. Take an explicit copy so that
# adding columns to `df` later cannot trigger pandas' SettingWithCopyWarning
# or be mistaken for a write into `train`.
df = train.iloc[indexes].copy()

In [None]:
# Display the subset of rows selected by `indexes`.
df

In [None]:
# Binary label: 1 when the `target` score is at least 0.5, otherwise 0.
meets_threshold = df['target'] >= .5
df['toxic'] = np.where(meets_threshold, 1, 0)

In [None]:
# Count of each binary label within the `df` subset.
df['toxic'].value_counts()

In [None]:
# Binary label for the full frame: 1 when `target` is at least 0.5, otherwise 0.
train_meets_threshold = train['target'] >= .5
train['toxic'] = np.where(train_meets_threshold, 1, 0)

In [None]:
# Count of each binary label across the whole of `train`.
train['toxic'].value_counts()

In [None]:
# Run a garbage-collection pass to free unreferenced objects.
gc.collect()

In [None]:
# Remove the rows listed in `indexes` from `train`.
# NOTE(review): `indexes` was built from enumerate() positions, while
# `drop(index=...)` matches index *labels*. The two coincide only while `train`
# still has its default 0..n-1 index — confirm before reindexing or filtering
# earlier in the notebook. Re-running this cell raises KeyError because the
# labels are already gone.
train = train.drop(index= indexes)

In [None]:
# Preview the first 30 remaining rows.
train.head(30)

In [None]:
# Pretrained checkpoint plus the matching model and tokenizer classes.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# Instantiate the tokenizer and the model from the named checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
gc.collect()

In [None]:
# The position list is no longer needed; unbind it and collect garbage.
del indexes
gc.collect()

In [None]:
# Use the binarized `toxic` column (target >= 0.5) as the classification label.
# The raw `target` is a continuous score, which LogisticRegression.fit rejects
# with "Unknown label type: continuous".
labels = train["toxic"]

In [None]:
 # Coerce every comment to `str` before it is handed to the tokenizer.
 train['comment_text']= train['comment_text'].astype(str)

In [None]:
 # Display `train` after the cleaning steps above.
 train

In [None]:
# Encode every comment into DistilBERT token ids, including the special tokens.
# The earlier length filter counts characters, so a 512-character comment can
# still produce more than 512 tokens once the special tokens are added.
# Truncating to 512 keeps every sequence within the model's position-embedding
# table.
tokenized = train['comment_text'].apply(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
    )
)
gc.collect()

In [None]:
# Both frames are no longer referenced below; unbind them and collect garbage.
del train, df
gc.collect()

In [None]:
# Number of tokenized comments.
len(tokenized)

In [None]:
# Longest token sequence in the corpus (0 when there are no sequences).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every sequence with id 0 so all rows share the same length.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])
gc.collect()

In [None]:
# `max_len` was only needed for padding; unbind it and collect garbage.
del max_len
gc.collect()


In [None]:
# `padded` is already an ndarray; read its shape directly rather than making a
# second full copy of it through np.array() just to print the dimensions.
print(padded.shape)
gc.collect()

In [None]:
# 1 where `padded` holds a non-zero id, 0 where it holds the padding value 0.
is_real_token = padded != 0
attention_mask = np.where(is_real_token, 1, 0)
print(attention_mask.shape)
gc.collect()

In [None]:
# The ragged token lists are superseded by `padded`; unbind them and collect garbage.
del tokenized
gc.collect()

In [None]:
# Number of comments pushed through the model per forward pass.
BATCH_SIZE = 32

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Run the model in mini-batches: one forward pass over the whole corpus would
# have to hold the hidden states of every token of every comment at once.
# Only sequence position 0 is kept from each batch, which is the only position
# read from `last_hidden_states` afterwards.
first_position_chunks = []
with torch.no_grad():
    for start in tqdm(range(0, input_ids.shape[0], BATCH_SIZE)):
        stop = start + BATCH_SIZE
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # clone() detaches the slice from the full batch tensor so that the
        # remaining positions can be freed immediately.
        first_position_chunks.append(batch_output[0][:, :1, :].clone())

# Same access pattern as the model's own output: element 0 is a tensor of shape
# (rows, 1, hidden) holding position 0 only, so `last_hidden_states[0][:, 0, :]`
# still works.
last_hidden_states = (torch.cat(first_position_chunks, dim=0),)

In [None]:
# Take element 0 of the model output and keep only sequence position 0 for
# every row, giving one feature vector per comment as a NumPy array
# (presumably the [CLS] embedding — confirm against the tokenizer).
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
# Fix the shuffle seed so the train/test partition, and therefore the reported
# score, is the same on every Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [None]:
# Fit a logistic-regression classifier on the training split of the features.
# `LogisticRegression` lives in `sklearn.linear_model` and must be imported in
# the notebook's import cell.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
# Score the fitted classifier on the held-out split.
lr_clf.score(test_features, test_labels)