# Preparing Data and Model Building

In [1]:
#workflow and statistics
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt

#for showing missing values
import missingno as msno

#visualisation
import seaborn as sns

#natural language processing toolkit
import nltk
import string
from nltk.corpus import stopwords  # removes useless words
from nltk.stem.lancaster import LancasterStemmer  #converts the words to base form; aggressive
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace
from nltk.stem.api import StemmerI
from nltk.stem import SnowballStemmer

#create a wordcloud of often used words
import wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
nltk.download('stopwords')

#accessing google cloud storage
#from google.cloud import storage
#from io import BytesIO
#client = storage.Client()
#bucket = "bilderkennung_nf_2020"

#building baseline classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Grid search cross validation
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#train_data = pd.read_csv("gs://bilderkennung_nf_2020/data /jigsaw-toxic-comment-train.csv")

In [2]:
train_data = pd.read_csv('./jigsaw-toxic-comment-train.csv')

In [None]:
#valid_data = pd.read_csv("gs://bilderkennung_nf_2020/data /validation.csv")

In [3]:
valid_data = pd.read_csv("./validation.csv")

In [4]:
train_data = train_data.drop('id', axis=1)

In [5]:
train_data.head(2)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [6]:
valid_data = valid_data.drop('id', axis=1)

In [7]:
valid_data.head(2)

Unnamed: 0,comment_text,lang,toxic
0,Este usuario ni siquiera llega al rango de ...,es,0
1,Il testo di questa voce pare esser scopiazzato...,it,0


## Preprocessing

In [None]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
train_data['comment_text'] = train_data['comment_text'].apply(remove_punctuation)
valid_data['comment_text'] = valid_data['comment_text'].apply(remove_punctuation)

In [None]:
sw=stopwords.words('english')
def removesw(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)
train_data['comment_text'] = train_data['comment_text'].apply(removesw)
valid_data['comment_text'] = valid_data['comment_text'].apply(removesw)

In [None]:
stemmer = SnowballStemmer("english")

def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 
train_data['comment_text'] = train_data['comment_text'].apply(stemming)
valid_data['comment_text'] = valid_data['comment_text'].apply(stemming)

# Baseline Model: Logistic Regression

Logistic Regression is [well suited](https://blog.insightdatascience.com/always-start-with-a-stupid-model-no-exceptions-3a22314b9aaa) as a baseline model for classification and natural language processing. Baseline models take less tims to construct since there architecture is relatively simple. They are easier and faster to train, so you can iterate quickly through them. This advantage helps to deal with bugs that point to data issues, as well.
Hence, they give you information to build on in a short time. Baseline models function as benchmarks for more complex models. By studying the shortcomings and struggles of our baseline model we can make decisions on what complex model to deploy next. Therefore, the baseline model also gives a methodological orientation. 

**Vectorizer**

In [None]:
tfidf_vec = TfidfVectorizer(max_df=0.7,stop_words='english')

**Regression Model: predicting 'toxic'**

In [None]:
X = train_data['comment_text']
y = train_data['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_vec = tfidf_vec.fit_transform(X_train)
X_test_vec = tfidf_vec.transform(X_test)

log_toxic = LogisticRegression()
log_toxic.fit(X_train_vec,y_train)

predictions = log_toxic.predict(X_test_vec)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

Out baseline model (Logistic Regression) performs with 95% accuracy on the training data.
However, since we habe imbalanced data, we want to improve on the precision and recall values.

In [None]:
confusion_matrix = pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(confusion_matrix, annot=True)

print('Accuracy: ',metrics.accuracy_score(y_test, predictions))

In [None]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l2 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train_vec,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
accuracy : 0.950686690973072

**Resampling imbalanced data**

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
sm = SMOTE(random_state = 2)

X_train_res, y_train_res = sm.fit_sample(X_train_vec, y_train.ravel())

**Execute GridSearch for Model Optimization**

In [None]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l2 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train_res,y_train_res)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l2'}
accuracy : 0.9467282127031019

## Second Baseline Model LSTM

In the following steps, we will set the key model parameters and split the data.
- “MAX_NB_WORDS” sets the maximum number of words to consider as features for tokenizer.
- “MAX_SEQUENCE_LENGTH” cuts off texts after this number of words (among the MAX_NB_WORDS most common words).
- “VALIDATION_SPLIT” sets a portion of data for validation and not used in training.
- “EMBEDDING_DIM” defines the size of the “vector space”.
- “GLOVE_DIR” defines the GloVe file directory.
- Split the data into the texts and the labels.


In [8]:
import re
from tqdm import tqdm_notebook

from nltk.corpus import stopwords

from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

Using TensorFlow backend.


In [49]:
import os
import glob

In [68]:
MAX_NB_WORDS = 100000    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2   # data for validation (not used in training)
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (word2vec/GloVe)
#GLOVE_DIR = 


In [9]:
train = train_data

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = train[labels].values
comments_train = train['comment_text']
comments_train = list(comments_train)

## Preprocessing

remove stopwords, punctuation and make everything lowercase:

In [10]:
def clean_text(text, remove_stopwords = True):
    output = ""
    text = str(text).replace("\n", "")
    text = re.sub(r'[^\w\s]','',text).lower()
    if remove_stopwords:
        text = text.split(" ")
        for word in text:
            if word not in stopwords.words("english"):
                output = output + " " + word
    else:
        output = text
    return str(output.strip())[1:-3].replace("  ", " ")
    
texts = [] 

for line in tqdm_notebook(comments_train, total=159571): 
    texts.append(clean_text(line))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




 Have a look a sample data:

In [12]:
print('Sample data:', texts[1], y[1])

Sample data: aww matches background colour im seemingly stuck thanks talk 2151 january 11 2016  [0 0 0 0 0 0]


### Tokenizer

- We create a tokenizer, configured to only take into account the MAX_NB_WORDS most common words.
- We build the word index.
- We can recover the word index that was computed

In [13]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 431032


Turns the lists of integers into a 2D integer tensor of shape (samples, maxlen)
Pad after each sequence.

In [14]:
data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (223549, 200)
Shape of label tensor: (223549, 6)


Shuffle the data:

In [15]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]

Create the train-validation split:

In [16]:
num_validation_samples = int(VALIDATION_SPLIT*data.shape[0])
x_train = data[: -num_validation_samples]
y_train = labels[: -num_validation_samples]
x_val = data[-num_validation_samples: ]
y_val = labels[-num_validation_samples: ]
print('Number of entries in each category:')
print('training: ', y_train.sum(axis=0))
print('validation: ', y_val.sum(axis=0))

Number of entries in each category:
training:  [17069  1549  9702   553  8995  1695]
validation:  [4315  413 2438  136 2309  422]


## Create the model

We will use pre-trained GloVe vectors from Stanford to create an index of words mapped to known embeddings, by parsing the data dump of pre-trained embeddings.
Then load word embeddings into an embeddings_index

In [74]:
embeddings_index = {}
#with open("GloVe/glove.6B/glove.6B.50d.txt", 'r') as f:
f= open("glove.6B.100d.txt","w+")
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    f.close()
    print("Done.\n Proceeding with Embedding Matrix...", end="")

    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(" Completed!")

IndentationError: unexpected indent (<ipython-input-74-72f19db7fe6b>, line 4)