## Downloading Data

In [7]:
import zipfile
import os
from kaggle.api.kaggle_api_extended import KaggleApi


def get_kaggle_data_set(
    project: str,
    document: str,
    directory_to_extract_to: str
    ) -> None:
    """Download one file of a Kaggle dataset and unpack it into a directory.

    The Kaggle client is authenticated, asked to download ``document`` from
    ``project``, and the resulting archive is extracted into
    ``directory_to_extract_to``. The archive itself is deleted afterwards,
    whether or not the extraction succeeded.

    Args:
        project: Dataset identifier passed to the Kaggle API
            (the caller in this notebook uses the ``"owner/dataset"`` form).
        document: Name of the file inside the dataset to download.
        directory_to_extract_to: Folder that receives the extracted content.

    Raises:
        FileNotFoundError: If ``<document>.zip`` is not found after the
            download.
        zipfile.BadZipFile: If the downloaded file is not a valid archive.
    """
    api = KaggleApi()
    api.authenticate()

    # No target path is given, and the archive is then opened through a
    # relative name, so this code relies on the download landing in the
    # current working directory as "<document>.zip".
    # NOTE(review): confirm the installed kaggle client still zips this file.
    api.dataset_download_file(project, document)
    zip_document = document + ".zip"
    try:
        with zipfile.ZipFile(zip_document, 'r') as zip_ref:
            zip_ref.extractall(directory_to_extract_to)
    finally:
        # Always clean up the temporary archive, even when extraction fails,
        # so a broken download is not left behind in the working directory.
        if os.path.exists(zip_document):
            os.remove(zip_document)


# Kaggle coordinates of the dataset and the local folder that receives it.
project = "andrewmvd/cyberbullying-classification"
document = "cyberbullying_tweets.csv"
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine that has this exact folder layout.
directory_to_extract_to = r"C:\Users\EMILIO\Documents\Python Scripts\CyberbullingProject\model\data"

get_kaggle_data_set(
    project=project,
    document=document,
    directory_to_extract_to=directory_to_extract_to,
)


## Train-Test-Split

In [1]:
import pandas as pd
from pathlib import WindowsPath
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw tweets from the extracted CSV.
tweets_path = WindowsPath(r"C:\Users\EMILIO\Documents\Python Scripts\CyberbullingProject\model\data\cyberbullying_tweets.csv")
tweets_data = pd.read_csv(tweets_path)

# Binary target: 1 for every label other than "not_cyberbullying", else 0.
tweets_data["is_cyberbullying"] = (
    tweets_data["cyberbullying_type"]
    .ne("not_cyberbullying")
    .astype(int)
)

In [3]:
# Features are the raw tweet texts; the target is the binary label.
X = tweets_data["tweet_text"]
Y = tweets_data["is_cyberbullying"]

# Stage 1: hold out 10% of all rows as the test set.
X_tr_val, X_test, y_tr_val, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=33,
)

# Stage 2: hold out 10% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_tr_val,
    y_tr_val,
    test_size=0.1,
    random_state=12,
)


In [4]:
# Class balance of the training labels (1 = cyberbullying, 0 = not).
y_train.value_counts()

1    32223
0     6406
Name: is_cyberbullying, dtype: int64

## Undersampling

In [5]:
# Random undersampling: remove randomly chosen rows labelled 1 until both
# labels have the same number of training rows.
train_df = pd.DataFrame(
    X_train, columns=["tweet_text"], index=y_train.index.values)
train_df['is_cyberbullying'] = y_train

# Row counts per label.
is_cyber_mask = train_df["is_cyberbullying"] == 1
len_cyber = int(is_cyber_mask.sum())
len_not_cyber = int((train_df["is_cyberbullying"] == 0).sum())

# Number of label-1 rows to discard, and a seeded random pick of them.
n_del = len_cyber - len_not_cyber
indeces_to_del = (
    train_df.loc[is_cyber_mask]
    .sample(n=n_del, random_state=3)
    .index
)

# Keep every row whose index was not picked for deletion.
keep_mask = ~train_df.index.isin(indeces_to_del)
train_df_u = train_df.loc[keep_mask]

train_df_u["is_cyberbullying"].value_counts()


0    6406
1    6406
Name: is_cyberbullying, dtype: int64

In [6]:
# Reassign the training features and labels from the undersampled frame.
# train_df_u is the balanced frame; train_df still holds every training row,
# so reading from it would silently discard the undersampling.
X_train_u = train_df_u["tweet_text"]
y_train_u = train_df_u["is_cyberbullying"]

## Tokenization and Padding

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from typing import List, Union

# Plain Python list of the training texts.
X_train_list = X_train_u.to_list()
# Lazy view over the same texts; being a generator, it can be consumed once.
X_train_gen = (text for text in X_train_list)


In [8]:
# Vocabulary cap and the placeholder token used for out-of-vocabulary words.
vocab_size = 10000
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# Fit on the list rather than on X_train_gen: the generator is exhausted
# after one pass, so re-running this cell would fit a fresh tokenizer on
# nothing. The list yields the same texts and keeps the cell re-runnable.
tokenizer.fit_on_texts(X_train_list)

In [9]:
def get_padded_data(
    tokenizer: Tokenizer,
    data: Union[List[str], pd.Series],
    max_length: int = 700,
    trunc_type: str = "post"):
    """Convert texts to integer sequences of a fixed length.

    Each text is mapped to a sequence of token ids by ``tokenizer`` and the
    sequences are then brought to ``max_length`` with ``pad_sequences``.

    Args:
        tokenizer: Tokenizer used for the text-to-id mapping; it is only read
            here, so it is expected to have been fitted beforehand.
        data: The texts to convert, one string per element.
        max_length: Length of every returned sequence.
        trunc_type: Passed to ``pad_sequences`` as ``truncating``; selects
            which end of an over-long sequence is cut.

    Returns:
        The value returned by ``pad_sequences``: an array with one row per
        input text and ``max_length`` columns. The padding side is left at
        the ``pad_sequences`` default.
    """
    sequences = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(sequences, maxlen=max_length,
                       truncating=trunc_type)

    return padded


In [16]:
# Tokenize and pad the training, validation and test texts with the same
# tokenizer and the default sequence length.
X_train_p, X_val_p, X_test_p = (
    get_padded_data(tokenizer, texts)
    for texts in (X_train_u, X_val, X_test)
)

## Model

In [11]:
import tensorflow as tf

In [12]:
# Architecture hyper-parameters.
vocab_size = 10000
embedding_dim = 16
max_length = 700

# Embedding -> Flatten -> small ReLU layer -> single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Binary classification set-up: cross-entropy loss, Adam, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [17]:
# Train for a fixed number of epochs, evaluating on the validation split
# after each one.
num_epochs = 10
history = model.fit(
    x=X_train_p,
    y=y_train_u,
    epochs=num_epochs,
    validation_data=(X_val_p, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
