In [2]:
import re
import numpy as np
import scipy
import itertools
import matplotlib
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from collections import Counter
from random import choice
import pandas as pd

test_data = pd.read_csv('/content/drive/MyDrive/poster_presentation/dataset/test.csv')
train_data = pd.read_csv('/content/drive/MyDrive/poster_presentation/dataset/train.csv')

In [3]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [4]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [5]:
%tensorflow_version 2.x
import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  with tf.device('/cpu:0'):
    random_image_cpu = tf.random.normal((100, 100, 100, 3))
    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
    return tf.math.reduce_sum(net_cpu)

def gpu():
  with tf.device('/device:GPU:0'):
    random_image_gpu = tf.random.normal((100, 100, 100, 3))
    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
    return tf.math.reduce_sum(net_gpu)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images (batch x height x width x channel). Sum of ten runs.
CPU (s):
2.9673076980000133
GPU (s):
0.03925987599996006
GPU speedup over CPU: 75x


In [None]:
#preprocess
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import  stopwords
import string
import re
    
train_data['length'] = train_data['comment_text'].str.len()
train_data.head(10)

# Convert all messages to lower case
train_data['comment_text'] = train_data['comment_text'].str.lower()

# Replace email addresses with 'email'
train_data['comment_text'] = train_data['comment_text'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
train_data['comment_text'] = train_data['comment_text'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
train_data['comment_text'] = train_data['comment_text'].str.replace(r'£|\$', 'dollars')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
train_data['comment_text'] = train_data['comment_text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumber')


# Replace numbers with 'numbr'
train_data['comment_text'] = train_data['comment_text'].str.replace(r'\d+(\.\d+)?', 'numbr')


train_data['comment_text'] = train_data['comment_text'].apply(lambda x: ' '.join(
    term for term in x.split() if term not in string.punctuation))

stop_words = set(stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'])
train_data['comment_text'] = train_data['comment_text'].apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

lem=WordNetLemmatizer()
train_data['comment_text'] = train_data['comment_text'].apply(lambda x: ' '.join(
 lem.lemmatize(t) for t in x.split()))

train_data['clean_length'] = train_data.comment_text.str.len()

one_hot_labels=np.array(train_data.iloc[:,2:8])
one_hot_labels=np.array(train_data.iloc[:,2:8])
class1=np.array(train_data.iloc[:,2])
class2=np.array(train_data.iloc[:,3])
class3=np.array(train_data.iloc[:,4])
class4=np.array(train_data.iloc[:,5])
class5=np.array(train_data.iloc[:,6])
class6=np.array(train_data.iloc[:,7])

lang_tokenizer = tf.keras.preprocessing.text.Tokenizer()
lang_tokenizer.fit_on_texts(train_data['comment_text'])

## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
tensor = lang_tokenizer.texts_to_sequences(train_data['comment_text']) 
tensor=tf.keras.preprocessing.sequence.pad_sequences(tensor)

vocab_size=len(lang_tokenizer.word_counts.keys())
max_len=tensor.shape[-1]
num_classes=6

#data processing - create list of comment_texts
train_split = []
for row in train_data['comment_text']:
    train_split.append(row)


from numpy import array
import tensorflow
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten , Input
from tensorflow.keras.layers import Embedding,LSTM
from tensorflow.keras.models import Model


# define the model
model = Sequential()


input_shape=(max_len,)

inp=Input(input_shape)
emb=Embedding(vocab_size, 30)(inp)
lstm=LSTM(128,return_sequences=False)(emb)
dense=Dense(64)(lstm)
output_layers=[]

#for i in range (0,num_classes):
output_layers.append(Dense(1, activation='sigmoid',name='Malignant')(dense))
output_layers.append(Dense(1, activation='sigmoid',name='Highly_Malignant')(dense))
output_layers.append(Dense(1, activation='sigmoid',name='Rude')(dense))
output_layers.append(Dense(1, activation='sigmoid',name='Threat')(dense))
output_layers.append(Dense(1, activation='sigmoid',name='Abuse')(dense))
output_layers.append(Dense(1, activation='sigmoid',name='Loathe')(dense))

adam = tf.keras.optimizers.Adam(learning_rate=0.001)

model = Model (inputs=inp,outputs=output_layers)
model.compile(optimizer=adam, loss='categorical_crossentropy',
            metrics=['accuracy', tf.keras.metrics.Precision(), tensorflow.keras.metrics.Recall()])

# summarize the model
print(model.summary())
# fit the model
model.fit(tensor, [class1, class2, class3, class4, class5, class6], epochs=100,batch_size=1 ,verbose=1,validation_steps=None)
# evaluate the model
#loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
#print('Accuracy: %f' % (accuracy*100))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 1349)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 1349, 30)     5905140     ['input_4[0][0]']                
                                                                                                  
 lstm_3 (LSTM)                  (None, 128)          81408       ['embedding_3[0][0]']            
                                                                                                  
 dense_3 (Dense)                (None, 64)           8256        ['lstm_3[0][0]']                 
                                                                                            

# New section

In [None]:
model.save('IP40_model1.hdf5')

# New section