In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, SimpleRNN
from keras.models import Sequential
import pandas as pd
import numpy as np
#importing libraries

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

#mounting google drive

Mounted at /content/gdrive


In [3]:
data = pd.read_csv('gdrive/My Drive/data.csv')
#importing csv file

raw_text = data['message']

df_species = pd.get_dummies(data["species"])
#creating dummies of species for categorical classification

df_tail = np.array(data["tail"].apply(lambda x: (x=="yes")*1))
df_fingers = np.array(data["fingers"])
#cleaning data

y_train = np.array(df_species)*1


In [5]:
max_len=100
max_words=10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(raw_text)
sequences = tokenizer.texts_to_sequences(raw_text)
word_index = tokenizer.word_index
print(len(word_index))

data = pad_sequences(sequences, maxlen=max_len)

#Tokenizing raw text data for Model's use

636


In [6]:
x_train = data.T
x_train = np.vstack((x_train, df_tail))
x_train = np.vstack((x_train, df_fingers)).T

#Adding Tail and Fingers Data to Tokenised Text Data

In [7]:
print(x_train, x_train.shape)
print(y_train, y_train.shape)

[[  0   0   0 ...  91   0   4]
 [  0   0   0 ... 134   1   5]
 [  0   0   0 ...   6   1   5]
 ...
 [  0   0   0 ...  57   1   6]
 [  0   0   0 ... 635   1   2]
 [  0   0   0 ...   2   1   4]] (500, 102)
[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (500, 10)


In [32]:
model=Sequential()
model.add(Embedding(10000, 1024))
model.add(LSTM(1024))
model.add(Dense(10, activation = 'sigmoid'))
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])
#Creating a LSTM based Neural Network

In [33]:
history=model.fit(x_train, y_train, epochs=15, batch_size=8, validation_split=0.02)

#Training the model

Epoch 1/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - acc: 0.0842 - loss: 0.4247 - val_acc: 0.1000 - val_loss: 0.3772
Epoch 2/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - acc: 0.1249 - loss: 0.3333 - val_acc: 0.0000e+00 - val_loss: 0.3365
Epoch 3/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - acc: 0.1695 - loss: 0.3185 - val_acc: 0.3000 - val_loss: 0.3320
Epoch 4/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - acc: 0.2002 - loss: 0.3011 - val_acc: 0.1000 - val_loss: 0.3643
Epoch 5/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - acc: 0.2910 - loss: 0.2783 - val_acc: 0.4000 - val_loss: 0.2450
Epoch 6/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - acc: 0.3349 - loss: 0.2683 - val_acc: 0.5000 - val_loss: 0.2258
Epoch 7/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - acc

In [34]:
data = pd.read_csv('gdrive/My Drive/test.csv')

test_text = data['message']
df_tail = np.array(data["tail"].apply(lambda x: (x=="yes")*1))
df_fingers = np.array(data["fingers"])

#Importing the test data and cleaning


In [35]:
max_len=100
max_words=10000

sequences = tokenizer.texts_to_sequences(test_text)
data = pad_sequences(sequences, maxlen=max_len)

#Tokenizing Raw test text



In [36]:
x_test = data.T
x_test = np.vstack((x_test, df_tail))
x_test = np.vstack((x_test, df_fingers)).T

#Adding Tail and Fingers Data to Tokenized Text Data

In [37]:
prediction = model.predict(x_test)
#Creating predctions based on test data

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


In [38]:
output = np.argmax(prediction, axis=1)
#Get highest probability class

In [15]:
species_headers = list(df_species)
species_headers

['Aquari',
 'Cybex',
 'Emotivor',
 'Faerix',
 'Florian',
 'Mythron',
 'Nexoon',
 'Quixnar',
 'Sentire',
 'Zorblax']

In [49]:
result = []
for key in output:
  result.append(species_headers[key])
result = pd.DataFrame(np.array(result))
print(result[0])

#Converting class indices to class names and saving in result array

result.to_csv("result.csv", index=False)
#Exporting the predictions

0        Aquari
1       Sentire
2       Florian
3        Faerix
4        Nexoon
         ...   
294     Mythron
295      Nexoon
296     Mythron
297     Quixnar
298    Emotivor
Name: 0, Length: 299, dtype: object
