<a href="https://colab.research.google.com/github/emilyliublair/Machine-Learning-Projects/blob/main/text_classification_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
  # Shell escape: install the nightly TensorFlow build with pip before importing it.
  !pip install tf-nightly
except Exception:
  # Best-effort install: any exception raised above is deliberately ignored.
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# Shell escape: install tensorflow-datasets so the import on the next line can succeed.
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt


# Show which TensorFlow version was actually imported.
print(tf.__version__)

In [None]:
# get data files
# Shell escapes: fetch the training and validation TSV files with wget.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Relative paths of the two downloaded files; they are read with pandas
# in the next cell. Note that the "valid" file is used as the test set.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
#load, split, encode data
# Every line of the TSV files, including the first, is a (label, message)
# record. Reading with header=None and explicit column names keeps the first
# record as ordinary data, so it no longer has to be recovered from the column
# labels and re-inserted by hand with an index shift.
COLUMN_NAMES = ['indicator', 'text']
train_dataset = pd.read_csv(train_file_path, sep='\t', header=None, names=COLUMN_NAMES)
test_dataset = pd.read_csv(test_file_path, sep='\t', header=None, names=COLUMN_NAMES)

# Split the label column off; afterwards the frames hold only the 'text' column.
# map + astype produces a real integer Series (ham -> 0, spam -> 1) instead of
# relying on replace() to downcast an object column, and it raises on any
# label other than 'ham'/'spam' rather than silently keeping a string.
LABEL_CODES = {'ham': 0, 'spam': 1}
train_eval = train_dataset.pop('indicator').map(LABEL_CODES).astype('int64')
test_eval = test_dataset.pop('indicator').map(LABEL_CODES).astype('int64')

vocab = {}          # word -> integer id, filled in by one_hot_encoding
word_encoding = 1   # next unused id; ids start at 1
def one_hot_encoding(text):
  """Turn a message into a list of integer word ids.

  The text is lower-cased and split on single spaces. A word that is not yet
  in the module-level ``vocab`` is registered under the next free id (taken
  from the module-level ``word_encoding`` counter, which is then advanced).

  Args:
    text: the message to encode.

  Returns:
    A list with one integer id per token, in token order.
  """
  global word_encoding

  codes = []
  for token in text.lower().split(" "):
    if token not in vocab:
      # First sighting of this token: give it the next id.
      vocab[token] = word_encoding
      word_encoding += 1
    codes.append(vocab[token])

  return codes

# Encode every message exactly once, training set first and then the test
# set, so vocabulary ids are handed out in the same order as before.
# The ids returned by one_hot_encoding are used directly. Re-splitting the raw
# text and looking each word up again skipped the lower-casing that
# one_hot_encoding applies (KeyError on any capitalised word), and writing the
# result back through chained indexing (df.loc[x]['text'] = ...) is not
# guaranteed to modify the DataFrame.
train_dataset = [one_hot_encoding(message) for message in train_dataset['text']]
test_dataset = [one_hot_encoding(message) for message in test_dataset['text']]

# Word ids start at 1 and 0 is used as the padding value, so the id space
# (and therefore the embedding table) has len(vocab) + 1 entries.
VOCAB_SIZE = len(vocab) + 1

# The encoded messages have different lengths; NumPy only builds an array from
# ragged nested lists when dtype=object is requested explicitly.
train_dataset = np.array(train_dataset, dtype=object)
test_dataset = np.array(test_dataset, dtype=object)
# Labels are 0/1; force an integer dtype so they never reach the model as objects.
train_eval = np.array(train_eval, dtype='int64')
test_eval = np.array(test_eval, dtype='int64')
print(train_dataset)
print(test_dataset)
print(train_eval)
print(test_eval)

In [None]:
#preprocess data
# Word ids run from 1 to len(vocab) and 0 is the padding value, so the
# embedding layer needs len(vocab) + 1 rows (maximum index + 1). Using
# len(vocab) leaves the highest word id out of range.
VOCAB_SIZE = len(vocab) + 1
# Pad/truncate every sequence to 250 ids; this length must stay in sync with
# the 250 used when encoding text for prediction.
train_dataset = keras.preprocessing.sequence.pad_sequences(train_dataset, maxlen=250)
test_dataset = keras.preprocessing.sequence.pad_sequences(test_dataset, maxlen=250)

In [None]:
#create model
# Three stacked layers: a 32-dimensional embedding over VOCAB_SIZE ids,
# an LSTM with 32 units, and a single sigmoid output unit.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(VOCAB_SIZE, 32),
  tf.keras.layers.LSTM(32),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Display the summary of the model defined in the previous cell.
model.summary()

In [None]:
#train model
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as 'acc'.
model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['acc'],
)
# Ten passes over the padded training sequences; the returned History object is kept.
history = model.fit(x=train_dataset, y=train_eval, epochs=10)

In [None]:
#function for encoding text
def encode_text(text):
  """Encode a raw message as a fixed-length sequence of word ids.

  The text is tokenised with Keras' ``text_to_word_sequence``; each token is
  replaced by its id in the module-level ``vocab`` (0 when the token is not in
  the vocabulary) and the result is padded/truncated to 250 entries.

  NOTE(review): the vocabulary keys were produced by ``lower().split(" ")``,
  which is a different tokeniser from the one used here; confirm that the
  mismatch is intended.

  Args:
    text: the message to encode.

  Returns:
    The first (only) row produced by ``pad_sequences`` for this message.
  """
  words = keras.preprocessing.text.text_to_word_sequence(text)
  ids = [vocab.get(word, 0) for word in words]
  padded = keras.preprocessing.sequence.pad_sequences([ids], 250)
  return padded[0]

# Demo: encode a sample sentence and show the resulting id sequence.
# `encoded` is reused by the decoding demo in the next cell.
text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

In [None]:
#function for decoding numbers
# Inverse of `vocab`: maps each integer id back to its word.
reverse_vocab = {value:key for (key, value) in vocab.items()}

def decode_integers(integers):
  """Translate a sequence of word ids back into a space-separated string.

  Ids equal to 0 (the padding value) are skipped; every other id is looked up
  in the module-level ``reverse_vocab``.

  Args:
    integers: iterable of integer word ids.

  Returns:
    The decoded words joined by single spaces ("" when nothing is decoded).
  """
  PAD = 0
  words = [reverse_vocab[code] for code in integers if code != PAD]
  return " ".join(words)

# Demo: decode the sequence produced by the encoding demo; ids equal to 0 are skipped.
print(decode_integers(encoded))



In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify a single message as 'ham' or 'spam' with the trained model.

  Args:
    pred_text: the raw message text.

  Returns:
    A two-element list ``[probability, label]`` where ``probability`` is the
    model output as a Python float and ``label`` is ``'ham'`` when the
    probability is <= 0.5 and ``'spam'`` otherwise.
  """
  encoded_text = encode_text(pred_text)
  # Add a leading batch axis: the model expects shape (batch, sequence_length).
  # The token ids keep their integer dtype instead of being copied into floats.
  batch = np.asarray(encoded_text)[np.newaxis, :]
  result = model.predict(batch)
  # result has one row with one sigmoid output; unwrap it to a plain float so
  # the probability is returned as a number rather than being coerced to a
  # string by np.append.
  prediction = float(result[0][0])
  decision = "ham" if prediction <= .5 else "spam"
  return [prediction, decision]

# Demo: run one sample message through predict_message and show the result.
pred_text = "wow, is your arm alright. that happened to me one time too"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  # Fixed sample messages, paired position-by-position with test_answers below.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # Only the label at index 1 of each prediction is compared with the expected answer.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
