# Research Question

In [26]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import numpy as np

# Data Preparation

## Import Data

In [24]:
# Load the tab-separated "sentence<TAB>label" file into a two-column DataFrame.
#  * encoding is pinned so parsing does not depend on the platform default
#    (assumes the file is UTF-8/ASCII -- TODO confirm against the data source).
#  * newline='' is what the csv module requires when it is given a file object.
#  * QUOTE_NONE treats double quotes in the review text as ordinary characters,
#    so a sentence that starts with a quotation mark cannot swallow the tab and
#    the rows that follow it (assumes the file is raw text, not CSV-quoted).
with open('data/amazon_cells_labelled.txt', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    data = list(reader)

# csv.reader yields strings only, so both columns (including `label`) hold str values.
amazon = pd.DataFrame(data, columns=['sentence', 'label'])

amazon.head()

Unnamed: 0,sentence,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [29]:
# Turn every sentence into a list of integer word ids.
# NOTE(review): X_train and X_val are both produced from the full `sentence`
# column, so X_val is NOT a held-out set -- any metric computed on it is a
# training metric. The proper fix is to split first (train_test_split is
# already imported in this notebook) and fit the tokenizer on the training
# part only; that changes the contents and length of both variables, so it
# has to be done together with whichever cells consume them.
tok = keras.preprocessing.text.Tokenizer()
tok.fit_on_texts(amazon['sentence'])
X_train = tok.texts_to_sequences(amazon['sentence'])
X_val = tok.texts_to_sequences(amazon['sentence'])

# Token counts per sentence across both sets; the max and mean guide the
# choice of padding length in the next step.
lengths = [len(seq) for seq in X_train + X_val]
print(f'Max length of sentence: {max(lengths)}')
print(f'Average length of sentence: {np.mean(lengths)}')

Max length of sentence: 30
Average length of sentence: 10.29


In [31]:
# Bring every sequence to exactly MAX_LEN tokens (10 is roughly the average
# sentence length). Shorter sentences are zero-padded at the end; longer ones
# are TRUNCATED from the front (truncating='pre' is the Keras default, spelled
# out here), so only the last MAX_LEN tokens of a long sentence survive.
MAX_LEN = 10
X_train = keras.preprocessing.sequence.pad_sequences(
    X_train, maxlen=MAX_LEN, padding='post', truncating='pre'
)
X_val = keras.preprocessing.sequence.pad_sequences(
    X_val, maxlen=MAX_LEN, padding='post', truncating='pre'
)

# Inverse of the tokenizer vocabulary: integer id -> word.
reverse_word_map = {index: word for word, index in tok.word_index.items()}

# Decode the first padded sequence back to text; id 0 is the padding value and is skipped.
' '.join(reverse_word_map[i] for i in X_train[0] if i != 0)

'here in the us unless i go by a converter'