Step 1: Preprocess the dataset

In [None]:
# Install dependencies
%conda install pandas numpy tensorflow sklearn keras nltk keras-tuner

In [None]:
#import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras.api._v2.keras as keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
import keras_tuner as kt

#download packages from nltk
# 'punkt' is what older NLTK releases load for word_tokenize; newer releases
# look for 'punkt_tab' instead, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Load the raw CSV into pandas. Column names are supplied explicitly, so the
# first line of the file is read as data (spelled out here with header=None).
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=["sentiment", "ids", "date", "flag", "user", "text"],
)

In [None]:
# Discard the four metadata columns in place; only 'sentiment' and 'text'
# remain afterwards.
df.drop(columns=['ids', 'date', 'flag', 'user'], inplace=True)

In [None]:
#cleaning text
def caydranisabum(frame=None):
    """Clean the 'text' column of a tweet dataframe in place.

    Steps, in order:
      1. drop rows whose 'text' is missing,
      2. drop rows whose raw 'text' duplicates an earlier row,
      3. lowercase the text and strip every character that is neither a
         word character nor whitespace (i.e. punctuation/symbols).

    Note that de-duplication runs on the raw text, so tweets that only become
    identical after step 3 are both kept.

    Args:
        frame: DataFrame with a 'text' column. When omitted, the notebook's
            module-level ``df`` is cleaned (the original behaviour).

    Returns:
        None. The dataframe is mutated in place.
    """
    target = df if frame is None else frame
    target.dropna(subset=['text'], inplace=True) #drop missing values
    target.drop_duplicates(subset=['text'], inplace=True) #drop duplicates
    # Raw string: '\w' / '\s' are regex classes, not Python string escapes.
    target['text'] = (
        target['text']
        .str.lower() #convert all chars to lowercase
        .str.replace(r'[^\w\s]', '', regex=True) #strip punctuation/symbols
    )

# Run the cleaning step on the module-level `df`; it is modified in place and
# the call returns nothing.
caydranisabum()


In [None]:
# Split every cleaned tweet into a list of word tokens with NLTK and keep the
# lists in a new 'tokens' column.
df['tokens'] = df['text'].map(word_tokenize)

In [None]:
# Filter NLTK's English stop words out of each token list.
# A set is used so the per-token membership check is a hash lookup.
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
df['tokens'] = df['tokens'].map(lambda tokens: list(map(stemmer.stem, tokens)))

In [None]:
# Glue each processed token list back into one space-separated string.
df['cleaned_text'] = df['tokens'].map(' '.join)

In [None]:
# The raw text and the token lists are no longer needed; remove both in place.
df.drop(columns=['text', 'tokens'], inplace=True)

# df now holds exactly two columns: 'sentiment' and 'cleaned_text'

In [None]:
# Optional checkpoint: write the preprocessed dataframe to disk.
df.to_csv(path_or_buf='text_save_1.csv')

In [None]:
# Take the two dataframe columns out as plain arrays before handing them to
# Keras (the original note: fewer compatibility issues than pandas objects).
texts = df.loc[:, 'cleaned_text'].values
labels = df.loc[:, 'sentiment'].values

print(f'cleaned text {texts}')
print(f'labels {labels}')

Step 2: Set up our dataset & model for training

In [None]:
#make labels into bin, binary classification
# Sentiment values above 2 become 1, everything else becomes 0.
# The labels are recomputed from the dataframe column (not from `labels`
# itself) so re-running this cell is safe: thresholding the already-binary
# array a second time would silently turn every label into 0.
labels = (df['sentiment'].values > 2).astype(int)

print(labels)

In [None]:
# Turn the cleaned texts into integer sequences with a Keras Tokenizer
# (num_words=20000), then pad/truncate every sequence to length 100.
# 'pre' padding and truncation are the pad_sequences defaults, written out here.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=100, padding='pre', truncating='pre')

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
#building model (we are using the keras v2 sequential model with lstm layers)
from keras import layers
from keras_tuner import RandomSearch

def build_model(hp, vocab_size=20000, max_len=100):
    """Build and compile the LSTM binary classifier that Keras Tuner searches over.

    Architecture: Embedding -> LSTM (returning the full sequence) ->
    GlobalMaxPooling1D -> Dense(relu) -> Dense(1, sigmoid).

    Hyperparameters sampled from ``hp``:
        embedding_dim, lstm_units, dense_units: 32..128 in steps of 32.
        learning_rate: one of 1e-3, 1e-4, 1e-5.

    Args:
        hp: Keras Tuner hyperparameter container passed in by the tuner.
        vocab_size: Embedding ``input_dim``. Must match the ``num_words`` the
            Tokenizer was created with (20000 in this notebook).
        max_len: Embedding ``input_length``. Must match the ``maxlen`` used
            when padding the sequences (100 in this notebook).

    Returns:
        A compiled ``keras.Sequential`` model using binary cross-entropy loss
        and the accuracy metric.
    """
    model = keras.Sequential()
    model.add(layers.Embedding(
        input_dim=vocab_size,
        output_dim=hp.Int('embedding_dim', min_value=32, max_value=128, step=32),
        input_length=max_len,
    ))
    # return_sequences=True keeps every timestep so the pooling layer below
    # can take the maximum over the whole sequence.
    model.add(layers.LSTM(
        units=hp.Int('lstm_units', min_value=32, max_value=128, step=32),
        return_sequences=True,
    ))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(
        units=hp.Int('dense_units', min_value=32, max_value=128, step=32),
        activation='relu',
    ))
    model.add(layers.Dense(1, activation='sigmoid'))  #note: sigmoid for bin classification

    #USE LEGACY ADAM OPTIMISER FOR SILICON CHIPS, IF INTEL NO NEED LEGACY
    # NOTE(review): "silicon chips" presumably means Apple Silicon Macs - confirm.
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])  #note: lowered learning rate from prev try
    model.compile(optimizer=keras.optimizers.legacy.Adam(learning_rate),
                  loss='binary_crossentropy',  #note binary not multiclass
                  metrics=['accuracy'])

    return model


Step 3: Training

In [None]:
#keras tuner random search over the hyperparameters declared in build_model
#(every candidate model is trained with the Adam optimiser set up there)
#4 trials of 10 epochs, chose to use 1 execution per trial only because according to previous tests we did, the executions all had identical val loss and accuracy
#it took ~1200 mins to run this!!!
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=4,
    executions_per_trial=1,
    seed=42,  # fixed seed so the sampled hyperparameter combinations are repeatable
    directory='my_dir',
    project_name='sentiment_analysis_tuning_2'
)
# Model selection uses a slice held out from the *training* data (the last 10%).
# The test split must not drive the search: it is what the final evaluation
# reports on, and picking hyperparameters on it would bias that number.
tuner.search(X_train, y_train, epochs=10, validation_split=0.1)

#save the best model as hdf5 file, we chose to put it in the same cell as the training because we had a bug beforehand that caused us to lose all progress due to not being able to save the file after training
best_model = tuner.get_best_models(num_models=1)[0]
best_model.save('best.h5')

Step 4: Evaluating the saved model

In [None]:
# Reload the model written to 'best.h5' and report its loss & accuracy on
# (X_test, y_test); the evaluate() result is shown as the cell output.
model = keras.models.load_model(filepath='best.h5')
model.evaluate(x=X_test, y=y_test)