## Imports

In [86]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


### Helper functions and constants

In [87]:
STOPWORDS = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]


def readCSV(filename):
    """Reads a CSV file and returns a pandas DataFrame"""

    try:
        df = pd.read_csv(filename)
        return df

    except FileNotFoundError:
        print("File not found!")

## Read data

In [88]:
fn = "training_data/finDataset.csv"
finDf = readCSV(fn)

In [89]:
finDf

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


## Preprocessing Data
Before feeding the data into a neural network, it often needs to be preprocessed:

- Clean the Text: Remove unnecessary characters, convert to lowercase, and possibly tokenize the sentences.
- Split the Data: Divide your data into training and test sets.
- Encode Labels: If your sentiment labels are textual (like 'positive', 'negative'), encode them into numerical format.
- Vectorize Data: Transform each text into a sequence of integers corresponding to the indices of individual tokens in a dictionary.

In [90]:
def removeStopwords(sentence):
    """Removes stopwords from a sentence"""
    splitSentence = sentence.lower().split()
    splitSentence = [word for word in splitSentence if word not in STOPWORDS]
    processedSentence = " ".join(splitSentence)

    return processedSentence

def cleanData(df):
    """Cleans the data by removing stopwords and NaN values"""

    df.dropna(inplace=True)
    df["Sentence"] = df["Sentence"].apply(removeStopwords)

    return df

def splitData(df):
    """Splits the data into training and testing data"""

    trainData, testData, trainLabels, testLabels = train_test_split(df["Sentence"], df["Sentiment"], test_size=0.2, stratify=df["Sentiment"])
    
    return trainData, testData, trainLabels, testLabels

def vectorizeData(trainData, testData):
    """Tokenizes the data"""
    
    vectorizer = TfidfVectorizer()
    
    trainData_vectorized = vectorizer.fit_transform(trainData)
    testData_vectorized = vectorizer.transform(testData)
    
    return trainData_vectorized, testData_vectorized, len(vectorizer.get_feature_names_out())
    

In [91]:
finDf = cleanData(finDf)
trainData, testData, trainLabels, testLabels = splitData(finDf)
trainData_vectorized, testData_vectorized, INPUT_DIM = vectorizeData(trainData, testData)

## Define the Neural Network Model
For sentiment analysis, a common approach is to use a recurrent neural network (RNN) with Long Short-Term Memory (LSTM) or a Convolutional Neural Network (CNN). Frameworks like TensorFlow or PyTorch can be used for this.

## Compile the Model
Choose an optimizer (like Adam), a loss function (like binary crossentropy for binary classification), and metrics (like accuracy).

In [92]:
def create_lstm_model(input_dim, output_dim, max_sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=max_sequence_length))
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50))
    model.add(Dropout(0.2))
    
    # (positive, neutral, negative) - 3 neurons
    model.add(Dense(units=3, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_cnn_model(input_dim, output_dim, max_sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=max_sequence_length))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(units=10, activation='relu'))
    model.add(Dropout(0.2))
    
    # (positive, neutral, negative) - 3 neurons
    model.add(Dense(units=3, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [93]:
# create LSTM model
model_lstm = create_lstm_model(input_dim=trainData_vectorized.shape[1], output_dim=128, max_sequence_length=trainData_vectorized.shape[1])
model_lstm.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 10160, 128)        1300480   
                                                                 
 lstm_10 (LSTM)              (None, 10160, 50)         35800     
                                                                 
 dropout_14 (Dropout)        (None, 10160, 50)         0         
                                                                 
 lstm_11 (LSTM)              (None, 50)                20200     
                                                                 
 dropout_15 (Dropout)        (None, 50)                0         
                                                                 
 dense_13 (Dense)            (None, 3)                 153       
                                                                 
Total params: 1356633 (5.18 MB)
Trainable params: 1356

In [94]:
# create CNN model
model_cnn = create_cnn_model(input_dim=trainData_vectorized.shape[1], output_dim=128, max_sequence_length=trainData_vectorized.shape[1])
model_cnn.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 10160, 128)        1300480   
                                                                 
 conv1d_4 (Conv1D)           (None, 10156, 128)        82048     
                                                                 
 global_max_pooling1d_4 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_14 (Dense)            (None, 10)                1290      
                                                                 
 dropout_16 (Dropout)        (None, 10)                0         
                                                                 
 dense_15 (Dense)            (None, 3)                 33        
                                                     

## Train the Model
Feed your training data into the model. This process involves tuning hyperparameters like batch size and the number of epochs.

In [95]:
trainData_vectorized = trainData_vectorized.toarray()
testData_vectorized = testData_vectorized.toarray()

# encode labels
encoder = LabelEncoder()
trainLabels_encoded = encoder.fit_transform(trainLabels)
testLabels_encoded = encoder.transform(testLabels)

# one-hot encode labels
trainLabels_oneHot = to_categorical(trainLabels_encoded)
testLabels_oneHot = to_categorical(testLabels_encoded)

In [96]:
EPOCHS = 10
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2

# train LSTM model
lstm_model = create_lstm_model(input_dim=INPUT_DIM, output_dim=128, max_sequence_length=trainData_vectorized.shape[1])

history_lstm = lstm_model.fit(trainData_vectorized, trainLabels_oneHot, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT)

# train CNN model
cnn_model = create_cnn_model(input_dim=INPUT_DIM, output_dim=128, max_sequence_length=trainData_vectorized.shape[1])

history_cnn = cnn_model.fit(trainData_vectorized, trainLabels_oneHot, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT)


Epoch 1/10
 19/117 [===>..........................] - ETA: 15:37 - loss: 1.0233 - accuracy: 0.5115

## Evaluate the Model
Test the model on your test set to see how well it generalizes to new data.

In [None]:
# evaluate LSTM model
test_loss, test_accuracy = lstm_model.evaluate(testData_vectorized, testLabels_oneHot)
print("Test Accuracy:", test_accuracy)

# plot
plt.plot(history_lstm.history['accuracy'])

In [None]:
# evaluate CNN model
test_loss, test_accuracy = cnn_model.evaluate(testData_vectorized, testLabels_oneHot)
print("Test Accuracy:", test_accuracy)

# plot
plt.plot(history_cnn.history['accuracy'])