<a href="https://colab.research.google.com/github/cjflanagan/cs68/blob/master/tech68_day2_prepared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Concepts exploration

## TfidfVectorizer

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Example text data
documents = [
    "Hello, how are you?",
    "Winning isn't everything; it's the only thing.",
    "Today is a beautiful day."
]

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the documents
count_matrix = count_vectorizer.fit_transform(documents)

# Convert matrix to dataframe and print results
count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
print("CountVectorizer output as DataFrame:")
print(count_df)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert matrix to dataframe and print results
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("\nTfidfVectorizer output as DataFrame:")
print(tfidf_df)


CountVectorizer output as DataFrame:
   are  beautiful  day  everything  hello  how  is  isn  it  only  the  thing  \
0    1          0    0           0      1    1   0    0   0     0    0      0   
1    0          0    0           1      0    0   0    1   1     1    1      1   
2    0          1    1           0      0    0   1    0   0     0    0      0   

   today  winning  you  
0      0        0    1  
1      0        1    0  
2      1        0    0  

TfidfVectorizer output as DataFrame:
   are  beautiful  day  everything  hello  how   is       isn        it  \
0  0.5        0.0  0.0    0.000000    0.5  0.5  0.0  0.000000  0.000000   
1  0.0        0.0  0.0    0.377964    0.0  0.0  0.0  0.377964  0.377964   
2  0.0        0.5  0.5    0.000000    0.0  0.0  0.5  0.000000  0.000000   

       only       the     thing  today   winning  you  
0  0.000000  0.000000  0.000000    0.0  0.000000  0.5  
1  0.377964  0.377964  0.377964    0.0  0.377964  0.0  
2  0.000000  0.000000  0.000000

In [18]:
count_df

Unnamed: 0,are,beautiful,day,everything,hello,how,is,isn,it,only,the,thing,today,winning,you
0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,1,1,1,1,1,0,1,0
2,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0


In [19]:
tfidf_df

Unnamed: 0,are,beautiful,day,everything,hello,how,is,isn,it,only,the,thing,today,winning,you
0,0.5,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
1,0.0,0.0,0.0,0.377964,0.0,0.0,0.0,0.377964,0.377964,0.377964,0.377964,0.377964,0.0,0.377964,0.0
2,0.0,0.5,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0


# The world before LLM's

## Sentiment Classifier


In [20]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn import metrics

# Load the data
data = pd.read_csv('https://raw.githubusercontent.com/cjflanagan/cs68/master/stock_data_nlp.csv')
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [21]:

# Split the data into training, validation, and testing sets
# Here, 60% of the data is used as the training set, 20% as the validation set, and 20% as the test set
X_train, X_temp, y_train, y_temp = train_test_split(data['Text'], data['Sentiment'], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [22]:
X_train.head()

Unnamed: 0,Text
1088,goog short 738. Think they will sell into the ...
3167,AMI taking more off 820
4353,BBY <<< Is this device the next AAP - ave revi...
112,ZCS user There's your buying
2311,ike it or Not AAP will fill the gap B4 will s...


In [23]:
y_train.head()

Unnamed: 0,Sentiment
1088,0
3167,1
4353,1
112,1
2311,0


In [24]:
X_val.head()

Unnamed: 0,Text
30,AAP VOME today is impressive. At this rate and...
3263,NSPH peeled more off 2.19
5234,"Well-wishes: Mr. Hotze��������s caps, hot-sell..."
2820,CS - To those that doubted me; today is you're...
1313,TPX - easy short on next move higher to 50 - j...


In [25]:
y_val.head()

Unnamed: 0,Sentiment
30,1
3263,1
5234,0
2820,0
1313,0


In [26]:

# Create a pipeline that includes TF-IDF vectorization followed by logistic regression
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    RandomForestClassifier()
)

# Train the model on the training data
pipeline.fit(X_train, y_train)


In [27]:
pipeline.predict_proba(["TSLA IS A LONG"])

array([[0., 1.]])

In [28]:

# Predict the sentiment on the validation and test data
y_val_pred = pipeline.predict(X_val)
y_test_pred = pipeline.predict(X_test)

# Evaluate the model's performance on the validation set
val_report = classification_report(y_val, y_val_pred)
print('Validation Classification Report:')
print(val_report)

fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred)
print('AUC: ', metrics.auc(fpr, tpr))

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       422
           1       0.79      0.86      0.83       736

    accuracy                           0.77      1158
   macro avg       0.75      0.73      0.74      1158
weighted avg       0.76      0.77      0.76      1158

AUC:  0.7340240572841541


In [29]:
X_val.head()

Unnamed: 0,Text
30,AAP VOME today is impressive. At this rate and...
3263,NSPH peeled more off 2.19
5234,"Well-wishes: Mr. Hotze��������s caps, hot-sell..."
2820,CS - To those that doubted me; today is you're...
1313,TPX - easy short on next move higher to 50 - j...


In [30]:
y_test_pred[0:5]

array([1, 1, 1, 1, 0])

## Movies Recommender

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the dataset
movies_data = pd.read_csv('https://raw.githubusercontent.com/cjflanagan/cs68/master/movies_metadata.csv')

# Filter out rows where the overview is missing
movies_data = movies_data.dropna(subset=['overview'])
movies_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [32]:

# Text preprocessing function to clean the overview text
def preprocess_text(text):
    """ Convert text to lowercase and strip spaces """
    return text.lower().strip()

# Apply preprocessing to the 'overview' column
movies_data['processed_overview'] = movies_data['overview'].apply(preprocess_text)

# Generating embeddings using TfidfVectorizer
# This converts text to a matrix of TF-IDF features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
matrix = vectorizer.fit_transform(movies_data['processed_overview'])
matrix_df = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Function to recommend movies based on cosine similarity of TF-IDF embeddings
def recommend_movies(movie_title, num_recommendations=5):
    """ Recommend movies based on the cosine similarity of movie overviews """
    # Check if the movie title is in our dataset
    if movie_title not in movies_data['original_title'].values:
        return "Movie not found in the dataset."

    # Find the index of the movie that matches the title
    idx = movies_data.index[movies_data['original_title'] == movie_title].tolist()[0]

    # Compute the cosine similarity matrix for the selected movie against all others
    cosine_similarities = linear_kernel(matrix[idx:idx+1], matrix)

    # Get the scores of all movies with the given movie
    similarity_scores = list(enumerate(cosine_similarities[0]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N most similar movies
    similar_movies = similarity_scores[1:num_recommendations+1]

    # Get the movie titles based on the indices of the top similar movies
    movie_indices = [i[0] for i in similar_movies]
    recommended_movies = movies_data['original_title'].iloc[movie_indices].tolist()

    return recommended_movies

# Example usage
recommended_movies = recommend_movies("Toy Story", num_recommendations=5)
print("Recommended Movies:", recommended_movies)


Recommended Movies: ['Toy Story 3', 'Toy Story 2', 'The 40 Year Old Virgin', 'The Champ', "Andy Hardy's Blonde Trouble"]


# The world after LLM's

### Calling OpenAI

In [33]:
!pip install openai



In [34]:
from openai import OpenAI
from google.colab import userdata
openai_api_key = userdata.get('open_ai_key')

client = OpenAI(api_key=openai_api_key)

completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a movie recommender"},
    {"role": "user", "content": "Recommend other movies I would like if I liked Toy Story. Return a list"}
  ]
)

res = completion.choices[0].message
print(res.content)

If you liked Toy Story, you might enjoy these movies:

1. Finding Nemo
2. Monsters, Inc.
3. The Incredibles
4. Ratatouille
5. WALL-E
6. Up
7. Inside Out
8. Coco
9. Toy Story 2
10. Toy Story 3

These movies are all from Pixar, known for their heartwarming and visually stunning animated films that appeal to both children and adults. Enjoy!


### Embeddings

In [35]:
!pip install -U sentence-transformers



In [36]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can use other models too

# Define the sentence to embed
sentence = "Stanford"

# Generate the embedding
embedding = model.encode(sentence)

# Print the embedding vector
print("Embedding for the input sentence:")
print(embedding)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding for the input sentence:
[ 2.47305725e-02 -9.62590128e-02  5.07580899e-02  1.49931423e-02
  2.61617340e-02  1.53946551e-02 -2.30589672e-03  2.58221403e-02
  1.60300564e-02  4.25988846e-02  4.48090164e-03 -1.33210877e-02
 -2.01715273e-03 -5.82050183e-04 -7.44248629e-02  3.25593129e-02
 -2.17369013e-02 -6.13218546e-02  2.80339178e-02 -1.48286432e-01
 -4.96502779e-02  4.27035391e-02 -2.75227008e-04 -2.21115146e-02
  6.24964647e-02  3.77720147e-02  2.79503944e-03 -8.65437742e-03
 -1.73164569e-02 -6.08642511e-02 -2.38454249e-02 -2.78559551e-02
  8.17170665e-02  1.30890440e-02 -5.87220956e-03 -3.96352373e-02
  3.32098715e-02 -2.03353725e-02  7.57966042e-02  5.17529808e-02
 -6.29257336e-02  4.33194498e-03  9.75579321e-02  4.70113568e-02
 -2.24213898e-02  8.31764750e-03  3.61824594e-02 -6.73840865e-02
  7.79713616e-02  4.63298485e-02 -6.52237236e-02 -2.28576232e-02
 -9.80273560e-02  1.51841370e-02  1.91797502e-03  1.10040300e-01
  6.21325858e-02  4.66217175e-02 -5.92488460e-02  1.7053

## Sentiment

In [37]:
import json

In [43]:
def classify_tweet(message):
    """
    Send a message to the OpenAI GPT-3.5 model and return its response.

    This function interacts with the OpenAI API, specifically using the GPT-3.5-turbo model. It takes a user's message as input, sends it to the model, and returns the model's text-only response. The function ensures the AI's output is concise by providing a system-level instruction.

    Parameters:
    message (str): A string containing the user's message to the AI.

    Returns:
    str: The text response generated by the GPT-3.5 model.
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo", #"gpt-4-turbo-preview",
        response_format={ "type": "json_object" },
        logprobs=True,
        messages=[
            {"role": "system", "content": """
            You are a helpful assistant doing classifations.

            Below are examples of text messages and their classifications. After studying these examples, please classify the new text message at the end.

              Example 1:
              Tweet: "Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ trade, market. bullish!"
              Sentiment: 1

              Example 2:
              Tweet: "user: AAP MOVIE. 55% return for the FEA/GEED indicator just awesome."
              Sentiment: 1

              Example 3:
              Tweet: "In this economy, I am just not sure if investing is a good idea right now."
              Sentiment: 0

              Example 4:
              Tweet: "The new product launch was a flop. Had higher expectations from this company."
              Sentiment: 0

              """},
            {"role": "user", "content": f"Classify the following message as 1 or 0 and return JSON with the probability of sentiment being 1 in a key called sentiment_prob: {message}"}
        ]
    )
    text_only = response.choices[0].message.content
    print(message, text_only)
    json_object = json.loads(text_only)
    # Ensure sentiment_prob is a float
    classification = float(json_object["sentiment_prob"])
    return classification

In [44]:
classify_tweet("TSLA IS A LONG!")

TSLA IS A LONG! {
    "classification": 1,
    "sentiment_prob": 0.85
}


0.85

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import openai
from google.colab import userdata

# Set your OpenAI API key
openai_api_key = userdata.get('open_ai_key')

# Load the data
data = pd.read_csv('https://raw.githubusercontent.com/cjflanagan/cs68/master/stock_data_nlp.csv')
data = data.sample(100)

# Classify tweets in the test set
data['Predicted_Probability'] = data['Text'].apply(classify_tweet)

# Compute the AUC score
auc_score = roc_auc_score(data['Sentiment'], data['Predicted_Probability'])
print(f"AUC Score: {auc_score}")



ove GNW!! 9-13 Calls are making me feel better about myself for not dumping AAP sooner! {
    "sentiment": 1,
    "sentiment_prob": 0.85
}
ES,SPY  You wont here this on CNBC,,,real time assessment of jobs number mkt reactions...   {
    "sentiment_prob": 1
}
NAN like the daily and intraday setup, will try over 18.80 {
    "classification": 1,
    "sentiment_prob": 0.75
}
user: AAP back to 450 i guess for a pin {
    "sentiment_prob": 1
}
DDD SSY Stops honored.  So be it. Now.... must.not.become.bearshitter. {
    "sentiment_prob": 0.2
}
AMD Chooo Choo pop to bankruptcy? {
    "sentiment": 0,
    "sentiment_prob": 0.15
}
GEOY Should be trading at: 41.09 as each share worth 1.425 shares of DGI (28.84) x 1.425 = 41.09: {
    "classification": 1,
    "sentiment_prob": 0.8
}
JCP Cash-flow distress That doesn't sound very good... BS downgrade to sell {
  "classification": 0,
  "sentiment_prob": 0.15
}
AAP and GOOG should respond positively to good jobless claims numbers {
    "classification

In [46]:
# Convert probabilities to labels for the classification report
data['Predicted_Sentiment'] = data['Predicted_Probability'].apply(lambda x: 1 if x > 0.5 else 0)

# Generate classification report
report = classification_report(data['Sentiment'], data['Predicted_Sentiment'])
print(report)


              precision    recall  f1-score   support

           0       0.85      0.50      0.63        44
           1       0.70      0.93      0.80        56

    accuracy                           0.74       100
   macro avg       0.77      0.71      0.71       100
weighted avg       0.77      0.74      0.72       100



In [47]:
print(f"AUC Score: {auc_score}")


AUC Score: 0.716112012987013


## Recommenders

In [48]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
movies_data = pd.read_csv('https://raw.githubusercontent.com/cjflanagan/cs68/master/movies_metadata.csv')

# Filter out rows where the overview is missing
movies_data = movies_data.dropna(subset=['overview'])

# Text preprocessing function to clean the overview text
def preprocess_text(text):
    """ Convert text to lowercase and strip spaces """
    return text.lower().strip()

# Apply preprocessing to the 'overview' column
movies_data['processed_overview'] = movies_data['overview'].apply(preprocess_text)

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generating embeddings using SentenceTransformer
embeddings = model.encode(movies_data['processed_overview'].tolist(), show_progress_bar=True)

# Function to recommend movies based on cosine similarity of embeddings
def recommend_movies(movie_title, num_recommendations=5):
    """ Recommend movies based on the cosine similarity of movie overviews """
    # Check if the movie title is in our dataset
    if movie_title not in movies_data['original_title'].values:
        return "Movie not found in the dataset."

    # Find the index of the movie that matches the title
    idx = movies_data.index[movies_data['original_title'] == movie_title].tolist()[0]

    # Compute the cosine similarity matrix for the selected movie against all others
    cosine_similarities = cosine_similarity([embeddings[idx]], embeddings)

    # Get the scores of all movies with the given movie
    similarity_scores = list(enumerate(cosine_similarities[0]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N most similar movies
    similar_movies = similarity_scores[1:num_recommendations+1]

    # Get the movie titles based on the indices of the top similar movies
    movie_indices = [i[0] for i in similar_movies]
    recommended_movies = movies_data['original_title'].iloc[movie_indices].tolist()

    return recommended_movies

# Example usage
recommended_movies = recommend_movies("Toy Story", num_recommendations=5)
print("Recommended Movies:", recommended_movies)


Batches:   0%|          | 0/969 [00:00<?, ?it/s]

Recommended Movies: ['Toy Story 3', 'Toy Story 2', "Child's Play 3", 'Firestarter', 'Life Begins for Andy Hardy']


# Neural networks

In [49]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Load data
data = pd.read_csv('https://raw.githubusercontent.com/cjflanagan/cs68/master/stock_data_nlp.csv')
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [50]:
# Data preprocessing
vocab_size = 10000  # Size of the vocabulary to use
# Choosing a vocab size of 10,000 covers most of the commonly used words in the dataset, providing a balance
# between model complexity and computational efficiency. It helps in capturing sufficient detail in the text data.

max_length = 50  # Maximum length of input sequences
# The maximum length is set to 50 words, ensuring that most of the texts are fully included without much truncation.
# This length is chosen based on typical sentence lengths in the data, optimizing the balance between information
# retention and computational efficiency.

# Tokenizer configuration
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(data['Text'])  # This builds the word index
sequences = tokenizer.texts_to_sequences(data['Text'])  # Converts strings in 'Text' to lists of integers.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Split the data into training and testing sets (80/20 split)
split = int(len(padded_sequences) * 0.8)
train_sequences, test_sequences = padded_sequences[:split], padded_sequences[split:]
train_labels, test_labels = data['Sentiment'][:split], data['Sentiment'][split:]

# Model creation
model = Sequential([
    # Embedding layer: maps each word to a fixed-size vector of embeddings.
    # 'vocab_size': the size of the vocabulary in the text data.
    # '16': dimensionality of the embedding vectors.
    # 'input_length': the length of input sequences that the network expects.
    Embedding(vocab_size, 16, input_length=max_length),

    # GlobalAveragePooling1D: reduces each sequence of embeddings to a single average vector.
    # This simplifies the model by reducing the total number of parameters.
    GlobalAveragePooling1D(),

    # Dense layer: a fully connected layer that learns non-linear combinations of the high-level features
    # extracted by the network earlier.
    # '24' units: the number of neurons in this layer.
    # 'activation': the activation function to use (ReLU in this case).
    Dense(24, activation='relu'),

    # Output layer: a single neuron with a sigmoid activation.
    # Outputs a value between 0 and 1 that represents a probability.
    Dense(1, activation='sigmoid')
])

# Compile the model with binary crossentropy loss and the Adam optimizer.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_sequences, train_labels, epochs=10, validation_data=(test_sequences, test_labels))

# Evaluate the model on the test data
loss, accuracy = model.evaluate(test_sequences, test_labels)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

# Generate predictions for the test data
predictions = model.predict(test_sequences)


# Calculate AUC score
auc_score = roc_auc_score(test_labels, predictions)
print(f"AUC Score: {auc_score}")

# Convert probabilities to binary predictions for classification report
predictions = [1 if p > 0.8 else 0 for p in predictions]

# Print classification report
print(classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))

# The classification report provides detailed performance metrics for each class, including:
# - Precision: proportion of positive identifications that were actually correct.
# - Recall: proportion of actual positives that were identified correctly.
# - F1-score: a weighted average of precision and recall.
# - Support: the number of actual occurrences of the class in the specified dataset.


Epoch 1/10




[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.6686 - loss: 0.6480 - val_accuracy: 0.4754 - val_loss: 0.7622
Epoch 2/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6777 - loss: 0.6270 - val_accuracy: 0.4754 - val_loss: 0.7627
Epoch 3/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6781 - loss: 0.6186 - val_accuracy: 0.4754 - val_loss: 0.7674
Epoch 4/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6783 - loss: 0.5987 - val_accuracy: 0.4763 - val_loss: 0.7460
Epoch 5/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6882 - loss: 0.5515 - val_accuracy: 0.4892 - val_loss: 0.7347
Epoch 6/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7740 - loss: 0.4686 - val_accuracy: 0.5272 - val_loss: 0.7090
Epoch 7/10
[1m145/145[0m [32m━━━━━━