<a href="https://colab.research.google.com/github/chenoa23/NLP/blob/main/AI_NLP_Week_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Overview

This notebook combines several machine learning and deep learning tasks, using both pre-trained models and custom-built neural networks. It first uses the LLaMA 3.2 (3B) model from Hugging Face to generate text from user prompts, and the LLaMA 3.2 Vision model (11B) to describe an image when given a picture and a question. It also includes two custom models built with TensorFlow: one predicts whether a wine is good based on its features using the Wine Quality dataset, and the other performs sentiment analysis on IMDB movie reviews to tell whether a review is positive or negative. The notebook includes data cleaning, training, evaluation, and visualizations for each of these models. Overall, it shows how different types of data—text, images, and tables—can be handled using AI.

In [None]:
from getpass import getpass
# Ask for the token interactively so it is never written into the notebook source.
HUGGINGFACE_TOKEN = getpass("Enter your Hugging Face token:")

In [None]:
!pip install "transformers>=4.45.0"
!pip install pillow  # For handling images with the Vision model

In [None]:
from transformers import pipeline
import torch

model_id = "meta-llama/Llama-3.2-3B-Instruct"
# Text-generation pipeline in bfloat16, with device placement chosen automatically.
# The token collected at the top of the notebook is passed explicitly so the
# download authenticates without depending on a separate `huggingface-cli login`.
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HUGGINGFACE_TOKEN,
)

In [None]:
# Single-turn chat prompt in role/content message format.
messages = [{"role": "user", "content": "Tell me about the history of Miami Dade College and who the current President is"}]
# Generate a reply, capped at 150 newly generated tokens.
outputs = pipe(messages, max_new_tokens=150)
# Raw `generated_text` of the first result, printed without any formatting.
response = outputs[0]["generated_text"]
print(response)

In [None]:
import textwrap

# Pull out the reply text: when `generated_text` is a list, use the "content"
# of its last element; otherwise use the value as-is.
generated = outputs[0]["generated_text"]
if isinstance(generated, list):
    response_content = generated[-1]["content"]
else:
    response_content = generated

# Maximum line width for the wrapped output
wrap_width = 70
wrapper = textwrap.TextWrapper(width=wrap_width)

# Echo the prompt, then the reply wrapped line by line so original line
# breaks are preserved.
print("User:", messages[0]["content"])
print("\nAssistant:\n")
for line in response_content.split("\n"):
    print(wrapper.fill(line))

In [None]:
# Second single-turn prompt, sent through the same pipeline.
messages = [{"role": "user", "content": "Tell me about the mating habits of the African Honeybee"}]
# Generate a reply, capped at 150 newly generated tokens.
outputs = pipe(messages, max_new_tokens=150)
# Raw `generated_text` of the first result, printed without any formatting.
response = outputs[0]["generated_text"]
print(response)

In [None]:
import textwrap

# Pull out the reply text: when `generated_text` is a list, use the "content"
# of its last element; otherwise use the value as-is.
generated = outputs[0]["generated_text"]
if isinstance(generated, list):
    response_content = generated[-1]["content"]
else:
    response_content = generated

# Maximum line width for the wrapped output
wrap_width = 70
wrapper = textwrap.TextWrapper(width=wrap_width)

# Echo the prompt, then the reply wrapped line by line so original line
# breaks are preserved.
print("User:", messages[0]["content"])
print("\nAssistant:\n")
for line in response_content.split("\n"):
    print(wrapper.fill(line))

In [None]:
import requests
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The weights were already loaded with the user's token; the processor is
# fetched from the same repository, so it is given the token as well.
# `token=` is the current name of the deprecated `use_auth_token=` argument.
processor = AutoProcessor.from_pretrained(vision_model_id, token=HUGGINGFACE_TOKEN)
# device_map="auto" mirrors the text pipeline above and lets the model be
# placed on an accelerator; later cells move inputs to `model.device`.
model = MllamaForConditionalGeneration.from_pretrained(
    vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HUGGINGFACE_TOKEN,
)

In [None]:
from io import BytesIO

# Example image URL
url = "https://miro.medium.com/v2/resize:fit:2400/1*nzdYUSs4c2RQs2W0FCHv1g.jpeg"
# Fail fast on a hung connection or an HTTP error instead of handing PIL an
# error page; the fully downloaded body is then opened from memory.
image_response = requests.get(url, timeout=30)
image_response.raise_for_status()
image = Image.open(BytesIO(image_response.content))

# Prepare the image input with a text query
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Can you describe this image?"}
    ]}
]

# Render the chat messages into the prompt string, then tokenize the prompt
# together with the image and move the tensors to the model's device.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
# Generate at most 70 new tokens and print only the part after the prompt.
output = model.generate(**inputs, max_new_tokens=70)
print(processor.decode(output[0][inputs["input_ids"].shape[-1]:]))

In [None]:
import textwrap

# Decode only the newly generated tokens (everything after the prompt) and
# drop special tokens so control markers do not appear in the formatted text.
decoded_output = processor.decode(
    output[0][inputs["input_ids"].shape[-1]:],
    skip_special_tokens=True,
)

# Define the wrap width
wrap_width = 70

# Print formatted output with text wrapping, one original line at a time
print("\nFormatted Output:\n")
for line in decoded_output.split("\n"):
    print(textwrap.fill(line, width=wrap_width))

In [None]:
import numpy as np
import pandas as pd


# Load the wine quality data straight from the hosted CSV.
df = pd.read_csv('https://raw.githubusercontent.com/fenago/datasets/refs/heads/main/winequalityN.csv')
# Random 5-row sample as a quick sanity check of the load.
df.sample(5)

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Drop every row that contains a missing value, then confirm that no nulls remain.
df = df.dropna()
df.isnull().sum()

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Encode the wine type as a binary indicator (1 = 'white', 0 = anything else)
# and remove the original string column. The guard keeps the cell safe to
# re-run: after the first run the 'type' column no longer exists.
if 'type' in df.columns:
    df = (
        df
        .assign(is_white_wine=(df['type'] == 'white').astype(int))
        .drop(columns='type')
    )
df.head()

In [None]:
# Random 5-row sample to spot-check the frame after encoding the wine type.
df.sample(5)

In [None]:
# Binarise the target: a wine counts as "good" when its quality score is 6 or
# higher. The raw 'quality' column is then removed so it cannot leak into the
# features. The guard keeps the cell safe to re-run once 'quality' is gone.
if 'quality' in df.columns:
    df = (
        df
        .assign(is_good_wine=(df['quality'] >= 6).astype(int))
        .drop(columns='quality')
    )

df.head()

In [None]:
from sklearn.model_selection import train_test_split


# Target is the binary quality flag; every other column is a feature.
y = df['is_good_wine']
X = df.drop(columns='is_good_wine')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler


# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import tensorflow as tf
tf.random.set_seed(42)

# Fully connected binary classifier: three ReLU hidden layers
# (128 -> 256 -> 256) followed by a single sigmoid output unit.
hidden_layers = [
    tf.keras.layers.Dense(units, activation='relu')
    for units in (128, 256, 256)
]
model = tf.keras.Sequential(
    hidden_layers + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

# Binary cross-entropy loss with Adam; accuracy, precision and recall are
# tracked per epoch under the names used later for plotting.
model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.03),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)

# Train on the scaled training features for 100 epochs.
history = model.fit(X_train_scaled, y_train, epochs=100)


In [None]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Notebook-wide plot defaults: wide figures, no top/right axis spines.
rcParams.update({
    'figure.figsize': (18, 8),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
# Plot every tracked training metric against the epoch number. The x-axis is
# derived from the recorded history, so the plot stays valid if the number of
# training epochs is changed.
epochs = np.arange(1, len(history.history['loss']) + 1)
for metric_key, metric_label in (
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
):
    plt.plot(epochs, history.history[metric_key], label=metric_label)
plt.title('Evaluation metrics', size=20)
plt.xlabel('Epoch', size=14)
plt.legend();

In [None]:
# Model outputs (sigmoid probabilities) for the scaled test features.
predictions = model.predict(X_test_scaled)

In [None]:
# Threshold the flattened probabilities at 0.5: strictly above -> 1, else 0.
prediction_classes = [int(prob > 0.5) for prob in np.ravel(predictions)]

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true test labels against the thresholded predictions.
print(confusion_matrix(y_test, prediction_classes))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Score the thresholded predictions against the true test labels, then
# report each metric rounded to two decimals.
test_accuracy_score = accuracy_score(y_test, prediction_classes)
test_precision_score = precision_score(y_test, prediction_classes)
test_recall_score = recall_score(y_test, prediction_classes)

print(f'Accuracy: {test_accuracy_score:.2f}')
print(f'Precision: {test_precision_score:.2f}')
print(f'Recall: {test_recall_score:.2f}')

In [None]:
import numpy as np
import pandas as pd

# Load the dataset
# NOTE(review): absolute path under /content — presumably the CSV is uploaded
# to the Colab session beforehand; confirm and document where the file comes from.
df = pd.read_csv('/content/IMDB Dataset.csv')
# Random 5-row sample as a quick sanity check of the load.
df.sample(5)


In [None]:
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to clean text
def clean_text(text):
    """Normalise a raw review into lowercase text of letters and single spaces.

    Args:
        text: The raw review string, possibly containing HTML tags.

    Returns:
        The cleaned string: HTML tags replaced by a space, every character
        that is neither an ASCII letter nor whitespace removed, lowercased,
        and with whitespace runs collapsed to one space (no leading or
        trailing whitespace).
    """
    # Replace tags with a space rather than nothing, so the words on either
    # side of a tag such as "<br />" are not fused into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabet characters
    # split()/join collapses the extra spaces introduced above and strips the ends.
    return ' '.join(text.lower().split())

# Clean the reviews (overwrites the raw text column with the cleaned text)
df['review'] = df['review'].map(clean_text)

# Tokenization: vocabulary size is capped via num_words=10000 and words
# outside it are mapped to the '<OOV>' token.
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split performed in a later cell.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])

# Padding: bring every review to a fixed length of 200 token ids.
padded_sequences = pad_sequences(sequences, maxlen=200)


In [None]:
# Convert sentiment labels to binary (1 = 'positive', 0 = anything else).
# Guarded so that re-running the cell is harmless: once the column is numeric,
# comparing it against 'positive' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = (df['sentiment'] == 'positive').astype(int)

# Splitting the data into features (X) and labels (y)
X = padded_sequences
y = df['sentiment'].values


In [None]:
# Preview the frame after cleaning the reviews and encoding the labels.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import tensorflow as tf

# Binary sentiment classifier: 16-dimensional embeddings over the 10000-id
# vocabulary, two stacked LSTM layers, then a small dense head with a sigmoid
# output.
model = tf.keras.Sequential([
    # `input_length` is deprecated in Keras 3 and is not needed here: the
    # sequence length is taken from the padded input when the model is fitted.
    tf.keras.layers.Embedding(10000, 16),
    # return_sequences=True so the second LSTM receives the full sequence.
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train for 10 epochs, holding back 20% of the training data for validation.
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)


In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()


In [None]:
# Score the trained model on the held-out test split; the model was compiled
# with a single 'accuracy' metric, so evaluate yields (loss, accuracy).
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')


In [None]:
sample_reviews = [
    "I absolutely loved this movie! The plot was thrilling and the characters were so well developed.",
    "The film was a disaster. Poor acting and a predictable storyline."
]

# Run the samples through the same cleaning step as the training reviews so
# the tokenizer sees text in the form it was fitted on.
sample_cleaned = [clean_text(review) for review in sample_reviews]
sample_sequences = tokenizer.texts_to_sequences(sample_cleaned)
sample_padded = pad_sequences(sample_sequences, maxlen=200)

predictions = model.predict(sample_padded)
# Flatten the predictions so each probability is compared as a scalar rather
# than as a one-element row; above 0.5 is reported as positive.
print(["Positive" if prob > 0.5 else "Negative" for prob in np.ravel(predictions)])
