# NLP Project - A.A. 2024/25

Authors:

- Gigante Davide (11018245)
- Puccia Niccolò (10829496)
- Sichili Giulio (11016179)
- Troiano Alessandro (10776474)

Link to the recording: AAAA

# WORKFLOW
Possiamo provare a classificare le subjects (che sono 3, più semplice), o i topics, che sono di più e tendono a sovrapporsi più spesso. Partiamo dalla prima.

PRIMA PARTE
- Carichiamo i dati
- Manteniamo le colonne che ci servono per la classificazione
- Eseguiamo le operazioni di pulizia del dataset che servono indipendentemente dal modello che useremo per la classificazione




# Multimodal Question Answering: ScienceQA

### Install Dependencies and libraries

In [1]:
!pip install -U datasets
!pip install --upgrade gensim



In [2]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.stats import mode
import re
import string


import nltk
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import plotly.express as px

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Load the ScienceQA dataset; per the cell output below, the result is a
# DatasetDict with train / validation / test splits.
ds = load_dataset("derek-thomas/ScienceQA")

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 12726
    })
    validation: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 4241
    })
    test: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 4241
    })
})

### Suppress Warnings

In [5]:
import warnings
# Ignore every warning from this point on.
# NOTE: this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# 1. Data Preprocessing

### Importing utilities

In [14]:
# Additional NLTK resources, on top of the 'punkt' and
# 'averaged_perceptron_tagger' packages downloaded earlier; presumably needed
# by word_tokenize / pos_tag in recent NLTK releases — confirm against the
# installed NLTK version.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True


### Cleaning utilities
The punctuation regex, stop-word set, lemmatizer and POS map are built once at module scope, so they are not recreated on every call.



In [16]:
# Shared resources for text normalisation, each built once at module scope.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = set(stopwords.words("english"))
# Character class matching any single character of string.punctuation.
_PUNCT_RE = re.compile("[" + re.escape(string.punctuation) + "]")
# First letter of a POS tag -> WordNet POS constant.
_POS_MAP = {
    "J": wordnet.ADJ,
    "V": wordnet.VERB,
    "N": wordnet.NOUN,
    "R": wordnet.ADV,
}

def _wn_pos(tag: str):
    """Map a POS tag to the WordNet POS constant used for lemmatisation.

    Only the first character of ``tag`` is inspected: it is looked up in
    ``_POS_MAP`` and anything not listed there falls back to ``wordnet.NOUN``.
    """
    initial = tag[0]
    if initial in _POS_MAP:
        return _POS_MAP[initial]
    return wordnet.NOUN

def normalize_text(text: str) -> list[str]:
    """Turn raw text into a list of lowercase, lemmatised content words.

    Steps: lowercase the text, replace punctuation with spaces, tokenise,
    keep only alphabetic tokens that are not stop words, POS-tag them and
    lemmatise each token with the WordNet POS derived from its tag.

    Args:
        text: Raw input string. ``None`` or an empty string yields ``[]``.

    Returns:
        The lemmas of the retained tokens, in their original order.
    """
    if not text:
        return []
    # Replace punctuation with a space instead of deleting it: deleting glues
    # together words that are only separated by punctuation
    # ("solid/liquid" -> "solidliquid") and turns contractions into junk
    # tokens ("can't" -> "cant") that slip past the stop-word filter.
    clean = _PUNCT_RE.sub(" ", text.lower())
    tokens = [t for t in word_tokenize(clean) if t.isalpha() and t not in _STOP_WORDS]
    if not tokens:
        # Nothing left to tag or lemmatise.
        return []
    tagged = pos_tag(tokens)
    return [_LEMMATIZER.lemmatize(w, _wn_pos(p)) for w, p in tagged]

# Batched function for Hugging Face `map`
def process_batch(batch):
    """Normalise the text columns of one batch.

    ``batch["question"]`` is iterated as a sequence of texts and
    ``batch["choices"]`` as a sequence of text sequences. Both entries are
    replaced in ``batch`` by their ``normalize_text`` token lists, and the
    same ``batch`` object is returned.
    """
    batch["question"] = list(map(normalize_text, batch["question"]))

    normalized_choices = []
    for options in batch["choices"]:
        normalized_choices.append([normalize_text(option) for option in options])
    batch["choices"] = normalized_choices

    return batch

### Take only the columns needed for classification

In [26]:
# Keep only the model inputs (question, choices) and the candidate targets
# (subject, topic).
columns_X = ["question", "choices"]
possible_y = ["subject", "topic"]
columns_needed = columns_X + possible_y
print(columns_needed)

# Every other feature of the train split is dropped; the list is computed once
# and reused for both datasets.
columns_to_drop = [col for col in ds["train"].features if col not in columns_needed]

# Apply the normalisation function batch by batch, discarding unused columns.
ds_lemmatized = ds.map(process_batch, batched=True, remove_columns=columns_to_drop)

# Also remove the unused columns from the original (un-normalised) dataset.
ds = ds.remove_columns(columns_to_drop)

['question', 'choices', 'subject', 'topic']


## Word2Vec embedding if needed by the model

In [None]:
from gensim.models import KeyedVectors

# Load previously saved word vectors from "embeddings.kv" (a path relative to
# the current working directory).
embeddings = KeyedVectors.load("embeddings.kv")
# Print the shape of the embedding matrix; presumably
# (vocabulary size, vector dimension) — confirm against the saved model.
print(embeddings.vectors.shape)