# Questao 1

In [8]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

print("Carregando o dataset de reviews de filmes...")
reviews_train = load_files("aclImdb/train/", categories=['pos', 'neg'], encoding='utf-8', random_state=42)
X_text_train, y_train = reviews_train.data, reviews_train.target

reviews_test = load_files("aclImdb/test/", categories=['pos', 'neg'], encoding='utf-8', random_state=42)
X_text_test, y_test = reviews_test.data, reviews_test.target

print(f"Número de documentos de treino: {len(X_text_train)}")
print(f"Número de documentos de teste: {len(X_text_test)}")

pipeline = make_pipeline(
    TfidfVectorizer(min_df=5, stop_words="english", ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000)
)

param_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

print("Iniciando GridSearchCV (pode levar algum tempo)...")
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_text_train, y_train)

print(f"\nMelhor pontuação de validação cruzada: {grid.best_score_:.2f}")
print(f"Melhores parâmetros: {grid.best_params_}")

test_score = grid.score(X_text_test, y_test)
print(f"Pontuação de teste final: {test_score:.2f}")

vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
feature_names = np.array(vectorizer.get_feature_names_out())
coef = grid.best_estimator_.named_steps["logisticregression"].coef_.ravel()
sorted_coef_indices = coef.argsort()

print("\nFeatures mais negativas (associadas a sentimentos negativos):")
print(feature_names[sorted_coef_indices[:20]])

print("\nFeatures mais positivas (associadas a sentimentos positivos):")
print(feature_names[sorted_coef_indices[::-1][:20]])


Carregando o dataset de reviews de filmes...
Número de documentos de treino: 20476
Número de documentos de teste: 19895
Iniciando GridSearchCV (pode levar algum tempo)...
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Melhor pontuação de validação cruzada: 0.89
Melhores parâmetros: {'logisticregression__C': 10, 'tfidfvectorizer__ngram_range': (1, 3)}
Pontuação de teste final: 0.88

Features mais negativas (associadas a sentimentos negativos):
['worst' 'awful' 'bad' 'waste' 'boring' 'worse' 'poor' 'poorly'
 'disappointment' 'terrible' 'dull' 'horrible' 'fails' 'annoying'
 'disappointing' 'unfortunately' 'ridiculous' 'mess' 'waste time' 'save']

Features mais positivas (associadas a sentimentos positivos):
['great' 'excellent' 'perfect' 'best' 'wonderful' 'amazing' 'favorite'
 'today' 'superb' '10 10' 'fantastic' 'enjoyable' 'brilliant' 'loved'
 'entertaining' 'love' 'fun' 'refreshing' 'enjoyed' 'perfectly']


# Questao 2


In [5]:
X_train_tfidf = vectorizer.transform(X_text_train)
max_tfidf = X_train_tfidf.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_tfidf.argsort()[::-1]

print("\nTop 10 features com maiores valores de TF-IDF:")
for idx in sorted_by_tfidf[:10]:
    print(f"{feature_names[idx]}: {max_tfidf[idx]:.4f}")

print("\nTop 10 features com menores valores de TF-IDF:")
for idx in sorted_by_tfidf[-10:]:
    print(f"{feature_names[idx]}: {max_tfidf[idx]:.4f}")


Top 10 features com maiores valores de TF-IDF:
pokemon: 0.8838
doodlebops: 0.8533
casper: 0.8468
raj: 0.8114
dev: 0.8113
zizek: 0.8079
darkman: 0.8050
demons: 0.7955
joan: 0.7833
montand: 0.7782

Top 10 features com menores valores de TF-IDF:
enrico ratso rizzo: 0.0637
enrico ratso: 0.0637
scene stealing: 0.0627
video game movie: 0.0625
intercontinental: 0.0623
vampire cloak: 0.0621
literally br: 0.0589
literally br br: 0.0589
didn help br: 0.0582
stubbornly: 0.0574


# Questão 3:


In [14]:
from sklearn.preprocessing import Normalizer
from sklearn import datasets
import numpy as np


iris = datasets.load_iris()


Features Originais (primeiras 5 observações):
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

Features Normalizadas (L2, primeiras 5 observações):
 [[0.80377277 0.55160877 0.22064351 0.0315205 ]
 [0.82813287 0.50702013 0.23660939 0.03380134]
 [0.80533308 0.54831188 0.2227517  0.03426949]
 [0.80003025 0.53915082 0.26087943 0.03478392]
 [0.790965   0.5694948  0.2214702  0.0316386 ]]

Norma L2 da primeira observação normalizada: 12.247448713915889
