In [41]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
print("Dados Originais:")
display(X.describe())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=iris.feature_names)

print("Dados Normalizados:")
display(X_scaled_df.describe())

Dados Originais:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


Dados Normalizados:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.468455e-15,-1.823726e-15,-1.610564e-15,-9.473903e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


In [None]:
def manual_standard_scaler(X):
    return (X - X.mean()) / X.std()

X_manual_scaled = manual_standard_scaler(X)
print("Normalização Manual:")
display(X_manual_scaled.describe())

print("Diferença com StandardScaler:")
display(X_scaled_df - X_manual_scaled)

Normalização Manual:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.515825e-15,-1.894781e-15,-1.515825e-15,-8.526513e-16
std,1.0,1.0,1.0,1.0
min,-1.86378,-2.42582,-1.562342,-1.442245
25%,-0.8976739,-0.5903951,-1.222456,-1.179859
50%,-0.05233076,-0.1315388,0.3353541,0.1320673
75%,0.672249,0.5567457,0.7602115,0.7880307
max,2.483699,3.080455,1.779869,1.706379


Diferença com StandardScaler:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.003007,0.003402,-0.004475,-0.004392
1,-0.003816,-0.000441,-0.004475,-0.004392
2,-0.004626,0.001097,-0.004665,-0.004392
3,-0.005030,0.000328,-0.004285,-0.004392
4,-0.003412,0.004171,-0.004475,-0.004392
...,...,...,...,...
145,0.003466,-0.000441,0.002737,0.004838
146,0.001848,-0.004284,0.002357,0.003079
147,0.002657,-0.000441,0.002737,0.003519
148,0.001443,0.002634,0.003116,0.004838


: 

: 

: 

: 

In [None]:
from sklearn.preprocessing import Normalizer

l2_normalizer = Normalizer(norm='l2')
X_l2 = l2_normalizer.fit_transform(X)
pd.DataFrame(X_l2, columns=iris.feature_names).head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.803773,0.551609,0.220644,0.031521
1,0.828133,0.50702,0.236609,0.033801
2,0.805333,0.548312,0.222752,0.034269
3,0.80003,0.539151,0.260879,0.034784
4,0.790965,0.569495,0.22147,0.031639


: 

: 

: 

: 

In [None]:
l1_normalizer = Normalizer(norm='l1')
X_l1 = l1_normalizer.fit_transform(X)
X_l1_df = pd.DataFrame(X_l1, columns=iris.feature_names)

print("Soma das colunas após L1:")
display(X_l1_df.sum())
X_l1_df.head()

Soma das colunas após L1:


sepal length (cm)    64.711626
sepal width (cm)     35.251681
petal length (cm)    38.294848
petal width (cm)     11.741845
dtype: float64

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.5,0.343137,0.137255,0.019608
1,0.515789,0.315789,0.147368,0.021053
2,0.5,0.340426,0.138298,0.021277
3,0.489362,0.329787,0.159574,0.021277
4,0.490196,0.352941,0.137255,0.019608


: 

: 

: 

: 

In [None]:
from sklearn.datasets import load_breast_cancer
import seaborn as sns
import matplotlib.pyplot as plt

cancer = load_breast_cancer()
df_cancer = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df_cancer['target'] = cancer.target

correlation = df_cancer.corr()['target'].drop('target').abs().sort_values(ascending=False)
selected_features_filter = correlation.head(10).index.tolist()

print("Top 10 features pela correlação:")
print(selected_features_filter)

Top 10 features pela correlação:
['worst concave points', 'worst perimeter', 'mean concave points', 'worst radius', 'mean perimeter', 'worst area', 'mean radius', 'mean area', 'mean concavity', 'worst concavity']


: 

: 

: 

: 

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X = df_cancer.drop(columns='target')
y = df_cancer['target']

model = LogisticRegression(max_iter=10000)
rfe = RFE(model, n_features_to_select=10)
rfe.fit(X, y)

selected_features_wrapper = X.columns[rfe.support_].tolist()

print("Features selecionadas pelo Wrapper:")
print(selected_features_wrapper)

print("Comparação com filtragem:")
print("Em comum:", set(selected_features_filter) & set(selected_features_wrapper))

Features selecionadas pelo Wrapper:
['mean radius', 'mean compactness', 'mean concavity', 'texture error', 'worst radius', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry']
Comparação com filtragem:
Em comum: {'worst concave points', 'worst concavity', 'worst radius', 'mean concavity', 'mean radius'}


: 

: 

: 

: 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
texto = '''A alma é, pois, imortal; renasceu repetidas vezes na existência e contemplou todas as coisas existentes e por isso não há nada que ela não conheça! 
Não é de espantar que ela seja capaz de evocar à memória a lembrança de objetos que viu anteriormente, e que se relacionam tanto com a virtude como com as outras coisas existentes. 
Toda a natureza, com efeito, é uma só, é um todo orgânico, e o espírito já viu todas as coisas; logo, nada impede que ao nos lembrarmos de uma coisa – o que nós, homens, chamamos de “saber” – todas as outras coisas acorram imediata e maquinalmente à nossa consciência.'''
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform([texto])
pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,acorram,alma,anteriormente,ao,as,capaz,chamamos,coisa,coisas,com,...,só,tanto,toda,todas,todo,um,uma,vezes,virtude,viu
0,1,1,1,1,4,1,1,1,4,3,...,1,1,1,3,1,1,2,1,1,2


: 

: 

: 

: 

In [None]:
for n in [1, 2, 3]:
    vectorizer_ng = CountVectorizer(ngram_range=(n, n))
    X_ng = vectorizer_ng.fit_transform([texto])
    print(f"{n}-grams:")
    print(len(vectorizer_ng.get_feature_names_out()))

1-grams:
63
2-grams:
82
3-grams:
86


: 

: 

: 

: 

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Baixar os recursos necessários (só precisa rodar uma vez)
nltk.download('stopwords')

# Texto original
texto = """A alma é, pois, imortal; renasceu repetidas vezes na existência e contemplou todas as coisas existentes e por isso não há nada que ela não conheça! Não é de espantar que ela seja capaz de evocar à memória a lembrança de objetos que viu anteriormente, e que se relacionam tanto com a virtude como com as outras coisas existentes. Toda a natureza, com efeito, é uma só, é um todo orgânico, e o espírito já viu todas as coisas; logo, nada impede que ao nos lembrarmos de uma coisa – o que nós, homens, chamamos de “saber” – todas as outras coisas acorram imediata e maquinalmente à nossa consciência."""

# Tokenizador baseado em palavras (removendo pontuação)
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(texto.lower())

# Stopwords em português
stop_words = set(stopwords.words('portuguese'))

# Filtrando as palavras que não são stopwords
filtered_words = [word for word in tokens if word not in stop_words]
print(filtered_words)

ModuleNotFoundError: No module named 'nltk'

: 

: 

: 

: 

In [None]:
import nltk

# Baixe os recursos necessários
nltk.download('punkt')
nltk.download('rslp')

: 

: 

: 

: 

In [None]:
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import RSLPStemmer

texto = "Os meninos estavam brincando no parquinho."

# Tokenização alternativa (funciona com português)
words = wordpunct_tokenize(texto)

# Aplica o stemmer
stemmer = RSLPStemmer()
stemmed = [stemmer.stem(word) for word in words]

print("Tokens:", words)
print("Stemmed:", stemmed)

: 

: 

: 

: 

In [None]:
import nltk
nltk.download('punkt')


: 

: 

: 

: 

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
print(lemmatized)

: 

: 

: 

: 