## 1. Configuraci√≥n e Imports

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# Agregar src al path
project_dir = Path.cwd().parent

# Agregar src al path expl√≠citamente para imports internos
src_dir = project_dir / 'src'
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))

from src.data_loading import load_processed_data, save_object
from src.features import create_tfidf_features, get_top_tfidf_words
from scipy.sparse import hstack, csr_matrix
import os

print('‚úì Librer√≠as y m√≥dulos cargados')

‚úì Librer√≠as y m√≥dulos cargados


## 2. Cargar Datos Procesados

In [2]:
# Cargar datos procesados usando src
df_train = load_processed_data('train_processed.csv')
df_test = load_processed_data('test_processed.csv')

print(f"Train: {df_train.shape}")
print(f"Test: {df_test.shape}")
print(f"\nColumnas: {list(df_train.columns)}")

‚úì Datos procesados cargados: 1596781 filas
‚úì Datos procesados cargados: 359 filas
Train: (1596781, 11)
Test: (359, 11)

Columnas: ['polarity', 'text', 'text_clean', 'length', 'num_words', 'num_hashtags', 'num_mentions', 'num_urls', 'num_uppercase', 'pct_uppercase', 'num_intensified']


## 3. Configurar Stopwords Personalizadas

**Decisi√≥n del EDA:** Mantener palabras que aportan sentimiento como "not", "no", "very"

In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Palabras que S√ç aportan sentimiento (las sacamos de stopwords)
sentiment_words = {
    'not', 'no', 'nor', 'neither', 'never', 'none', 'nobody', 'nothing', 'nowhere',
    'very', 'really', 'so', 'too', 'quite', 'rather',
    'but', 'however', 'although', 'though',
    'all', 'every', 'any', 'some', 'most',
    'much', 'many', 'more', 'most', 'less', 'least',
    'good', 'bad', 'best', 'worst', 'better', 'worse'
}

# Stopwords finales = stopwords default - palabras de sentimiento
custom_stopwords = ENGLISH_STOP_WORDS - sentiment_words

print(f"Stopwords originales: {len(ENGLISH_STOP_WORDS)}")
print(f"Stopwords personalizadas: {len(custom_stopwords)}")
print(f"Palabras de sentimiento conservadas: {len(sentiment_words)}")
print(f"\nEjemplos conservados: {list(sentiment_words)[:10]}")

Stopwords originales: 318
Stopwords personalizadas: 291
Palabras de sentimiento conservadas: 35

Ejemplos conservados: ['neither', 'best', 'really', 'some', 'every', 'nowhere', 'no', 'nor', 'though', 'however']


## 4. Crear Vectorizador TF-IDF con Bigramas

**Configuraci√≥n:**
- **ngram_range=(1,2)**: palabras individuales + bigramas
- **max_features=5000**: limitar vocabulario
- **min_df=5**: palabra debe aparecer al menos 5 veces
- **max_df=0.8**: palabra no puede estar en m√°s del 80% de tweets

In [4]:
# La configuraci√≥n del vectorizador se pasa directamente a la funci√≥n create_tfidf_features de src

## 5. Entrenar TF-IDF en TRAIN

In [5]:
# Crear features TF-IDF usando src
# Esta funci√≥n entrena en train y transforma test autom√°ticamente
X_train_tfidf, X_test_tfidf, tfidf_vectorizer = create_tfidf_features(
    train_texts=df_train['text_clean'].fillna(''),
    test_texts=df_test['text_clean'].fillna(''),
    max_features=10000,
    ngram_range=(1, 2),
    stop_words=list(custom_stopwords),
    min_df=5,
    max_df=0.8
)

print(f"\nShape Train: {X_train_tfidf.shape}")
print(f"Shape Test: {X_test_tfidf.shape}")


Shape Train: (1596781, 10000)
Shape Test: (359, 10000)


## 6. Transformar TEST

**Nota**: La transformaci√≥n de TEST ya se realiz√≥ autom√°ticamente en la celda anterior 
mediante `create_tfidf_features()`, que entrena en TRAIN y transforma TEST sin data leakage.

## 7. Inspeccionar Vocabulario Aprendido

Verificamos que incluya bigramas importantes como "not good", "very happy"

In [6]:
# Obtener t√©rminos y sus √≠ndices
feature_names = tfidf_vectorizer.get_feature_names_out()

# Separar unigramas y bigramas
unigramas = [term for term in feature_names if ' ' not in term]
bigramas = [term for term in feature_names if ' ' in term]

print(f"=== VOCABULARIO ===")
print(f"\nTotal t√©rminos: {len(feature_names)}")
print(f"  - Unigramas: {len(unigramas)}")
print(f"  - Bigramas: {len(bigramas)}")

print(f"\nüìù Ejemplos de bigramas importantes:")
sentiment_bigrams = [b for b in bigramas if any(word in b for word in ['not', 'no', 'very', 'really', 'so', 'too'])]
print(sentiment_bigrams[:20])

=== VOCABULARIO ===

Total t√©rminos: 10000
  - Unigramas: 6121
  - Bigramas: 3879

üìù Ejemplos de bigramas importantes:
['absolutely love', 'absolutely nothing', 'actually really', 'all annoying', 'all know', 'all not', 'all really', 'all so', 'all songs', 'all too', 'all very', 'amazing so', 'amp no', 'amp not', 'amp so', 'amp some', 'annoying help', 'anytime soon', 'ate some', 'ate too']


In [7]:
# Verificar bigramas cr√≠ticos de sentimiento
critical_bigrams = ['not good', 'not bad', 'very happy', 'very sad', 
                    'dont like', 'cant wait', 'so sad', 'so happy']

print("\nüîç Verificando bigramas cr√≠ticos:")
for bigram in critical_bigrams:
    if bigram in tfidf_vectorizer.vocabulary_:
        print(f"  ‚úì '{bigram}' est√° en el vocabulario")
    else:
        print(f"  ‚úó '{bigram}' NO est√° en el vocabulario")


üîç Verificando bigramas cr√≠ticos:
  ‚úì 'not good' est√° en el vocabulario
  ‚úì 'not bad' est√° en el vocabulario
  ‚úì 'very happy' est√° en el vocabulario
  ‚úì 'very sad' est√° en el vocabulario
  ‚úì 'dont like' est√° en el vocabulario
  ‚úó 'cant wait' NO est√° en el vocabulario
  ‚úì 'so sad' est√° en el vocabulario
  ‚úì 'so happy' est√° en el vocabulario


## 7.1 Top Features por Clase usando TF-IDF

Analizamos qu√© t√©rminos son m√°s relevantes para cada clase (Positivo vs Negativo).

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Calcular promedios de TF-IDF por clase
positive_tweets = df_train[df_train['polarity'] == 4]['text_clean'].fillna('')
negative_tweets = df_train[df_train['polarity'] == 0]['text_clean'].fillna('')

# Entrenar vectorizadores separados para an√°lisis exploratorio
print("Analizando top features por clase...")
vec_pos = TfidfVectorizer(max_features=20, ngram_range=(1,2), stop_words=list(custom_stopwords))
vec_neg = TfidfVectorizer(max_features=20, ngram_range=(1,2), stop_words=list(custom_stopwords))

X_pos = vec_pos.fit_transform(positive_tweets)
X_neg = vec_neg.fit_transform(negative_tweets)

print("üü¢ TOP 10 T√âRMINOS EN TWEETS POSITIVOS:")
for term in list(vec_pos.vocabulary_.keys())[:10]:
    print(f"  ‚Ä¢ {term}")

print("\nüî¥ TOP 10 T√âRMINOS EN TWEETS NEGATIVOS:")
for term in list(vec_neg.vocabulary_.keys())[:10]:
    print(f"  ‚Ä¢ {term}")

Analizando top features por clase...
üü¢ TOP 10 T√âRMINOS EN TWEETS POSITIVOS:
  ‚Ä¢ love
  ‚Ä¢ im
  ‚Ä¢ thanks
  ‚Ä¢ got
  ‚Ä¢ too
  ‚Ä¢ just
  ‚Ä¢ so
  ‚Ä¢ time
  ‚Ä¢ like
  ‚Ä¢ day

üî¥ TOP 10 T√âRMINOS EN TWEETS NEGATIVOS:
  ‚Ä¢ got
  ‚Ä¢ day
  ‚Ä¢ today
  ‚Ä¢ like
  ‚Ä¢ no
  ‚Ä¢ not
  ‚Ä¢ all
  ‚Ä¢ im
  ‚Ä¢ just
  ‚Ä¢ but


In [9]:
# Top t√©rminos usando src
top_words_df = get_top_tfidf_words(tfidf_vectorizer, X_train_tfidf, n_top=20)

print("\nüîù Top 20 t√©rminos m√°s importantes (TF-IDF promedio):")
print(top_words_df)


üîù Top 20 t√©rminos m√°s importantes (TF-IDF promedio):
      word  tfidf_score
0       im     0.018156
1       so     0.015834
2     just     0.014049
3      but     0.013185
4      not     0.012851
5     good     0.011934
6      day     0.011151
7       no     0.010250
8      all     0.010094
9     like     0.010013
10    love     0.009807
11    work     0.009619
12     too     0.009448
13   going     0.009181
14    dont     0.009137
15   today     0.009093
16     got     0.008427
17     lol     0.008413
18  thanks     0.008405
19    time     0.008002


## 8. Preparar Features Num√©ricas

Las 7 features extra√≠das en el preprocesamiento

In [10]:
# Columnas de features num√©ricas
numeric_features = ['length', 'num_words', 'num_hashtags', 'num_mentions', 
                   'num_urls', 'num_uppercase', 'pct_uppercase']

# Extraer features num√©ricas
X_train_numeric = df_train[numeric_features].values
X_test_numeric = df_test[numeric_features].values

print(f"Features num√©ricas train: {X_train_numeric.shape}")
print(f"Features num√©ricas test: {X_test_numeric.shape}")

print(f"\nüìä Estad√≠sticas features num√©ricas (TRAIN):")
print(df_train[numeric_features].describe().round(2))

Features num√©ricas train: (1596781, 7)
Features num√©ricas test: (359, 7)

üìä Estad√≠sticas features num√©ricas (TRAIN):
           length   num_words  num_hashtags  num_mentions    num_urls  \
count  1596781.00  1596781.00    1596781.00    1596781.00  1596781.00   
mean        74.20       13.20          0.03          0.49        0.05   
std         36.38        6.94          0.22          0.59        0.23   
min          6.00        1.00          0.00          0.00        0.00   
25%         44.00        7.00          0.00          0.00        0.00   
50%         69.00       12.00          0.00          0.00        0.00   
75%        104.00       19.00          0.00          1.00        0.00   
max        374.00       64.00         24.00         12.00        5.00   

       num_uppercase  pct_uppercase  
count     1596781.00     1596781.00  
mean            3.26           6.22  
std             5.21           9.77  
min             0.00           0.00  
25%             1.00        

## 9. Combinar TF-IDF + Features Num√©ricas

Matriz final = [TF-IDF texto] + [7 features num√©ricas]

In [11]:
print("Combinando features...\n")

# Convertir features num√©ricas a sparse matrix
X_train_numeric_sparse = csr_matrix(X_train_numeric)
X_test_numeric_sparse = csr_matrix(X_test_numeric)

# Concatenar horizontalmente (TF-IDF + numeric)
X_train_final = hstack([X_train_tfidf, X_train_numeric_sparse])
X_test_final = hstack([X_test_tfidf, X_test_numeric_sparse])

print("‚úì Features combinadas")
print(f"\nTRAIN:")
print(f"  - TF-IDF: {X_train_tfidf.shape}")
print(f"  - Num√©ricas: {X_train_numeric.shape}")
print(f"  - FINAL: {X_train_final.shape}")

print(f"\nTEST:")
print(f"  - TF-IDF: {X_test_tfidf.shape}")
print(f"  - Num√©ricas: {X_test_numeric.shape}")
print(f"  - FINAL: {X_test_final.shape}")

Combinando features...

‚úì Features combinadas

TRAIN:
  - TF-IDF: (1596781, 10000)
  - Num√©ricas: (1596781, 7)
  - FINAL: (1596781, 10007)

TEST:
  - TF-IDF: (359, 10000)
  - Num√©ricas: (359, 7)
  - FINAL: (359, 10007)


In [12]:
import sys

print(f"\nüíæ USO DE MEMORIA:")
print(f"  X_train_final: {X_train_final.data.nbytes / 1024**2:.1f} MB")
print(f"  X_test_final: {X_test_final.data.nbytes / 1024**2:.1f} MB")
print(f"  Equivalente denso: {(X_train_final.shape[0] * X_train_final.shape[1] * 8) / 1024**3:.1f} GB")


üíæ USO DE MEMORIA:
  X_train_final: 132.8 MB
  X_test_final: 0.0 MB
  Equivalente denso: 119.1 GB


## 10. Preparar Etiquetas (y)

In [13]:
# Convertir polaridad 0/4 a 0/1 para clasificaci√≥n binaria
y_train = (df_train['polarity'].values == 4).astype(int)
y_test = (df_test['polarity'].values == 4).astype(int)

print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

print(f"\nBalance TRAIN:")
print(f"  - Negativos (0): {(y_train == 0).sum()}")
print(f"  - Positivos (1): {(y_train == 1).sum()}")

print(f"\nBalance TEST:")
print(f"  - Negativos (0): {(y_test == 0).sum()}")
print(f"  - Positivos (1): {(y_test == 1).sum()}")

y_train: (1596781,)
y_test: (359,)

Balance TRAIN:
  - Negativos (0): 798383
  - Positivos (1): 798398

Balance TEST:
  - Negativos (0): 177
  - Positivos (1): 182


## 11. Guardar Vectores y Objetos

In [14]:
# Guardar objetos usando src
import os
os.makedirs('../data/vectorized', exist_ok=True)

save_object(X_train_final, '../vectorized/X_train.pkl')
save_object(X_test_final, '../vectorized/X_test.pkl')
save_object(y_train, '../vectorized/y_train.pkl')
save_object(y_test, '../vectorized/y_test.pkl')
save_object(tfidf_vectorizer, '../vectorized/tfidf_vectorizer.pkl')

print("‚úì Datos vectorizados guardados")

‚úì Objeto guardado en: d:\Diplomatura en ia\trabajo practico 3 -Omar Gonzalez\tp3_nlp_sentiment\data\processed\..\vectorized\X_train.pkl
‚úì Objeto guardado en: d:\Diplomatura en ia\trabajo practico 3 -Omar Gonzalez\tp3_nlp_sentiment\data\processed\..\vectorized\X_test.pkl
‚úì Objeto guardado en: d:\Diplomatura en ia\trabajo practico 3 -Omar Gonzalez\tp3_nlp_sentiment\data\processed\..\vectorized\y_train.pkl
‚úì Objeto guardado en: d:\Diplomatura en ia\trabajo practico 3 -Omar Gonzalez\tp3_nlp_sentiment\data\processed\..\vectorized\y_test.pkl
‚úì Objeto guardado en: d:\Diplomatura en ia\trabajo practico 3 -Omar Gonzalez\tp3_nlp_sentiment\data\processed\..\vectorized\tfidf_vectorizer.pkl
‚úì Datos vectorizados guardados


In [15]:
## 9.1 Cobertura de Vocabulario en TEST

# T√©rminos √∫nicos en test
test_vocab = set()
for text in df_test['text_clean'].fillna(''):
    test_vocab.update(text.split())

# T√©rminos del vectorizador presentes en test
vocab_terms = set(tfidf_vectorizer.vocabulary_.keys())
overlap = test_vocab.intersection(vocab_terms)

print(f"\nüìä COBERTURA DE VOCABULARIO:")
print(f"  T√©rminos √∫nicos en TEST: {len(test_vocab)}")
print(f"  T√©rminos en vocabulario TF-IDF: {len(vocab_terms)}")
print(f"  T√©rminos en com√∫n: {len(overlap)}")
print(f"  Cobertura: {len(overlap)/len(test_vocab)*100:.1f}%")


üìä COBERTURA DE VOCABULARIO:
  T√©rminos √∫nicos en TEST: 1584
  T√©rminos en vocabulario TF-IDF: 10000
  T√©rminos en com√∫n: 988
  Cobertura: 62.4%


## 12. Resumen Final

In [16]:
print("="*60)
print("RESUMEN DE VECTORIZACI√ìN")
print("="*60)

print(f"\nüìä DATOS FINALES:")
print(f"  Train: {X_train_final.shape[0]:,} tweets √ó {X_train_final.shape[1]:,} features")
print(f"  Test:  {X_test_final.shape[0]:,} tweets √ó {X_test_final.shape[1]:,} features")

print(f"\nüî§ COMPOSICI√ìN DE FEATURES:")
print(f"  - Vocabulario TF-IDF: {len(tfidf_vectorizer.vocabulary_):,} t√©rminos")
print(f"    * Unigramas: {len(unigramas):,}")
print(f"    * Bigramas: {len(bigramas):,}")
print(f"  - Features num√©ricas: 7")
print(f"  - TOTAL: {X_train_final.shape[1]:,} features")

print(f"\n‚úÖ DECISIONES DEL EDA IMPLEMENTADAS:")
print(f"  ‚úì TF-IDF con bigramas ('not good', 'very happy')")
print(f"  ‚úì Stopwords personalizadas (conservamos 'not', 'no', 'very')")
print(f"  ‚úì 7 features num√©ricas integradas")
print(f"  ‚úì Vocabulario limitado a {X_train_tfidf.shape[1]:,} features")
print(f"  ‚úì Matriz sparse eficiente (sparsity: {(1 - X_train_final.nnz / (X_train_final.shape[0] * X_train_final.shape[1])) * 100:.2f}%)")

print(f"\nüéØ LISTO PARA MODELADO")
print(f"  Pr√≥ximo paso: Notebook 4 - Entrenar modelos")

RESUMEN DE VECTORIZACI√ìN

üìä DATOS FINALES:
  Train: 1,596,781 tweets √ó 10,007 features
  Test:  359 tweets √ó 10,007 features

üî§ COMPOSICI√ìN DE FEATURES:
  - Vocabulario TF-IDF: 10,000 t√©rminos
    * Unigramas: 6,121
    * Bigramas: 3,879
  - Features num√©ricas: 7
  - TOTAL: 10,007 features

‚úÖ DECISIONES DEL EDA IMPLEMENTADAS:
  ‚úì TF-IDF con bigramas ('not good', 'very happy')
  ‚úì Stopwords personalizadas (conservamos 'not', 'no', 'very')
  ‚úì 7 features num√©ricas integradas
  ‚úì Vocabulario limitado a 10,000 features
  ‚úì Matriz sparse eficiente (sparsity: 99.89%)

üéØ LISTO PARA MODELADO
  Pr√≥ximo paso: Notebook 4 - Entrenar modelos


## 13. Word2Vec Embeddings

Word2Vec es otra t√©cnica de vectorizaci√≥n que genera embeddings densos (vectores de 100-300 dimensiones) donde palabras similares quedan cerca en el espacio vectorial.

A diferencia de TF-IDF que usa frecuencias, Word2Vec aprende representaciones sem√°nticas del contexto en que aparecen las palabras.

In [None]:
# =============================================================
# ENTRENAMIENTO DEL MODELO WORD2VEC (ejecutar solo si es necesario)
# =============================================================
# El modelo ya est√° entrenado y guardado en models/word2vec_model.pkl
# Descomentar solo si necesit√°s re-entrenar.

# from gensim.models import Word2Vec
# import pandas as pd
#
# # Cargar datos preprocesados
# df_train = pd.read_csv('../data/processed/train_processed.csv')
# sentences = [text.split() for text in df_train['text_clean'].fillna('').tolist()]
#
# # Entrenar Word2Vec
# model_w2v = Word2Vec(
#     sentences=sentences,
#     vector_size=100,
#     window=5,
#     min_count=5,
#     workers=4,
#     epochs=10
# )
#
# # Guardar modelo
# model_w2v.save('../models/word2vec_model.pkl')
# print(f"‚úì Modelo guardado con {len(model_w2v.wv):,} palabras")

In [1]:
# Verificar modelo guardado
from gensim.models import Word2Vec

model_w2v = Word2Vec.load('../models/word2vec_model.pkl')
print(f"‚úÖ Modelo Word2Vec cargado: {len(model_w2v.wv):,} palabras")
print(f"   Vector size: {model_w2v.wv.vector_size} dimensiones")

# Ejemplo de palabras similares
print(f"\nüìä Ejemplo - Palabras similares a 'happy':")
for word, score in model_w2v.wv.most_similar('happy', topn=5):
    print(f"   {word}: {score:.4f}")

‚úÖ Modelo Word2Vec cargado: 57,795 palabras
   Vector size: 100 dimensiones

üìä Ejemplo - Palabras similares a 'happy':
   happyy: 0.6584
   thrilled: 0.6207
   pleased: 0.6005
   sad: 0.5942
   upset: 0.5836
