## Leitura dos Dados

In [202]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [195]:
df = pd.read_csv('../imdb_reviews.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [208]:
idx = np.arange(df.shape[0])
train_idx = np.random.choice(idx, (), replace=False)
val_idx = np.random.choice(idx, 10, replace=False)

array([32348, 42105, 16876, 21039, 14527, 21796, 39662, 42565, 17341,
       14954])

# Funções

In [216]:
def map_sentiments(x):
    if x == 'positive':
        return 1
    return 0

def create_bag_of_words(df_train, df_val, text_col, vocab_size=1000):
    vectorizer = CountVectorizer(max_features=vocab_size)
    
    X_train = vectorizer.fit_transform(df_train[text_col].values)
    X_val = vectorizer.transform(df_val[text_col].values)
    
    bow_train = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())
    bow_val = pd.DataFrame(X_val.toarray(), columns=vectorizer.get_feature_names_out())
    
    df_train['target_val'] = df_train['sentiment'].values
    df_val['target_val'] = df_val['sentiment'].values
    
    return bow_train, bow_val

def cross_validation(df, model, score_func, k=5):
    idx_list = np.arange(df.shape[0])
    test_size = int((1/k) * len(idx_list)) 
    
    for _ in range(k):
        train_idx = np.random.choice(idx_list, len(idx_list) - test_size, replace=False)
        test_idx = np.random.choice(idx_list, test_size, replace=False)
        
        df_train = df[train_idx]
        df_test = df[test_idx]
        
        bow_train, bow_test = create_bag_of_words(df_train, df_test, 'review') 
        
        X_train = bow_train.drop('target_val', axis=1)
        y_train = bow_train['target_val'].values
        
        X_test = bow_test.drop('target_val', axis=1)
        y_test = bow_test['target_val'].values
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        print(score_func(y_test, preds))
        

In [189]:
df['sentiment'] = df['sentiment'].apply(map_sentiments)

In [190]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

print(f"df_train shape: {df_train.shape}")
print(f"df_val shape: {df_val.shape}")

df_train shape: (40000, 2)
df_val shape: (10000, 2)


In [197]:
bow_train, bow_val = create_bag_of_words(df_train, df_val, 'review', 1000)