## Leitura dos Dados

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
# Load the reviews CSV (path is relative to the notebook's working directory)
# and preview the first ten rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../imdb_reviews.csv')
df.head(n=10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


## Funções

In [30]:
def map_sentiments(x):
    """Encode a sentiment label as a binary integer.

    Returns 1 when ``x`` compares equal to the string ``'positive'`` and 0
    for every other value.
    """
    return 1 if x == 'positive' else 0

def create_bag_of_words(df_train, df_val, text_col, vocab_size=500, target_col='sentiment'):
    """Build bag-of-words count features for a train/validation pair.

    The vectorizer is fitted on ``df_train`` only and then applied to
    ``df_val``, so the validation texts never influence the vocabulary.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Frames containing the ``text_col`` and ``target_col`` columns.
    text_col : str
        Name of the column holding the raw text.
    vocab_size : int, default 500
        Passed to ``CountVectorizer`` as ``max_features``.
    target_col : str, default 'sentiment'
        Name of the column holding the labels.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(bow_train, bow_val)``: one column per vocabulary term plus a
        ``'target_val'`` column with the labels copied from ``target_col``.
    """
    vectorizer = CountVectorizer(max_features=vocab_size)

    X_train = vectorizer.fit_transform(df_train[text_col].values)
    X_val = vectorizer.transform(df_val[text_col].values)

    # Look the vocabulary up once; both frames share the same feature columns.
    feature_names = vectorizer.get_feature_names_out()

    bow_train = pd.DataFrame(X_train.toarray(), columns=feature_names)
    bow_val = pd.DataFrame(X_val.toarray(), columns=feature_names)

    # `.values` strips the source index so labels are assigned positionally
    # instead of being aligned against the new frames' default index.
    bow_train['target_val'] = df_train[target_col].values
    bow_val['target_val'] = df_val[target_col].values

    return bow_train, bow_val

def cross_validation(df, model, k=5, random_state=None):
    """Run k-fold cross-validation of ``model`` on bag-of-words features.

    Each fold draws ``len(df) // k`` test rows at random, without replacement,
    from the rows not yet used as test data; all remaining rows form the
    training split. The bag-of-words features are rebuilt inside every fold
    from the ``'review'`` column via ``create_bag_of_words``. When ``len(df)``
    is not a multiple of ``k`` the leftover rows never appear in a test fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the text in a ``'review'`` column; the label column is the
        one ``create_bag_of_words`` reads.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted in
        place on every fold.
    k : int, default 5
        Number of folds; must satisfy ``2 <= k <= len(df)``.
    random_state : int or None, default None
        Seed for the fold sampling. ``None`` draws from NumPy's global
        random state (so a prior ``np.random.seed`` call is honoured).

    Returns
    -------
    estimator
        The same ``model`` instance, as fitted on the last fold's training
        split.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of rows.
    """
    n_rows = df.shape[0]
    if k < 2 or k > n_rows:
        raise ValueError(
            f"k must be between 2 and the number of rows ({n_rows}), got {k}"
        )

    # Fall back to the module-level generator so unseeded calls behave as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    idx_list = np.arange(n_rows)
    test_available_idx = idx_list.copy()
    # Integer division avoids float truncation (e.g. int((1/49) * 49) == 0).
    test_size = n_rows // k
    scores = []

    print("Accuracy per fold:")
    for i in range(k):
        test_idx = rng.choice(test_available_idx, test_size, replace=False)
        train_idx = np.setdiff1d(idx_list, test_idx)

        df_train = df.iloc[train_idx]
        df_test = df.iloc[test_idx]

        bow_train, bow_test = create_bag_of_words(df_train, df_test, 'review')

        X_train = bow_train.drop('target_val', axis=1)
        y_train = bow_train['target_val'].values

        X_test = bow_test.drop('target_val', axis=1)
        y_test = bow_test['target_val'].values

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Rows already used for testing are excluded from later folds.
        test_available_idx = np.setdiff1d(test_available_idx, test_idx)
        score = accuracy_score(y_test, preds)
        scores.append(score)
        print(f"  Fold number {i+1}: {score:.3f}")

    print(f"Average accuracy score is {sum(scores)/k:.3f}")

    return model
        

In [27]:
# Encode labels only while they are still non-numeric: re-running this cell on
# already-encoded 0/1 labels would otherwise map every row to 0, because
# map_sentiments returns 0 for anything that is not the string 'positive'.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].apply(map_sentiments)

In [31]:
# Seed NumPy's global random state, which cross_validation samples its folds
# from by default, so the per-fold accuracies are reproducible after a kernel restart.
np.random.seed(42)

model = LogisticRegression(max_iter=1000)
cross_validation(df, model)

Accuracy per fold:
  Fold number 1: 0.842
  Fold number 2: 0.845
  Fold number 3: 0.845
  Fold number 4: 0.847
  Fold number 5: 0.842
Average accuracy score is 0.844
