# Opinion Spam - Text Classification

In [3]:
import os 

import pandas as pd

In [4]:
dataset_dir = "../datasets/opinion_spam"

df = pd.read_pickle(os.path.join(dataset_dir, "prepared_data.pkl"))

In [5]:
df.head(3)

Unnamed: 0_level_0,fold,polarity,deceptive,text
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,1,1,"excellent staff and customer service, very cle..."
1,2,1,1,My stay at this hotel was one of the best I ha...
2,2,1,1,We just got back from a trip to Chicago for my...


# Count Vectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
sentences = [
    'Malaga is in Spain',
    'Paris is in France',
]

In [8]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences)

CountVectorizer()

In [9]:
# Vocabulary
vocabulary = vectorizer.get_feature_names_out().tolist()

# Vectors
vectors = vectorizer.transform(sentences).todense()

In [10]:
vocabulary

['france', 'in', 'is', 'malaga', 'paris', 'spain']

In [11]:
vectors

matrix([[0, 1, 1, 1, 0, 1],
        [1, 1, 1, 0, 1, 0]])

In [12]:
# Combine Vocabulary and Vectors

pd.DataFrame(
    vectors,
    columns=vocabulary
)

Unnamed: 0,france,in,is,malaga,paris,spain
0,0,1,1,1,0,1
1,1,1,1,0,1,0


# Build your first classifier 

In [13]:
df.sample(3, random_state=42)

Unnamed: 0_level_0,fold,polarity,deceptive,text
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
526,5,1,0,We booked this hotel for the second leg of our...
354,1,1,1,This hotel is great! I love its interior desig...
168,4,1,1,My husband and I had a wonderful time at Swiss...


In [14]:
sorted(df['fold'].unique())

[1, 2, 3, 4, 5]

In [15]:
train_df = df[df.fold.isin([1, 2, 3])]
test_df = df[df.fold.isin([4, 5])]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(train_df['text'])

x_train = vectorizer.transform(train_df['text'])
x_test = vectorizer.transform(test_df['text'])

In [17]:
x_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [18]:
y_train, y_test = train_df['deceptive'], test_df['deceptive']

In [19]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(x_train, y_train)

y_hat_test = classifier.predict(x_test) # We do not give it y_test

In [24]:
pd.set_option('mode.chained_assignment', None)

In [25]:
test_df['predicted_deceptive'] = y_hat_test

In [26]:
test_df[['deceptive', 'predicted_deceptive']].sample(7, random_state=44)

Unnamed: 0_level_0,deceptive,predicted_deceptive
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
996,1,0
508,0,1
978,1,1
590,0,0
950,1,1
1335,0,0
600,0,0


In [27]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_hat_test)

0.8625

## Note about Accuracy Score

**[Question]** 

Say percentage of y_test=1 is 86%, 
and we have a model that predicts everything as deceptive regardless. 

What will its accuracy be in this case?

# Helper Function

In [30]:
def to_xy(df, text_col='text', label='deceptive'):
    return df[text_col], df[label]

In [31]:
x_train, y_train = to_xy(train_df)
x_test, y_test = to_xy(test_df)

# Pipeline

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import make_pipeline

classifier_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

In [33]:
x_train, y_train = to_xy(train_df)
x_test, y_test = to_xy(test_df)

classifier_pipeline.fit(x_train, y_train)
y_hat_test = classifier_pipeline.predict(x_test)

In [34]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_hat_test)

0.8625

## Note about our train / test split

We used folds 1, 2 and 3 for training, and folds 4 and 5 for testing.

What if for some reason, the model finds it easier to predict the samples at folds 4 and 5,
compared to, say, predicting the samples at folds 1 and 2, or 2 and 3, etc.

How can we make sure we chose the right split? 

In [35]:
len(y_train), len(y_test)

(960, 640)

# K-Fold Cross Validation

In [36]:
df.sample(3, random_state=42)

Unnamed: 0_level_0,fold,polarity,deceptive,text
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
526,5,1,0,We booked this hotel for the second leg of our...
354,1,1,1,This hotel is great! I love its interior desig...
168,4,1,1,My husband and I had a wonderful time at Swiss...


In [37]:
sorted(df['fold'].unique())

[1, 2, 3, 4, 5]

In [38]:
# df[df['fold'] != 1].shape[0], df[df['fold'] == 1].shape[0]
len(df[df['fold'] != 1]), len(df[df['fold'] == 1])

(1280, 320)

In [39]:
def kflod_splits(df):
    folds = sorted(df['fold'].unique())
    for fold in folds:
        yield fold, df[df['fold'] != fold], df[df['fold'] == fold]

In [40]:
for fold, train_df, test_df in kflod_splits(df):
    print(f"{fold=}: {len(train_df)=} and {len(test_df)=}")

fold=1: len(train_df)=1280 and len(test_df)=320
fold=2: len(train_df)=1280 and len(test_df)=320
fold=3: len(train_df)=1280 and len(test_df)=320
fold=4: len(train_df)=1280 and len(test_df)=320
fold=5: len(train_df)=1280 and len(test_df)=320


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

classifier_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

print('Predicting deceptive reviews:\n')

for fold, train_df, test_df in kflod_splits(df):
    
    x_train, y_train = to_xy(train_df)
    x_test, y_test = to_xy(test_df)
    
    classifier_pipeline.fit(x_train, y_train)
    
    y_hat_train = classifier_pipeline.predict(x_train)
    train_score = accuracy_score(y_train, y_hat_train)
    
    y_hat_test = classifier_pipeline.predict(x_test)
    test_score = accuracy_score(y_test, y_hat_test)
    
    print(f"{fold=} {train_score=:.1%}, {test_score=:.1%}")


Predicting deceptive reviews:

fold=1 train_score=97.3%, test_score=81.2%
fold=2 train_score=97.0%, test_score=82.8%
fold=3 train_score=97.3%, test_score=90.0%
fold=4 train_score=97.0%, test_score=84.4%
fold=5 train_score=96.9%, test_score=89.4%


In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

classifier_pipeline = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))

print('Predicting deceptive reviews:\n')

for fold, train_df, test_df in kflod_splits(df):
    
    x_train, y_train = to_xy(train_df)
    x_test, y_test = to_xy(test_df)
    
    classifier_pipeline.fit(x_train, y_train)
    
    y_hat_train = classifier_pipeline.predict(x_train)
    train_score = accuracy_score(y_train, y_hat_train)
    
    y_hat_test = classifier_pipeline.predict(x_test)
    test_score = accuracy_score(y_test, y_hat_test)
    
    print(f"{fold=} {train_score=:.1%}, {test_score=:.1%}")


Predicting deceptive reviews:

fold=1 train_score=100.0%, test_score=82.2%
fold=2 train_score=100.0%, test_score=85.9%
fold=3 train_score=100.0%, test_score=84.4%
fold=4 train_score=100.0%, test_score=83.1%
fold=5 train_score=100.0%, test_score=87.8%


## Exercise 

- Can you classify the documents based on their `polarity` instead of whether they are `deceptive` or not?
- Since we have 5 folds, we get 5 train and 5 test scores. Print the average train and test score for each classifeir.
- Try different classifiers, e.g. LogisticRegression, MultinomialNB, what else?
- Can we combine two classifiers to help each other? 
- Check the documentations for the classifiers you use, and try different parameters. 