# Opinion Spam - Text Classification

In [16]:
import os 

import pandas as pd

In [17]:
# Directory holding the prepared opinion-spam data, relative to this notebook.
dataset_dir = "../datasets/opinion_spam"

# NOTE(review): unpickling can execute arbitrary code; only load
# prepared_data.pkl when it comes from a trusted source.
df = pd.read_pickle(os.path.join(dataset_dir, "prepared_data.pkl"))

In [18]:
df.head(3)

Unnamed: 0_level_0,fold,polarity,deceptive,text
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,1,1,"excellent staff and customer service, very cle..."
1,2,1,1,My stay at this hotel was one of the best I ha...
2,2,1,1,We just got back from a trip to Chicago for my...


# Count Vectorization

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
sentences = [
    'Malaga is in Spain',
    'Paris is in France',
]

In [32]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences)

CountVectorizer()

In [250]:
# Vocabulary: the tokens learned by fit(), as a plain Python list
vocabulary = vectorizer.get_feature_names_out().tolist()

# Vectors: per-sentence token counts, converted from the sparse result
# to a dense matrix so they can be displayed
vectors = vectorizer.transform(sentences).todense()

In [55]:
vocabulary

['france', 'in', 'is', 'malaga', 'paris', 'spain']

In [56]:
vectors

matrix([[0, 1, 1, 1, 0, 1],
        [1, 1, 1, 0, 1, 0]])

In [52]:
# Combine Vocabulary and Vectors

pd.DataFrame(
    vectors,
    columns=vocabulary
)

Unnamed: 0,france,in,is,malaga,paris,spain
0,0,1,1,1,0,1
1,1,1,1,0,1,0


# Build your first classifier 

In [84]:
df.sample(3, random_state=42)

Unnamed: 0_level_0,fold,polarity,deceptive,text
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
526,5,1,0,We booked this hotel for the second leg of our...
354,1,1,1,This hotel is great! I love its interior desig...
168,4,1,1,My husband and I had a wonderful time at Swiss...


In [85]:
sorted(df['fold'].unique())

[1, 2, 3, 4, 5]

In [89]:
# Fit on folds 1-3 and hold out folds 4 and 5 for testing.
in_train_folds = df['fold'].isin([1, 2, 3])
in_test_folds = df['fold'].isin([4, 5])

train_df = df[in_train_folds]
test_df = df[in_test_folds]

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training texts only, then encode both
# splits against that same fitted vocabulary.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(train_df['text'])
x_test = vectorizer.transform(test_df['text'])

In [91]:
x_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [108]:
y_train, y_test = train_df['deceptive'], test_df['deceptive']

In [109]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(x_train, y_train)

# Predict from the features alone; y_test is held back for scoring.
y_hat_test = classifier.predict(x_test)

In [141]:
# test_df is a boolean-filtered slice of df; writing a column into it in
# place triggers pandas' SettingWithCopyWarning. Rebind to a new frame that
# carries the predictions instead.
test_df = test_df.assign(predicted_deceptive=y_hat_test)

In [127]:
test_df[['deceptive', 'predicted_deceptive']].sample(7, random_state=44)

Unnamed: 0_level_0,deceptive,predicted_deceptive
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
996,1,0
508,0,1
978,1,1
590,0,0
950,1,1
1335,0,0
600,0,0


In [142]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_hat_test)

0.8625

## Note about Accuracy Score

### Question

Say 86% of the samples in `y_test` are deceptive (`y_test = 1`),
and we have a model that predicts every review as deceptive, regardless of its text.

What will its accuracy be in this case?

In [144]:
y_test.mean()

0.5

# Pipeline

In [227]:
def to_xy(df, text_col='text', label='deceptive'):
    """Split a frame into its input column and its target column.

    Parameters
    ----------
    df : DataFrame-like
        Object supporting ``df[column_name]`` lookups.
    text_col : str
        Name of the column holding the inputs (default ``'text'``).
    label : str
        Name of the column holding the targets (default ``'deceptive'``).

    Returns
    -------
    tuple
        ``(df[text_col], df[label])`` in that order.
    """
    features = df[text_col]
    targets = df[label]
    return features, targets

In [228]:
x_train, y_train = to_xy(train_df)
x_test, y_test = to_xy(test_df)

In [229]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import make_pipeline

# Chain vectorization and classification into one estimator.
# make_pipeline names each step after its lowercased class name
# ('countvectorizer', 'multinomialnb'); later cells look steps up by those names.
classifier_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

# x_train / x_test come from to_xy(), i.e. they are the text columns
# themselves rather than pre-computed count matrices.
classifier_pipeline.fit(x_train, y_train)
y_hat_test = classifier_pipeline.predict(x_test)

In [230]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_hat_test)

0.89375

In [231]:
classifier_pipeline.named_steps['multinomialnb'].feature_log_prob_.shape

(2, 8656)

In [232]:
# Pair every vocabulary token with its log-probability taken from row 1 of
# the fitted Naive Bayes feature_log_prob_ matrix.
# NOTE(review): row 1 is presumably the deceptive == 1 class — confirm
# against the classifier's classes_ attribute.
features_df = pd.DataFrame(
    {
        'Token': classifier_pipeline.named_steps['countvectorizer'].get_feature_names_out(),
        'LogProba': classifier_pipeline.named_steps['multinomialnb'].feature_log_prob_[1,:]
    }
)

# Highest log-probability first.
features_df.sort_values('LogProba', ascending=False)

Unnamed: 0,Token,LogProba
7698,the,-2.734479
546,and,-3.446895
7814,to,-3.527208
8388,was,-3.652839
3996,in,-4.074774
...,...,...
4197,irena,-11.480877
4200,ironing,-11.480877
4203,irritating,-11.480877
4206,islands,-11.480877


## Note about our train / test split

We used folds 1, 2 and 3 for training, and folds 4 and 5 for testing.

What if, for some reason, the model finds it easier to predict the samples in folds 4 and 5
than, say, the samples in folds 1 and 2, or 2 and 3, etc.?

How can we make sure we chose the right split? 

In [182]:
len(y_train), len(y_test)

(1280, 320)

# K-Fold Cross Validation

In [183]:
df.sample(3, random_state=42)

Unnamed: 0_level_0,fold,polarity,deceptive,text
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
526,5,1,0,We booked this hotel for the second leg of our...
354,1,1,1,This hotel is great! I love its interior desig...
168,4,1,1,My husband and I had a wonderful time at Swiss...


In [184]:
sorted(df['fold'].unique())

[1, 2, 3, 4, 5]

In [185]:
# df[df['fold'] != 1].shape[0], df[df['fold'] == 1].shape[0]
len(df[df['fold'] != 1]), len(df[df['fold'] == 1])

(1280, 320)

In [186]:
def kflod_splits(df):
    """Yield one train/test split per distinct value of the ``'fold'`` column.

    Fold values are visited in sorted order. For each one, the rows whose
    ``'fold'`` differs from it form the training part and the rows whose
    ``'fold'`` equals it form the test part.

    Yields
    ------
    tuple
        ``(fold, train_rows, test_rows)``.
    """
    fold_ids = df['fold']
    for held_out in sorted(fold_ids.unique()):
        train_part = df[fold_ids != held_out]
        test_part = df[fold_ids == held_out]
        yield held_out, train_part, test_part

In [187]:
for fold, train_df, test_df in kflod_splits(df):
    print(f"{fold=}: {len(train_df)=} and {len(test_df)=}")

fold=1: len(train_df)=1280 and len(test_df)=320
fold=2: len(train_df)=1280 and len(test_df)=320
fold=3: len(train_df)=1280 and len(test_df)=320
fold=4: len(train_df)=1280 and len(test_df)=320
fold=5: len(train_df)=1280 and len(test_df)=320


In [193]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

# Count-vector features feeding a multinomial Naive Bayes classifier.
classifier_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

print('Predicting deceptive reviews:\n')

# Hold out each fold once. Note that this loop rebinds the notebook-level
# names train_df, test_df, x_*/y_* on every pass, so after the loop they
# hold the split for the last fold.
for fold, train_df, test_df in kflod_splits(df):
    
    x_train, y_train = to_xy(train_df, label='deceptive')
    x_test, y_test = to_xy(test_df, label='deceptive')
    
    # The same pipeline object is fitted again on this fold's training rows.
    classifier_pipeline.fit(x_train, y_train)
    
    # Accuracy on the rows the model was fitted on.
    y_hat_train = classifier_pipeline.predict(x_train)
    train_score = accuracy_score(y_train, y_hat_train)
    
    # Accuracy on the held-out fold.
    y_hat_test = classifier_pipeline.predict(x_test)
    test_score = accuracy_score(y_test, y_hat_test)
    
    print(f"{fold=} {train_score=:.1%}, {test_score=:.1%}")


Predicting deceptive reviews:

fold=1 train_score=97.3%, test_score=81.2%
fold=2 train_score=97.0%, test_score=82.8%
fold=3 train_score=97.3%, test_score=90.0%
fold=4 train_score=97.0%, test_score=84.4%
fold=5 train_score=96.9%, test_score=89.4%


In [248]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

# Count-vector features feeding a logistic-regression classifier;
# max_iter is raised to 1000 for the solver.
classifier_pipeline = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))

print('Predicting deceptive reviews:\n')

# Hold out each fold once. The loop rebinds the notebook-level names
# train_df, test_df, x_*/y_* on every pass, and classifier_pipeline is left
# fitted on the last fold's training rows.
for fold, train_df, test_df in kflod_splits(df):
    
    x_train, y_train = to_xy(train_df, label='deceptive')
    x_test, y_test = to_xy(test_df, label='deceptive')
    
    classifier_pipeline.fit(x_train, y_train)
    
    # Accuracy on the rows the model was fitted on.
    y_hat_train = classifier_pipeline.predict(x_train)
    train_score = accuracy_score(y_train, y_hat_train)
    
    # Accuracy on the held-out fold.
    y_hat_test = classifier_pipeline.predict(x_test)
    test_score = accuracy_score(y_test, y_hat_test)
    
    print(f"{fold=} {train_score=:.1%}, {test_score=:.1%}")


Predicting deceptive reviews:

fold=1 train_score=100.0%, test_score=82.2%
fold=2 train_score=100.0%, test_score=85.9%
fold=3 train_score=100.0%, test_score=84.4%
fold=4 train_score=100.0%, test_score=83.1%
fold=5 train_score=100.0%, test_score=87.8%


In [233]:
from sklearn.feature_extraction.text import CountVectorizer
# Import the estimator this cell actually builds, so the cell does not
# depend on another cell having imported LogisticRegression first.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

# Count-vector features feeding a logistic-regression classifier, this time
# trained to predict the 'polarity' label rather than 'deceptive'.
classifier_pipeline = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))

print('Predicting review polarity:\n')

# Hold out each fold once. The loop rebinds the notebook-level names
# train_df, test_df, x_*/y_* on every pass, and classifier_pipeline is left
# fitted on the last fold's training rows.
for fold, train_df, test_df in kflod_splits(df):
    
    x_train, y_train = to_xy(train_df, label='polarity')
    x_test, y_test = to_xy(test_df, label='polarity')
    
    classifier_pipeline.fit(x_train, y_train)
    
    # Accuracy on the rows the model was fitted on.
    y_hat_train = classifier_pipeline.predict(x_train)
    train_score = accuracy_score(y_train, y_hat_train)
    
    # Accuracy on the held-out fold.
    y_hat_test = classifier_pipeline.predict(x_test)
    test_score = accuracy_score(y_test, y_hat_test)
    
    print(f"{fold=} {train_score=:.1%}, {test_score=:.1%}")


Predicting review polarity:

fold=1 train_score=100.0%, test_score=90.9%
fold=2 train_score=100.0%, test_score=94.4%
fold=3 train_score=99.9%, test_score=94.4%
fold=4 train_score=100.0%, test_score=92.8%
fold=5 train_score=100.0%, test_score=93.4%


In [249]:
# Pair every vocabulary token with its logistic-regression coefficient.
# NOTE(review): the column is labelled 'LogProba', but coef_ holds model
# weights, not log-probabilities; the label is kept as-is here.
# NOTE(review): this reads whichever fit classifier_pipeline last received.
# The execution counts suggest it ran right after the 'deceptive' cell
# (In [248]), not the 'polarity' cell placed directly above it — confirm
# which model these weights are meant to describe.
features_df = pd.DataFrame(
    {
        'Token': classifier_pipeline.named_steps['countvectorizer'].get_feature_names_out(),
        'LogProba': classifier_pipeline.named_steps['logisticregression'].coef_[0]
    }
)

# Largest positive weights first, most negative last.
features_df.sort_values('LogProba', ascending=False)

Unnamed: 0,Token,LogProba
1516,chicago,1.109011
8230,vacation,1.041924
4657,luxury,0.872378
4949,modern,0.787608
6215,relax,0.770007
...,...,...
1179,breakfast,-0.931939
4569,location,-0.974684
7365,street,-1.015468
3174,floor,-1.034595


In [246]:
classifier_pipeline.named_steps['logisticregression'].coef_[0].shape

(8656,)