# Opinion Spam - Data Preparation

## Download Dataset

In [9]:
import io
import requests
import zipfile

In [10]:
# ! rm -fr ../datasets/opinion_spam

In [18]:
dataset_url = "https://myleott.com/op_spam_v1.4.zip"
dataset_dir = "../datasets/opinion_spam"

file_content = requests.get(dataset_url).content
zipfile.ZipFile(io.BytesIO(file_content)).extractall(dataset_dir)

In [82]:
! ls -al $dataset_dir/op_spam_v1.4/positive_polarity/

total 0
drwxr-xr-x  4 gr33ndata  staff  128 Apr 14 14:46 [34m.[m[m
drwxr-xr-x  6 gr33ndata  staff  192 Apr 14 14:46 [34m..[m[m
drwxr-xr-x  7 gr33ndata  staff  224 Apr 14 14:46 [34mdeceptive_from_MTurk[m[m
drwxr-xr-x  7 gr33ndata  staff  224 Apr 14 14:46 [34mtruthful_from_TripAdvisor[m[m


# Extract Data and Metadata 

### Get a list of all text files

In [100]:
import os

In [83]:
def list_text_files(dataset_dir):
    selected_files = []
    for root, _, files in os.walk(dataset_dir):
        for file in files:
            if file.endswith('.txt'):
                selected_files.append(os.path.join(root, file))
    return selected_files

In [84]:
list_text_files(dataset_dir)[:3]

['../datasets/opinion_spam/op_spam_v1.4/positive_polarity/deceptive_from_MTurk/fold2/d_talbott_9.txt',
 '../datasets/opinion_spam/op_spam_v1.4/positive_polarity/deceptive_from_MTurk/fold2/d_talbott_8.txt',
 '../datasets/opinion_spam/op_spam_v1.4/positive_polarity/deceptive_from_MTurk/fold2/d_affinia_20.txt']

### Extract matadata from the folder structure 

In [85]:
def get_meta_data(filename):
    # Use `os.sep` instead of `/` to work for both Window and Unix/OSX
    _, polarity, deceptivene, fold, file_name = filename.rsplit(sep=os.sep, maxsplit=4)
    return {
        'fold': int(fold[-1]),
        'polarity': 1 if polarity.startswith("positive") else -1,
        'deceptive': 1 if deceptivene.startswith("deceptive") else 0,
    }

In [86]:
filename = list_text_files(dataset_dir)[0]

get_meta_data(filename)

{'fold': 2, 'polarity': 1, 'deceptive': 1}

### Read file content 

In [87]:
def get_content(filename):
    with open(filename, 'r') as f:
        text = f.read()
    return {'text': text}

In [88]:
filename = list_text_files(dataset_dir)[0]

get_content(filename)

{'text': 'excellent staff and customer service, very clean and spotless. elegant and luxurious with a beautiful ocean view. the bed is very comfortable and relaxing. i give it a five star.\n'}

### Combine Data and Metadata

In [92]:
data = [
    {
        **get_meta_data(filename), 
        **get_content(filename)
    }
    for filename in list_text_files(dataset_dir)
]

In [94]:
data[3]

{'fold': 2,
 'polarity': 1,
 'deceptive': 1,
 'text': 'I have to say that the Hard Rock Hotel in Chicago is a cool place to stay. When I first got there, getting checked in was very fast. The lady at the counter was nice and well spoken. The bellhop had out bags into the room very fast. The room was very hip and cool. There was musical items all over the place. I really loved the Beatles art that was in my bathroom. After being in the room a few minutes, I got a call from the front desk asking me if I liked my room. They also asked me if I would be interested in things like spa treatments and or if I needed to use their business room. I needed some things shipped while I was there, which they had supplies to help me with that. After getting some work done, I called the concierge if there was some really good pizza places around. They gave me information for 3 awesome places. I left and came back after having a night out on the town. They were very understanding that I was a wee bit out

## Data to Pandas DataFrame

In [97]:
import pandas as pd

In [98]:
df = pd.DataFrame(data)

In [99]:
df.sample(3)

Unnamed: 0,fold,polarity,deceptive,text
1115,3,-1,1,I will never stay here again. I was lured in b...
843,2,-1,1,While the Hard Rock Hotel in Chicago promises ...
208,4,1,1,The Sheraton Chicago Hotel and Towers is a mag...


In [101]:
df.to_csv(os.path.join(dataset_dir, "prepared_data.csv"))