In [2]:
import pandas as pd
import os, codecs, sys
from nltk.stem.porter import PorterStemmer

In [57]:
sent_class = "POS"
data_dir = f"data/reviews/"

data_pth = f"{data_dir}/{sent_class}/cv000_29590.tag"

# read in data from a single file (each file = a single review)
# help reading in data from: https://www.pythontutorial.net/python-basics/python-read-text-file/
with open(data_pth) as f:
    lines = f.readlines()
    lines = [l.strip() for l in lines] # remove trailing new line 
    
# convert a single review into (token, pos-tag format)
def get_single_review(fpth):
    # read in data from a single file (each file = a single review)
    # help reading in data from: https://www.pythontutorial.net/python-basics/python-read-text-file/
    with open(fpth) as f:
        full_review_data = f.readlines()
        full_review_data = [l.strip() for l in full_review_data] # remove trailing new line 
        
    # token separatred by "\t" from POS
    parsed_review_data = []
    for token_data in full_review_data: 
        if "\t" not in token_data: continue
        token, pos_tag = token_data.split("\t")
        parsed_review_data.append((token, pos_tag))
        
    return parsed_review_data

sentiment_classes = ["POS", "NEG"]

# maintain lists that we want info from 
train_info = []
test_info = []
cv_info = {}

for sent_class in sentiment_classes: 
    sent_dir = f"{data_dir}{sent_class}/" # "sent" = "sentiment"
    all_reviews = [rev for rev in os.listdir(sent_dir) if rev[-4:] == ".tag"]
    
    # process each review and put in associated train/test based on file number
    # (also determines fold)
    for review_file_name in all_reviews: 
        
        fold_num = int(review_file_name[3]) # all start w/ cv
        parsed_review_data = get_single_review(f"{sent_dir}{review_file_name}")
        review_metadata = [sent_class, parsed_review_data]
        
        if fold_num == 9: 
            test_info.append(review_metadata)
        else: 
            train_info.append(review_metadata)
            
        if fold_num not in cv_info: cv_info[fold_num] = [review_metadata]
        else: cv_info[fold_num].append(review_metadata)
        

In [55]:
len(train_info)

1800

In [37]:
all_reviews[0][3]

'1'

In [46]:
fpth = f"{sent_dir}{review_file_name}"

In [47]:
with open(fpth) as f:
    lines = f.readlines()
    lines = [l.strip() for l in lines] # remove trailing new line 

In [48]:
lines

['Though\tIN',
 'made\tVBN',
 'in\tIN',
 'Canada\tNNP',
 ',\t,',
 '``\t``',
 'Overdrawn\tJJ',
 'at\tIN',
 'the\tDT',
 'Memory\tNN',
 'Bank\tNNP',
 "''\t''",
 'traces\tNNS',
 'its\tPRP$',
 'roots\tNNS',
 'to\tTO',
 'the\tDT',
 'BBC\tNNP',
 'school\tNN',
 'of\tIN',
 'film\tNN',
 'production\tNN',
 '.\t.',
 '',
 'Using\tVBG',
 ',\t,',
 'for\tIN',
 'the\tDT',
 'most\tJJS',
 'part\tNN',
 ',\t,',
 'cheap\tJJ',
 'computer\tNN',
 'and\tCC',
 'video\tNN',
 'special\tJJ',
 'effects\tNNS',
 ',\t,',
 'chyron\tNN',
 'text\tNN',
 ',\t,',
 'wildlife\tNN',
 'documentary\tNN',
 'footage\tNN',
 ',\t,',
 'and\tCC',
 'sets\tVBZ',
 'that\tDT',
 'are\tVBP',
 'sometimes\tRB',
 'obviously\tRB',
 'faked\tVBN',
 'up\tRP',
 ',\t,',
 'it\tPRP',
 'nonetheless\tRB',
 'manages\tVBZ',
 'to\tTO',
 'tell\tVB',
 'a\tDT',
 'good\tJJ',
 'enough\tJJ',
 'science\tNN',
 'fiction\tNN',
 'story\tNN',
 'that\tWDT',
 'by\tIN',
 'the\tDT',
 'time\tNN',
 'you\tPRP',
 "'re\tVBP",
 'twenty\tCD',
 'minutes\tNNS',
 'into\tIN',
 'it\tP