# Goal: 
- Extract Reviews and Category Content for Each Product Id

## load Modules, Read Raw Reviews, Check Count of Reviews

In [1]:
import pandas as pd
import re
import numpy as np
from itertools import islice

raw_reviews = open("amazon-meta.txt",encoding="utf8")
raw_reviews = raw_reviews.readlines()
print("Number of lines", len(raw_reviews))

Number of lines 15010574


## Extract IDS

In [2]:
class extract:
    def __init__(self, N = None):
        if N == None:
            self.content = open("amazon-meta.txt",encoding="utf8")
        else: 
            self.content = open("amazon-meta.txt",encoding="utf8")
            self.content = list(islice(self.content, N))
       
        self.indexed_content= enumerate(self.content, 1)

    def find(self,search_by, columns = None):
        self.line_content = []
        self.line_location = []
        
        for num, line in self.indexed_content:
            if re.match(search_by, np.str.strip(line)) is None:
                pass
            else:
                self.line_location.append(num)
                self.line_content.append(line)
                
        if columns == None:
            self.extracted_content = [np.str.split(np.str.strip(i)) for i in self.line_content] # Keep these as strings so can look into during debug
        else:
            self.extracted_content = [[np.str.split(np.str.strip(i))[column] for column in columns] for i in self.line_content] # Keep these as strings so can look into
             

In [3]:
id_extraction = extract()
id_extraction.find("^Id:")

review_extraction = extract()
review_extraction.find("^reviews:")

category_extraction = extract()
category_extraction.find("^categories:")

df_id_locations = pd.DataFrame({"start":id_extraction.line_location,"ID":[int(i[1]) for i in id_extraction.extracted_content]})
df_id_locations['end'] = df_id_locations.start.shift(-1)

In [4]:
extract_group = extract()
extract_group.find('group:')

extract_rank = extract()
extract_rank.find('salesrank:')

extract_sim = extract()
extract_sim.find('similar:')


## Build Index of Content Location

### 1. EXTRACT CHUNKS

In [5]:
file = extract()
file = [i for i in file.content]

In [6]:
[file[i] for i in np.arange(1,100)]

['Total items: 548552\n',
 '\n',
 'Id:   0\n',
 'ASIN: 0771044445\n',
 '  discontinued product\n',
 '\n',
 'Id:   1\n',
 'ASIN: 0827229534\n',
 '  title: Patterns of Preaching: A Sermon Sampler\n',
 '  group: Book\n',
 '  salesrank: 396585\n',
 '  similar: 5  0804215715  156101074X  0687023955  0687074231  082721619X\n',
 '  categories: 2\n',
 '   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Preaching[12368]\n',
 '   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Sermons[12370]\n',
 '  reviews: total: 2  downloaded: 2  avg rating: 5\n',
 '    2000-7-28  cutomer: A2JW67OY8U6HHK  rating: 5  votes:  10  helpful:   9\n',
 '    2003-12-14  cutomer: A2VE83MZF98ITY  rating: 5  votes:   6  helpful:   5\n',
 '\n',
 'Id:   2\n',
 'ASIN: 0738700797\n',
 '  title: Candlemas: Feast of Flames\n',
 '  group: Book\n',
 '  salesrank: 168596\n',
 '  similar: 5  0738700827  1567184960  1567182836  0738700525  0738

In [7]:
rows = df_id_locations.apply(tuple, axis=1)
chunks = []
for i in rows:
    c,a,b = i
    chunks.append([c]+[file[int(i)] for i in np.arange(a-1,b-2)])

### 2. Build Range

In [8]:
ids = []
expandedrange = []
for i in df_id_locations.iterrows():    
    prod_id = i[1][0]
    row_range = np.arange(i[1][1], i[1][2])
    length = len(row_range)
    ids += [prod_id]*length
    expandedrange += list(row_range)
df_id_row_lookup = pd.DataFrame(list(zip(ids,expandedrange)), columns = ['ID', 'Rows'])

### 3. IDENTIFY REVIEW / CATEGORY MATCH in RANGES

In [9]:
df_id_row_lookup['review_start'] = df_id_row_lookup.Rows.isin(review_extraction.line_location) + 0
df_id_row_lookup['category_start'] = df_id_row_lookup.Rows.isin(category_extraction.line_location) + 0 

In [10]:
df_id_row_lookup['group_start'] = df_id_row_lookup.Rows.isin(extract_group.line_location) + 0 
df_id_row_lookup['rank_start'] = df_id_row_lookup.Rows.isin(extract_rank.line_location) + 0 
df_id_row_lookup['sim_start'] = df_id_row_lookup.Rows.isin(extract_sim.line_location) + 0 




### 4.  JOIN RANGE TO REVIEW / CAT LOCATION. BUILD DF AND JOIN TO Original

In [11]:
detail_index = df_id_row_lookup
detail_index = pd.melt(detail_index, id_vars= ['ID','Rows'])
detail_index = detail_index.query("value != 0")
detail_index = pd.pivot_table(index = ["ID"], columns = 'variable', values='Rows', data = detail_index)
df_id_locations = df_id_locations.merge(detail_index, left_on= "ID", right_index= True, how = 'right')
df_chunk_lengths = pd.DataFrame([[i[0], len(i)] for i in chunks], columns = ["ID", "CHUNK_LEN"])
df_id_locations = df_id_locations.merge(df_chunk_lengths, on = "ID", how = 'left')

### 5. Create and write id tables for rank, group, and similiarity 

In [12]:
df_id_locations.head()

Unnamed: 0,ID,start,end,category_start,group_start,rank_start,review_start,sim_start,CHUNK_LEN
0,1,8,21.0,14.0,11.0,12.0,17.0,13.0,13
1,2,21,44.0,27.0,24.0,25.0,30.0,26.0,23
2,3,44,55.0,50.0,47.0,48.0,52.0,49.0,11
3,4,55,70.0,61.0,58.0,59.0,67.0,60.0,15
4,5,70,81.0,76.0,73.0,74.0,79.0,75.0,11


In [13]:
group_class = [np.str.split(np.str.strip(i))[1] for i in extract_group.line_content]
df_groups = pd.DataFrame(list(zip(extract_group.line_location,group_class)), columns = ['group_start','group'])
df_groups = df_groups.merge(df_id_locations[['ID','group_start']], how = 'inner')

rank_value = [np.str.split(np.str.strip(i))[1] for i in extract_rank.line_content]
df_rank = pd.DataFrame(list(zip(extract_rank.line_location,rank_value)), columns = ['rank_start','rank'])
df_rank = df_rank.merge(df_id_locations[['ID','rank_start']], how = 'inner')

df_rank_and_group = df_rank.merge(df_groups, on = 'ID').drop(['group_start','rank_start'],1)
df_rank_and_group.to_csv("rank_and_group.csv", index = False)

In [14]:
df_rank_and_group.head(3)

Unnamed: 0,rank,ID,group
0,396585,1,Book
1,168596,2,Book
2,1270652,3,Book


In [15]:
sim_items = [np.str.split(np.str.strip(i)) for i in extract_sim.line_content]
df_similar = pd.DataFrame(sim_items)
df_similar['sim_start'] = extract_sim.line_location
df_similar = pd.melt(id_vars = 'sim_start',frame= df_similar.iloc[:,2:])
df_similar = df_similar.merge(df_id_locations[['ID','sim_start']], how = 'inner').dropna()
df_similar.to_csv('similar.csv',index = False)
df_similar.head(3)

Unnamed: 0,sim_start,variable,value,ID
0,13,2,0804215715,1
1,13,3,156101074X,1
2,13,4,0687023955,1


# Build Content Extractor

In [16]:
def extract_detail_rows(columns, dont_split = True):
    within_lines = df_id_locations[columns].apply(tuple, axis=1)
    detail_collector = []
    for row in within_lines:
        prod_id,a,b = row

        if (b == a):
            review_detail_collector.append([[int(prod_id)] + [np.nan]*5])

        prod_id = int(prod_id)
        search_space = chunks[prod_id]
        query_range = np.arange(a,b-2)
        
        if dont_split == True:
            detail_collector += [[prod_id, file[int(line)]] for line in query_range]
        else: 
            detail_collector += [[prod_id] + np.str.split(file[int(line)]) for line in query_range]
    return detail_collector


## Extract / Create Reviews FRAME

In [17]:
reviews_frame = extract_detail_rows(columns =  ['ID','review_start', 'end'], dont_split= False)
reviews_frame = pd.DataFrame(reviews_frame)
reviews_frame = reviews_frame.drop([2,4,6,8], axis = 1)
reviews_frame.columns = ['PROD_ID','REVIEW_DATE','CUSTOMER_ID','RATING','VOTES','HELPFUL']

## Extract / Create CATEGORY FRAME

In [19]:
categories = extract_detail_rows(columns =  ['ID','category_start', 'review_start'], dont_split= True)

### Parse / Organize categories

In [20]:
counter = 0
all_categories = []
for case in categories:
    prod_id = case[0]
    counter += 1
    category_content = case[1]
    category_content = np.str.strip(category_content)
    category_content = re.sub(r"^[|]|[]]", "", category_content)
    category_content = category_content.split("|")
    sub_count = 0 
    for sub_case in category_content:
        sub_count += 1
        all_categories += [[prod_id] + [counter] + [sub_count] +  sub_case.split("[")]   

In [21]:
category_frame = pd.DataFrame(all_categories, columns = ['PROD_ID', 'TBL_ID','CATEGORY_ORDER','DESC','CAT_CODE','X'])
category_frame1 = category_frame[category_frame.X.isnull()].drop("X",1)
category_frame2 = category_frame[~category_frame.X.isnull()]
category_frame2['DESC'] = category_frame2.DESC.str.strip() + " (" + category_frame2.CAT_CODE.str.strip()+")"
category_frame2['CAT_CODE'] = category_frame2.X
category_frame2 = category_frame2.drop("X",1)
category_frame = category_frame1.append(category_frame2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## WRITE TO CSV

In [22]:
category_frame.to_csv("categories.csv")
reviews_frame.to_csv("reviews.csv")