### Data Preparation for AWS Comprehend

In [10]:
import pandas as pd

df = pd.read_csv("s3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/train.csv", names=["Label", "Title", "Review"])

In [11]:
neg_df = df[df['Label'] == 1]

In [13]:
neg_df.head()

Unnamed: 0,Label,Title,Review
6,1,Buyer beware,"This is a self-published book, and if you want..."
10,1,The Worst!,A complete waste of time. Typographical errors...
13,1,Oh please,I guess you have to be a romance novel lover f...
14,1,Awful beyond belief!,I feel I have to write to keep others from was...
15,1,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...


In [5]:
neg_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1800000 entries, 6 to 3599998
Data columns (total 3 columns):
Label     int64
Title     object
Review    object
dtypes: int64(1), object(2)
memory usage: 54.9+ MB


In [6]:
extract_df = neg_df.head(40000)['Review']

In [7]:
extract_df.count()

40000

In [8]:
extract_df.head(10)

6     This is a self-published book, and if you want...
10    A complete waste of time. Typographical errors...
13    I guess you have to be a romance novel lover f...
14    I feel I have to write to keep others from was...
15    It's glaringly obvious that all of the glowing...
19    sizes are much smaller than what is recomended...
20    This model may be ok for sedentary types, but ...
22    Rather than scratches and insect droppings, th...
25    I have had the charger for more than two years...
26    I bought one of these chargers..the instructio...
Name: Review, dtype: object

In [9]:
extract_df.to_csv('extract_df.csv', index=False)

  if __name__ == '__main__':


In [10]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
#print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'NLP_AWS/NLP_AWS' #Replace with the prefix under which you want to store the data if needed

sagemaker-us-east-1-023375022819


In [10]:
comprehend_channel = prefix + '/comprehend'

sess.upload_data(path='extract_df.csv', bucket=bucket, key_prefix=comprehend_channel)

's3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/extract_df.csv'

Use the above output as Input for Amazon Comprehend to generate topics.

### Initial Analysis of Topic Modeling from 20k Input and 40k Input

In [1]:
!aws s3 cp s3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/023375022819-TOPICS-f0b9b40086d3cc882f48800be187f3fb/output/output.tar.gz .

Completed 68.0 KiB/68.0 KiB (763.5 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/023375022819-TOPICS-f0b9b40086d3cc882f48800be187f3fb/output/output.tar.gz to ./output.tar.gz


In [2]:
!tar -xvzf output.tar.gz  

topic-terms.csv
doc-topics.csv


In [3]:
!cat topic-terms.csv

topic,term,weight
000,item,0.01320014
000,price,0.009271662
000,cheap,0.008077917
000,product,0.0145492125
000,return,0.00996442
000,buy,0.022574656
000,worth,0.0070977435
000,pay,0.005717317
000,wear,0.0050701024
000,shirt,0.0037684275
001,movie,0.0891871
001,watch,0.026982259
001,make,0.020209568
001,wrong,0.016609019
001,time,0.018557215
001,funny,0.0075101038
001,waste,0.011658727
001,plot,0.008867379
001,bore,0.011109356
001,minute,0.006217682
002,book,0.082376644
002,story,0.030742362
002,read,0.026288286
002,character,0.01778436
002,write,0.018424777
002,find,0.017625522
002,interest,0.014726332
002,plot,0.012445922
002,good,0.019640913
002,bore,0.013027136
003,film,0.04789476
003,bad,0.023904858
003,movie,0.023667585
003,act,0.011379912
003,star,0.010218274
003,script,0.008100315
003,make,0.013481576
003,watch,0.011116439
003,keanu,0.0064930655
003,performance,0.0065185176
004,work,0.061424877
004,product,0.039028663
004,great,0.015444

In [4]:
import pandas as pd
tenk_df = pd.read_csv("doc-topics.csv")
tenk_df.head()

Unnamed: 0,docname,topic,proportion
0,extract_df.csv:20,0,1.0
1,extract_df.csv:55,0,1.0
2,extract_df.csv:90,0,1.0
3,extract_df.csv:125,0,1.0
4,extract_df.csv:160,2,1.0


In [5]:
tenk_term_df = pd.read_csv("topic-terms.csv")
tenk_term_df.head()

Unnamed: 0,topic,term,weight
0,0,item,0.0132
1,0,price,0.009272
2,0,cheap,0.008078
3,0,product,0.014549
4,0,return,0.009964


### create topic term lookup table

In [6]:
tenk_topic_df = tenk_term_df.sort_values(['topic','weight'],ascending=False).groupby('topic').head(5).drop('weight', axis=1)


In [7]:
tenk_topic_join_df = tenk_topic_df.sort_values(by='topic')
tenk_topic_join_df.head()

Unnamed: 0,topic,term
1,0,price
5,0,buy
4,0,return
0,0,item
3,0,product


In [8]:

topic_join_name_df = tenk_topic_join_df.groupby('topic')['term'].apply(lambda x: '|'.join(x)).rename('topic_term').reset_index()

topic_join_name_df

Unnamed: 0,topic,topic_term
0,0,price|buy|return|item|product
1,1,time|make|watch|movie|wrong
2,2,write|good|read|story|book
3,3,film|bad|make|movie|act
4,4,stop|week|great|product|work
5,5,buy|work|battery|month|charge
6,6,poor|wrong|sound|quality|product
7,7,toy|play|money|love|fall
8,8,kindle|amazon|edition|original|version
9,9,time|act|watch|bad|movie


#### merge loopup table: topic_join_name_df with tenk_df

In [9]:
print(tenk_df.shape)
tenk_topic_merge_df = tenk_df.merge(topic_join_name_df,how='outer', left_on = ['topic'], right_on = ['topic'])
print(tenk_topic_merge_df.shape)
tenk_topic_merge_df.head(10)

(12882, 3)
(12882, 4)


Unnamed: 0,docname,topic,proportion,topic_term
0,extract_df.csv:20,0,1.0,price|buy|return|item|product
1,extract_df.csv:55,0,1.0,price|buy|return|item|product
2,extract_df.csv:90,0,1.0,price|buy|return|item|product
3,extract_df.csv:125,0,1.0,price|buy|return|item|product
4,extract_df.csv:230,0,1.0,price|buy|return|item|product
5,extract_df.csv:405,0,1.0,price|buy|return|item|product
6,extract_df.csv:510,0,0.366276,price|buy|return|item|product
7,extract_df.csv:615,0,0.415374,price|buy|return|item|product
8,extract_df.csv:650,0,1.0,price|buy|return|item|product
9,extract_df.csv:685,0,0.65592,price|buy|return|item|product


In [14]:
### merge tenk_topic_merge_df with extract_10k_df
extract_10k_df = neg_df.head(10000)
extract_10k_df.head()

Unnamed: 0,Label,Title,Review
6,1,Buyer beware,"This is a self-published book, and if you want..."
10,1,The Worst!,A complete waste of time. Typographical errors...
13,1,Oh please,I guess you have to be a romance novel lover f...
14,1,Awful beyond belief!,I feel I have to write to keep others from was...
15,1,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...


In [15]:
tenk_topic_merge_df['csv_index']=tenk_topic_merge_df['docname'].str.extract('(\d+)').astype(int).apply(lambda x: x-1)
tenk_topic_merge_df.head()

Unnamed: 0,docname,topic,proportion,topic_term,csv_index
0,extract_df.csv:20,0,1.0,price|buy|return|item|product,19
1,extract_df.csv:55,0,1.0,price|buy|return|item|product,54
2,extract_df.csv:90,0,1.0,price|buy|return|item|product,89
3,extract_df.csv:125,0,1.0,price|buy|return|item|product,124
4,extract_df.csv:230,0,1.0,price|buy|return|item|product,229


In [16]:
extract_10k_df.reset_index(inplace=True)

In [17]:
extract_10k_df.drop(['index'], axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [18]:


tenk_final_merge_df = extract_10k_df.merge(tenk_topic_merge_df,how='outer', left_on = extract_10k_df.index, right_on = ['csv_index'])
tenk_final_merge_df.head(200)

Unnamed: 0,Label,Title,Review,docname,topic,proportion,topic_term,csv_index
0,1.0,Buyer beware,"This is a self-published book, and if you want...",extract_df.csv:1,1.0,1.000000,time|make|watch|movie|wrong,0
1,1.0,The Worst!,A complete waste of time. Typographical errors...,extract_df.csv:2,2.0,1.000000,write|good|read|story|book,1
2,1.0,Oh please,I guess you have to be a romance novel lover f...,extract_df.csv:3,0.0,1.000000,price|buy|return|item|product,2
3,1.0,Awful beyond belief!,I feel I have to write to keep others from was...,extract_df.csv:4,0.0,1.000000,price|buy|return|item|product,3
4,1.0,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...,extract_df.csv:5,0.0,1.000000,price|buy|return|item|product,4
5,1.0,sizes recomended in the size chart are not real,sizes are much smaller than what is recomended...,extract_df.csv:6,0.0,1.000000,price|buy|return|item|product,5
6,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,0.0,0.611489,price|buy|return|item|product,6
7,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,3.0,0.388511,film|bad|make|movie|act,6
8,1.0,Another Abysmal Digital Copy,"Rather than scratches and insect droppings, th...",extract_df.csv:8,5.0,1.000000,buy|work|battery|month|charge,7
9,1.0,Problem with charging smaller AAAs,I have had the charger for more than two years...,extract_df.csv:9,5.0,1.000000,buy|work|battery|month|charge,8


In [19]:
tenk_final_df = tenk_final_merge_df[tenk_final_merge_df['proportion']==1.]


In [20]:
tenk_final_df.shape

(8678, 8)

In [21]:
tenk_final_df.to_csv('tenk_final_df.csv', index=False)

In [21]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
#print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'NLP_AWS/NLP_AWS' #Replace with the prefix under which you want to store the data if needed

sagemaker-us-east-1-023375022819


In [25]:
comprehend_channel = prefix + '/comprehend'

sess.upload_data(path='tenk_final_df.csv', bucket=bucket, key_prefix=comprehend_channel)

's3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/tenk_final_df.csv'

's3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/tenk_final_df.csv'

#### Load 40k comprehend data

In [1]:
!aws s3 cp s3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/023375022819-TOPICS-872ef6b19d4829f30b057247a939ca48/output/output.tar.gz .


Completed 256.0 KiB/297.6 KiB (1.7 MiB/s) with 1 file(s) remainingCompleted 297.6 KiB/297.6 KiB (2.0 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/023375022819-TOPICS-872ef6b19d4829f30b057247a939ca48/output/output.tar.gz to ./output.tar.gz


In [2]:
!tar -xvzf output.tar.gz 

topic-terms.csv
doc-topics.csv


In [3]:
!cat topic-terms.csv

topic,term,weight
000,movie,0.09402398
000,watch,0.02498362
000,wrong,0.014464636
000,funny,0.007826147
000,make,0.013933469
000,bore,0.010897398
000,good,0.014267884
000,act,0.0072559235
000,part,0.0057911
000,star,0.007622383
001,book,0.06389944
001,story,0.029499054
001,read,0.033440992
001,character,0.01939331
001,write,0.016529718
001,interest,0.010646802
001,plot,0.00959722
001,author,0.007844011
001,line,0.0065978603
001,series,0.006087168
002,break,0.0154708335
002,year,0.01267386
002,month,0.0118638575
002,set,0.009456752
002,disc,0.009191847
002,plastic,0.007345996
002,toy,0.0067676883
002,buy,0.020888986
002,piece,0.00568495
002,fall,0.0053911866
003,product,0.03875014
003,buy,0.037172183
003,work,0.020472776
003,break,0.010205354
003,cheap,0.009635811
003,quality,0.010484237
003,price,0.008577088
003,money,0.014248242
003,purchase,0.009578801
003,item,0.0096434755
004,quality,0.03067479
004,poor,0.017959643
004,sound,0.018441437
0

In [4]:
import pandas as pd
foutyk_df = pd.read_csv("doc-topics.csv")
foutyk_df.head()

Unnamed: 0,docname,topic,proportion
0,extract_df.csv:9,17,1.0
1,extract_df.csv:44,1,1.0
2,extract_df.csv:79,1,1.0
3,extract_df.csv:114,1,1.0
4,extract_df.csv:149,1,0.779731


In [5]:
foutyk_term_df = pd.read_csv("topic-terms.csv")
foutyk_term_df.head()

Unnamed: 0,topic,term,weight
0,0,movie,0.094024
1,0,watch,0.024984
2,0,wrong,0.014465
3,0,funny,0.007826
4,0,make,0.013933


In [6]:
foutyk_topic_df = foutyk_term_df.sort_values(['topic','weight'],ascending=False).groupby('topic').head(5).drop('weight', axis=1)

In [7]:
foutyk_topic_join_df = foutyk_topic_df.sort_values(by='topic')
foutyk_topic_join_df.head()

Unnamed: 0,topic,term
4,0,make
0,0,movie
6,0,good
2,0,wrong
1,0,watch


In [8]:

fouty_topic_join_name_df = foutyk_topic_join_df.groupby('topic')['term'].apply(lambda x: '|'.join(x)).rename('topic_term').reset_index()

fouty_topic_join_name_df

Unnamed: 0,topic,topic_term
0,0,make|movie|good|wrong|watch
1,1,character|story|read|book|write
2,2,set|month|year|break|buy
3,3,product|buy|money|work|quality
4,4,good|poor|sound|dvd|quality
5,5,star|book|give|review|movie
6,6,edition|dvd|original|buy|version
7,7,bad|film|watch|act|make
8,8,wrong|act|good|bad|movie
9,9,stop|month|great|product|work


In [9]:
print(foutyk_df.shape)
foutyk_topic_merge_df = foutyk_df.merge(fouty_topic_join_name_df,how='outer', left_on = ['topic'], right_on = ['topic'])
print(foutyk_topic_merge_df.shape)
foutyk_topic_merge_df.head(10)

(54446, 3)
(54446, 4)


Unnamed: 0,docname,topic,proportion,topic_term
0,extract_df.csv:9,17,1.0,charge|battery|work|phone|buy
1,extract_df.csv:639,17,1.0,charge|battery|work|phone|buy
2,extract_df.csv:1479,17,0.167973,charge|battery|work|phone|buy
3,extract_df.csv:2634,17,0.253236,charge|battery|work|phone|buy
4,extract_df.csv:2914,17,1.0,charge|battery|work|phone|buy
5,extract_df.csv:3614,17,1.0,charge|battery|work|phone|buy
6,extract_df.csv:5889,17,0.569225,charge|battery|work|phone|buy
7,extract_df.csv:7814,17,0.132194,charge|battery|work|phone|buy
8,extract_df.csv:9704,17,1.0,charge|battery|work|phone|buy
9,extract_df.csv:15199,17,0.517387,charge|battery|work|phone|buy


In [35]:
### merge tenk_topic_merge_df with extract_10k_df
extract_40k_df = neg_df.head(40000)
extract_40k_df.head()

Unnamed: 0,Label,Title,Review
6,1,Buyer beware,"This is a self-published book, and if you want..."
10,1,The Worst!,A complete waste of time. Typographical errors...
13,1,Oh please,I guess you have to be a romance novel lover f...
14,1,Awful beyond belief!,I feel I have to write to keep others from was...
15,1,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...


Unnamed: 0,Label,Title,Review
6,1,Buyer beware,"This is a self-published book, and if you want..."
10,1,The Worst!,A complete waste of time. Typographical errors...
13,1,Oh please,I guess you have to be a romance novel lover f...
14,1,Awful beyond belief!,I feel I have to write to keep others from was...
15,1,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...


In [36]:
foutyk_topic_merge_df['csv_index']=foutyk_topic_merge_df['docname'].str.extract('(\d+)').astype(int).apply(lambda x: x-1)
foutyk_topic_merge_df.head()

Unnamed: 0,docname,topic,proportion,topic_term,csv_index
0,extract_df.csv:9,17,1.0,charge|battery|work|phone|buy,8
1,extract_df.csv:639,17,1.0,charge|battery|work|phone|buy,638
2,extract_df.csv:1479,17,0.167973,charge|battery|work|phone|buy,1478
3,extract_df.csv:2634,17,0.253236,charge|battery|work|phone|buy,2633
4,extract_df.csv:2914,17,1.0,charge|battery|work|phone|buy,2913


Unnamed: 0,docname,topic,proportion,topic_term,csv_index
0,extract_df.csv:9,17,1.0,charge|battery|work|phone|buy,8
1,extract_df.csv:639,17,1.0,charge|battery|work|phone|buy,638
2,extract_df.csv:1479,17,0.167973,charge|battery|work|phone|buy,1478
3,extract_df.csv:2634,17,0.253236,charge|battery|work|phone|buy,2633
4,extract_df.csv:2914,17,1.0,charge|battery|work|phone|buy,2913


In [37]:
extract_40k_df.reset_index(inplace=True)

In [38]:
extract_40k_df.drop(['index'], axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [39]:


foutyk_final_merge_df = extract_40k_df.merge(foutyk_topic_merge_df,how='outer', left_on = extract_40k_df.index, right_on = ['csv_index'])
foutyk_final_merge_df.head(200)

Unnamed: 0,Label,Title,Review,docname,topic,proportion,topic_term,csv_index
0,1.0,Buyer beware,"This is a self-published book, and if you want...",extract_df.csv:1,1.0,1.000000,character|story|read|book|write,0
1,1.0,The Worst!,A complete waste of time. Typographical errors...,extract_df.csv:2,6.0,1.000000,edition|dvd|original|buy|version,1
2,1.0,Oh please,I guess you have to be a romance novel lover f...,extract_df.csv:3,5.0,1.000000,star|book|give|review|movie,2
3,1.0,Awful beyond belief!,I feel I have to write to keep others from was...,extract_df.csv:4,5.0,1.000000,star|book|give|review|movie,3
4,1.0,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...,extract_df.csv:5,18.0,1.000000,return|order|fit|small|size,4
5,1.0,sizes recomended in the size chart are not real,sizes are much smaller than what is recomended...,extract_df.csv:6,8.0,1.000000,wrong|act|good|bad|movie,5
6,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,13.0,0.138473,disscusting|line.ii|segment|waist|waste,6
7,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,0.0,0.040126,make|movie|good|wrong|watch,6
8,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,7.0,0.424071,bad|film|watch|act|make,6
9,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,2.0,0.059160,set|month|year|break|buy,6


Unnamed: 0,Label,Title,Review,docname,topic,proportion,topic_term,csv_index
0,1.0,Buyer beware,"This is a self-published book, and if you want...",extract_df.csv:1,1.0,1.000000,character|story|read|book|write,0
1,1.0,The Worst!,A complete waste of time. Typographical errors...,extract_df.csv:2,6.0,1.000000,edition|dvd|original|buy|version,1
2,1.0,Oh please,I guess you have to be a romance novel lover f...,extract_df.csv:3,5.0,1.000000,star|book|give|review|movie,2
3,1.0,Awful beyond belief!,I feel I have to write to keep others from was...,extract_df.csv:4,5.0,1.000000,star|book|give|review|movie,3
4,1.0,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...,extract_df.csv:5,18.0,1.000000,return|order|fit|small|size,4
5,1.0,sizes recomended in the size chart are not real,sizes are much smaller than what is recomended...,extract_df.csv:6,8.0,1.000000,wrong|act|good|bad|movie,5
6,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,13.0,0.138473,disscusting|line.ii|segment|waist|waste,6
7,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,0.0,0.040126,make|movie|good|wrong|watch,6
8,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,7.0,0.424071,bad|film|watch|act|make,6
9,1.0,mens ultrasheer,"This model may be ok for sedentary types, but ...",extract_df.csv:7,2.0,0.059160,set|month|year|break|buy,6


In [40]:
foutyk_final_df = foutyk_final_merge_df[foutyk_final_merge_df['proportion']==1.]

In [41]:
foutyk_final_df.shape

(33942, 8)

(33942, 8)

In [42]:
foutyk_final_df.to_csv('foutyk_final_df.csv', index=False)

In [43]:
comprehend_channel = prefix + '/comprehend'

sess.upload_data(path='foutyk_final_df.csv', bucket=bucket, key_prefix=comprehend_channel)

's3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/foutyk_final_df.csv'

's3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/comprehend/foutyk_final_df.csv'