# Download Import libraries

In [84]:
!pip install benepar
!pip install gingerit
!pip install Unidecode
!pip install word2number
!pip install contractions
!pip install beautifulsoup4



In [1]:
import json
import pandas as pd
import re
import string
import random

In [2]:
import spacy
import en_core_web_sm
import benepar
import numpy as np
import math
from gingerit.gingerit import GingerIt

In [3]:
import unidecode
from word2number import w2n
import contractions
from bs4 import BeautifulSoup

In [222]:
# create directory for result
from pathlib import Path
Path("../result/").mkdir(parents=True, exist_ok=True)

# Utility function

In [134]:
# utility function

# split into sentences, list of all reviews
def split_into_sentence(all_review, path):
    # use spacy to split sentences
    nlp = spacy.load('en_core_web_sm')
    
    all_sentences = []
    count = 0
    num_sentences = 0
    
    for t in all_review:
        if count%10 == 0:
            print(count)
        count += 1

        doc = nlp(t)
        sentences = list(doc.sents)
        sentences = [i.text for i in sentences]
        all_sentences.extend(sentences)

    my_df = pd.DataFrame(all_sentences)
    my_df.to_csv(path, index=False, header=False)
    return my_df


# consituency parsing into csv
def split_into_sentence_parse_sf_csv(all_review, path):
    # use spacy to split sentences
    nlp = spacy.load('en_core_web_sm')
    
    parse_dict = {}
    all_sentences = []
    count = 0
    num_sentences = 0
    
    for t in all_review:
        if count%10 == 0:
            print(count)
        count += 1

        doc = nlp(t)
        sentences = list(doc.sents)
        sentences = [i.text for i in sentences]
        
        for s in sentences:
            num_sentences += 1
            adjp_list = parse_using_standford(tuple([s, ""]))
            parse_dict[s] = ', '.join(adjp_list)

    print('num sentences {}'.format(num_sentences))
    my_df = pd.DataFrame([parse_dict]).transpose()
    my_df.to_csv(path)

# Preprocessing functions

In [27]:
def remove_accented_chars(text):
    """remove accented characters from text, e.g. café"""
    text = unidecode.unidecode(text)
    return text

def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = contractions.fix(text)
    return text

def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text

def remove_non_word_char(text):
    text = re.sub('[\W]+', '', text.lower())
    return text

def remove_extra_spaces(text):
    text = re.sub(r' +', ' ', text)
    return text

def remove_emo(text):
    emoticons = find_emo(text)
    for e in emoticons:
        text = text.replace(e, '')
    return text

def find_emo(text):
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    return emoticons

# Extraction functions

In [42]:
from  nltk.parse.corenlp import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')

grammar_spell_checker = GingerIt()

# methods to do consistuency parsing with stanford core
def connect():
    parser = CoreNLPParser(url='http://localhost:9000')
    print('reconnecting to standford core server ... ')
    return parser

# preprocessing done before splitting into respective sentences
def preprocessing(text):
    # noise reduction
    
    # remove html text
    text = strip_html_tags(text)
    
    # remove emoji
    text = remove_emo(text)
    
    text = remove_accented_chars(text)
    
    text = expand_contractions(text)

    # remove white spaces
    text = remove_extra_spaces(text)
    
    # lower case
    text = text.lower()
    
    return text
    

def check_grammar_spelling(s):
    new_sentence_list = []
    try:
        r = grammar_spell_checker.parse(s)
    except KeyError:
        pass
        print('pass exception')
        return s
    return r['result']

def parse_using_standford(sentence_tuple, shallow=False):
    result_list = []
    sentence_tuple = tuple([preprocessing(i) for i in sentence_tuple])
    sentences = parser.raw_parse_sents(sentence_tuple)
    for line in sentences:
        for sentence in line:
            # change this shallow
            if shallow:
                have_adjp, adjp_list = traverse_tree_sf_shallow(sentence)
            else:
                have_adjp, adjp_list = traverse_tree_sf(sentence)
            result_list.extend(adjp_list)
    return result_list

def traverse_tree_sf_shallow(t):
    try:
        t.label()
    except AttributeError:
        return
    adjp_list = []
    subtree_have_adjp = False
    if t.label().find("ADJP") != -1:
        return True, [" ".join(t.leaves())]
    for subtree in t:
        have_adjp = False
        if subtree.height() != 2:
            have_adjp, adjp_list_subtree = traverse_tree_sf_shallow(subtree)
            
        if (have_adjp):
            subtree_have_adjp = True
            adjp_list.extend(adjp_list_subtree)

    return subtree_have_adjp, adjp_list

def traverse_tree_sf(t):
    try:
        t.label()
    except AttributeError:
        return
    adjp_list = []
    subtree_have_adjp = False
    
    for subtree in t:
        have_adjp = False
        if subtree.height() != 2:
            have_adjp, adjp_list_subtree = traverse_tree_sf(subtree)
            
        if (have_adjp):
            subtree_have_adjp = True
            adjp_list.extend(adjp_list_subtree)

    if t.label().find("ADJP") != -1:
        if (subtree_have_adjp):
            return True, adjp_list
        else:
            return True, [" ".join(t.leaves())]
    else:
        return subtree_have_adjp, adjp_list

# process df
def process_text_split_sentence_sf(df, spelling_grammar_checker=False, shallow=False):
    # use spacy to split sentences
    nlp = spacy.load('en_core_web_sm')
    text = df['text'].astype('str')
    count = 0
    num_sentences = 0
    print('Number of text {}'.format(len(text)))
    adjp_dict={}
    batch = 500
    for i in range(batch, len(text)+1, batch):
        print('batch {} from {}'.format(i/batch, i))
        for r in range(batch):
            print('text id {}'.format(i-batch+r))
            if spelling_grammar_checker:
                cur = check_grammar_spelling(text[i-batch+r])
            else:
                cur = text[i-batch+r]
            doc = nlp(cur)
            generator = list(doc.sents)
            sentence_tuple = [x.text for x in doc.sents]
            sentence_tuple = tuple(sentence_tuple)
            adjp_list = parse_using_standford(sentence_tuple, shallow)
            for x in range(len(adjp_list)):
                key = adjp_list[x].lower()  # change everything to lower
                if key in adjp_dict:
                    adjp_dict[key] += 1
                else:
                    adjp_dict[key] = 1
        
        # save into json
        with open('../result/run9/adjp_sf_batch_{}.json'.format(i), 'w') as file:
            file.write(json.dumps(adjp_dict)) # use `json.loads` to do the reverse
            
        # connect to server again
        parser = connect()
    return adjp_dict

# processs list of strings
def process_plain_text(all_review, shallow=False):
    parser = connect()
    # use spacy to split sentences
    nlp = spacy.load('en_core_web_sm')

    count = 0
    num_sentences = 0
    print('Number of text {}'.format(len(all_review)))
    adjp_dict={}
    for i in range(len(all_review)):
        print('text id {}'.format(i))
        doc = nlp(all_review[i])
        generator = list(doc.sents)
        sentence_tuple = [x.text for x in doc.sents]
        sentence_tuple = tuple(sentence_tuple)
        
        adjp_list = parse_using_standford(sentence_tuple, shallow)
        for x in range(len(adjp_list)):
            key = adjp_list[x].lower()  # change everything to lower
            if key in adjp_dict:
                adjp_dict[key] += 1
            else:
                adjp_dict[key] = 1
        
    return adjp_dict

# Functions for postprocessing

In [158]:
# post processing of data
def remove_all_punctuation(data_dict):
    new_dict = {}
    count = 0
    for k, v in data_dict.items():
        new_key = k.translate(str.maketrans('', '', string.punctuation))
        if new_key in new_dict:
            count += 1
            new_dict[new_key] += 1
        else:
            new_dict[new_key] = v
    print('Number of changes: {}'.format(count))
    return new_dict

def remove_adj_dict(data_dict):
    remove_list = []
    for k in data_dict:
        if len(k.split()) == 1:
            remove_list.append(k)
    
    for i in remove_list:
        del data_dict[i]
    return data_dict

def remove_extra_spaces_dict(data_dict):
    new_dict = {}
    for k,v in data_dict.items():
        new_dict[remove_extra_spaces(k).rstrip()] = v
    return new_dict

# descending
def sort_dict(data_dict):
    return {k: v for k, v in sorted(data_dict.items(), key=lambda item: item[1], reverse=True)}

def remove_adj_list(data_list):
    count = 0
    new_data_list = []
    for k in data_list:
        if len(k.split()) > 1:
            new_data_list.append(k)
        else:
            count += 1
    print('Number of adj removed: {}'.format(count))
    return new_data_list

def remove_extra_spaces_list(data_list):
    new_list = []
    for k in data_list:
        new_list.append(remove_extra_spaces(k).rstrip())
    return new_list

# post processing of data
def remove_all_punctuation_list(data_list):
    new_data_list = []
    count = 0
    for k in data_list:
        new_key = k.translate(str.maketrans('', '', string.punctuation))
        if new_key not in new_data_list:
            new_data_list.append(new_key)
            count += 1
    print('Number of changes: {}'.format(count))
    return new_data_list

def postprocessing_dict(data_dict):
    print('dict size before processing: {}'.format(len(data_dict)))
    data_dict = remove_all_punctuation(data_dict)
    data_dict = remove_extra_spaces_dict(data_dict)
    data_dict = remove_adj_dict(data_dict)
    print('dict size aftter processing: {}'.format(len(data_dict)))
    return sort_dict(data_dict)

def postprocessing_list(data_list):
    print('list size before processing: {}'.format(len(data_list)))
    data_list = remove_all_punctuation_list(data_list)
    data_list = remove_extra_spaces_list(data_list)
    data_list = remove_adj_list(data_list)
    print('list size aftter processing: {}'.format(len(data_list)))
    return data_list

# Function for getting biz & get indicative adjp

In [8]:
def get_random_business(df):
    unique_biz = df['business_id'].unique()
    print('unique biz {}'.format(len(unique_biz)))
    random_biz = random.choice(unique_biz)
    print('random biz choosen: {}'.format(random_biz))
    return random_biz

def get_all_reviews_from_biz(biz):
    rows = df.loc[df['business_id'] == biz]
    text = rows['text'].astype('str')
    text_list = rows['text'].tolist()
    print('Number of reviews: {}'.format(len(rows)))
    return tuple(text_list)

def get_indicative_adjp(common_adjp, biz_adjp_list):
    indicative_adjp = []
    for adjp in biz_adjp_list:
        if adjp not in common_adjp:
            indicative_adjp.append(adjp)
    return indicative_adjp

In [117]:
# frequent 5%
def get_count_for_top_n_percent(data_dict, percent):
    # try stats
    data_df = pd.DataFrame(data_dict, index=['count'])
    data_df = data_df.transpose()
    stats = data_df.describe()
    
    sorted_dict = sort_dict(data_dict)
    num_5_percent = round(int(stats.iloc[0])*(percent/100))
    print(percent/100)
    print('Number of adjp that are top 5% {}'.format(num_5_percent))
    count_for_5_percent = data_dict[list(sorted_dict)[num_5_percent]]
    print('Count for top 5% {}'.format(count_for_5_percent))
    return count_for_5_percent

# diff between 2 list
def Diff(li1, li2):
    return list(set(li1) - set(li2)), list(set(li2) - set(li1))

In [10]:
def list_to_csv(a_list, path):
    my_df = pd.DataFrame(a_list)
    my_df.to_csv(path, index=False, header=False)

# Load data

In [11]:
df = pd.read_json('../data/reviewSelected100.json', lines=True, encoding = "ISO-8859-1")
text = df['text'].astype('str')

# Run stanford parsing

## Deep extraction

In [43]:
%%time
result_dict_deep = process_text_split_sentence_sf(df, False)

Number of text 15300
batch 1.0 from 500
text id 0
text id 1
text id 2
text id 3
text id 4
text id 5
text id 6
text id 7
text id 8
text id 9
text id 10
text id 11
text id 12
text id 13
text id 14
text id 15
text id 16
text id 17
text id 18
text id 19
text id 20
text id 21
text id 22
text id 23
text id 24
text id 25
text id 26
text id 27
text id 28
text id 29
text id 30
text id 31
text id 32
text id 33
text id 34
text id 35
text id 36
text id 37
text id 38
text id 39
text id 40
text id 41
text id 42
text id 43
text id 44
text id 45
text id 46
text id 47
text id 48
text id 49
text id 50
text id 51
text id 52
text id 53
text id 54
text id 55
text id 56
text id 57
text id 58
text id 59
text id 60
text id 61
text id 62
text id 63




text id 64
text id 65
text id 66
text id 67
text id 68
text id 69
text id 70
text id 71
text id 72
text id 73
text id 74
text id 75
text id 76
text id 77
text id 78
text id 79
text id 80
text id 81
text id 82
text id 83
text id 84
text id 85
text id 86
text id 87
text id 88
text id 89
text id 90
text id 91
text id 92
text id 93
text id 94
text id 95
text id 96
text id 97
text id 98
text id 99
text id 100
text id 101
text id 102
text id 103
text id 104
text id 105
text id 106
text id 107
text id 108
text id 109
text id 110
text id 111
text id 112
text id 113
text id 114
text id 115
text id 116
text id 117
text id 118
text id 119
text id 120
text id 121
text id 122
text id 123
text id 124
text id 125
text id 126
text id 127
text id 128
text id 129
text id 130
text id 131
text id 132
text id 133
text id 134
text id 135
text id 136
text id 137
text id 138
text id 139
text id 140
text id 141
text id 142
text id 143
text id 144
text id 145
text id 146
text id 147
text id 148
text id 149
text


http://www.doctoroz.com/article/hard-look-lasik-surgery" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


text id 521
text id 522
text id 523
text id 524
text id 525
text id 526
text id 527
text id 528
text id 529
text id 530
text id 531
text id 532
text id 533
text id 534
text id 535
text id 536
text id 537
text id 538
text id 539
text id 540
text id 541
text id 542
text id 543
text id 544
text id 545
text id 546
text id 547
text id 548
text id 549
text id 550
text id 551
text id 552
text id 553
text id 554
text id 555
text id 556
text id 557
text id 558
text id 559
text id 560
text id 561
text id 562
text id 563
text id 564
text id 565
text id 566
text id 567
text id 568
text id 569
text id 570
text id 571
text id 572
text id 573
text id 574
text id 575
text id 576
text id 577
text id 578
text id 579
text id 580
text id 581
text id 582
text id 583
text id 584
text id 585
text id 586
text id 587
text id 588
text id 589
text id 590
text id 591
text id 592
text id 593
text id 594
text id 595
text id 596
text id 597
text id 598
text id 599
text id 600
text id 601
text id 602
text id 603
text

text id 1184
text id 1185
text id 1186
text id 1187
text id 1188
text id 1189
text id 1190
text id 1191
text id 1192
text id 1193
text id 1194
text id 1195
text id 1196
text id 1197
text id 1198
text id 1199
text id 1200
text id 1201
text id 1202
text id 1203
text id 1204
text id 1205
text id 1206
text id 1207
text id 1208
text id 1209
text id 1210
text id 1211
text id 1212
text id 1213
text id 1214
text id 1215
text id 1216
text id 1217
text id 1218
text id 1219
text id 1220
text id 1221
text id 1222
text id 1223
text id 1224
text id 1225
text id 1226
text id 1227
text id 1228
text id 1229
text id 1230
text id 1231
text id 1232
text id 1233
text id 1234
text id 1235
text id 1236
text id 1237
text id 1238
text id 1239
text id 1240
text id 1241
text id 1242
text id 1243
text id 1244
text id 1245
text id 1246
text id 1247
text id 1248
text id 1249
text id 1250
text id 1251
text id 1252
text id 1253
text id 1254
text id 1255
text id 1256
text id 1257
text id 1258
text id 1259
text id 1260

text id 1811
text id 1812
text id 1813
text id 1814
text id 1815
text id 1816
text id 1817
text id 1818
text id 1819
text id 1820
text id 1821
text id 1822
text id 1823
text id 1824
text id 1825
text id 1826
text id 1827
text id 1828
text id 1829
text id 1830
text id 1831
text id 1832
text id 1833
text id 1834
text id 1835
text id 1836
text id 1837
text id 1838
text id 1839
text id 1840
text id 1841
text id 1842
text id 1843
text id 1844
text id 1845
text id 1846
text id 1847
text id 1848
text id 1849
text id 1850
text id 1851
text id 1852
text id 1853
text id 1854
text id 1855
text id 1856
text id 1857
text id 1858
text id 1859
text id 1860
text id 1861
text id 1862
text id 1863
text id 1864
text id 1865
text id 1866
text id 1867
text id 1868
text id 1869
text id 1870
text id 1871
text id 1872
text id 1873
text id 1874
text id 1875
text id 1876
text id 1877
text id 1878
text id 1879
text id 1880
text id 1881
text id 1882
text id 1883
text id 1884
text id 1885
text id 1886
text id 1887

text id 2437
text id 2438
text id 2439
text id 2440
text id 2441
text id 2442
text id 2443
text id 2444
text id 2445
text id 2446
text id 2447
text id 2448
text id 2449
text id 2450
text id 2451
text id 2452
text id 2453
text id 2454
text id 2455
text id 2456
text id 2457
text id 2458
text id 2459
text id 2460
text id 2461
text id 2462
text id 2463
text id 2464
text id 2465
text id 2466
text id 2467
text id 2468
text id 2469
text id 2470
text id 2471
text id 2472
text id 2473
text id 2474
text id 2475
text id 2476
text id 2477
text id 2478
text id 2479
text id 2480
text id 2481
text id 2482
text id 2483
text id 2484
text id 2485
text id 2486
text id 2487
text id 2488
text id 2489
text id 2490
text id 2491
text id 2492
text id 2493
text id 2494
text id 2495
text id 2496
text id 2497
text id 2498
text id 2499
reconnecting to standford core server ... 
batch 6.0 from 3000
text id 2500
text id 2501
text id 2502
text id 2503
text id 2504
text id 2505
text id 2506
text id 2507
text id 2508
t

text id 3059
text id 3060
text id 3061
text id 3062
text id 3063
text id 3064
text id 3065
text id 3066
text id 3067
text id 3068
text id 3069
text id 3070
text id 3071
text id 3072
text id 3073
text id 3074
text id 3075
text id 3076
text id 3077
text id 3078
text id 3079
text id 3080
text id 3081
text id 3082
text id 3083
text id 3084
text id 3085
text id 3086
text id 3087
text id 3088
text id 3089
text id 3090
text id 3091
text id 3092
text id 3093
text id 3094
text id 3095
text id 3096
text id 3097
text id 3098
text id 3099
text id 3100
text id 3101
text id 3102
text id 3103
text id 3104
text id 3105
text id 3106
text id 3107
text id 3108
text id 3109
text id 3110
text id 3111
text id 3112
text id 3113
text id 3114
text id 3115
text id 3116
text id 3117
text id 3118
text id 3119
text id 3120
text id 3121
text id 3122
text id 3123
text id 3124
text id 3125
text id 3126
text id 3127
text id 3128
text id 3129
text id 3130
text id 3131
text id 3132
text id 3133
text id 3134
text id 3135



text id 3439
text id 3440
text id 3441
text id 3442
text id 3443
text id 3444
text id 3445
text id 3446
text id 3447
text id 3448
text id 3449
text id 3450
text id 3451
text id 3452
text id 3453
text id 3454
text id 3455
text id 3456
text id 3457
text id 3458
text id 3459
text id 3460
text id 3461
text id 3462
text id 3463
text id 3464
text id 3465
text id 3466
text id 3467
text id 3468
text id 3469
text id 3470
text id 3471
text id 3472
text id 3473
text id 3474
text id 3475
text id 3476
text id 3477
text id 3478
text id 3479
text id 3480
text id 3481
text id 3482
text id 3483
text id 3484
text id 3485
text id 3486
text id 3487
text id 3488
text id 3489
text id 3490
text id 3491
text id 3492
text id 3493
text id 3494
text id 3495
text id 3496
text id 3497
text id 3498
text id 3499
reconnecting to standford core server ... 
batch 8.0 from 4000
text id 3500
text id 3501
text id 3502
text id 3503
text id 3504
text id 3505
text id 3506
text id 3507
text id 3508
text id 3509
text id 3510
t

text id 4061
text id 4062
text id 4063
text id 4064
text id 4065
text id 4066
text id 4067
text id 4068
text id 4069
text id 4070
text id 4071
text id 4072
text id 4073
text id 4074
text id 4075
text id 4076
text id 4077
text id 4078
text id 4079
text id 4080
text id 4081
text id 4082
text id 4083
text id 4084
text id 4085
text id 4086
text id 4087
text id 4088
text id 4089
text id 4090
text id 4091
text id 4092
text id 4093
text id 4094
text id 4095
text id 4096
text id 4097
text id 4098
text id 4099
text id 4100
text id 4101
text id 4102
text id 4103
text id 4104
text id 4105
text id 4106
text id 4107
text id 4108
text id 4109
text id 4110
text id 4111
text id 4112
text id 4113
text id 4114
text id 4115
text id 4116
text id 4117
text id 4118
text id 4119
text id 4120
text id 4121
text id 4122
text id 4123
text id 4124
text id 4125
text id 4126
text id 4127
text id 4128
text id 4129
text id 4130
text id 4131
text id 4132
text id 4133
text id 4134
text id 4135
text id 4136
text id 4137

text id 4687
text id 4688
text id 4689
text id 4690
text id 4691
text id 4692
text id 4693
text id 4694
text id 4695
text id 4696
text id 4697
text id 4698
text id 4699
text id 4700
text id 4701
text id 4702
text id 4703
text id 4704
text id 4705
text id 4706
text id 4707
text id 4708
text id 4709
text id 4710
text id 4711
text id 4712
text id 4713
text id 4714
text id 4715
text id 4716
text id 4717
text id 4718
text id 4719
text id 4720
text id 4721
text id 4722
text id 4723
text id 4724
text id 4725
text id 4726
text id 4727
text id 4728
text id 4729
text id 4730
text id 4731
text id 4732
text id 4733
text id 4734
text id 4735
text id 4736
text id 4737
text id 4738
text id 4739
text id 4740
text id 4741
text id 4742
text id 4743
text id 4744
text id 4745
text id 4746
text id 4747
text id 4748
text id 4749
text id 4750
text id 4751
text id 4752
text id 4753
text id 4754
text id 4755
text id 4756
text id 4757
text id 4758
text id 4759
text id 4760
text id 4761
text id 4762
text id 4763

text id 5313
text id 5314
text id 5315
text id 5316
text id 5317
text id 5318
text id 5319
text id 5320
text id 5321
text id 5322
text id 5323
text id 5324
text id 5325
text id 5326
text id 5327
text id 5328
text id 5329
text id 5330
text id 5331
text id 5332
text id 5333
text id 5334
text id 5335
text id 5336
text id 5337
text id 5338
text id 5339
text id 5340
text id 5341
text id 5342
text id 5343
text id 5344
text id 5345
text id 5346
text id 5347
text id 5348
text id 5349
text id 5350
text id 5351
text id 5352
text id 5353
text id 5354
text id 5355
text id 5356
text id 5357
text id 5358
text id 5359
text id 5360
text id 5361
text id 5362
text id 5363
text id 5364
text id 5365
text id 5366
text id 5367
text id 5368
text id 5369
text id 5370
text id 5371
text id 5372
text id 5373
text id 5374
text id 5375
text id 5376
text id 5377
text id 5378
text id 5379
text id 5380
text id 5381
text id 5382
text id 5383
text id 5384
text id 5385
text id 5386
text id 5387
text id 5388
text id 5389



text id 5600
text id 5601
text id 5602
text id 5603
text id 5604
text id 5605
text id 5606
text id 5607
text id 5608
text id 5609
text id 5610
text id 5611
text id 5612
text id 5613
text id 5614
text id 5615
text id 5616
text id 5617
text id 5618
text id 5619
text id 5620
text id 5621
text id 5622
text id 5623
text id 5624
text id 5625
text id 5626
text id 5627
text id 5628
text id 5629
text id 5630
text id 5631
text id 5632
text id 5633
text id 5634
text id 5635
text id 5636
text id 5637
text id 5638
text id 5639
text id 5640
text id 5641
text id 5642
text id 5643
text id 5644
text id 5645
text id 5646
text id 5647
text id 5648
text id 5649
text id 5650
text id 5651
text id 5652
text id 5653
text id 5654
text id 5655
text id 5656
text id 5657
text id 5658
text id 5659
text id 5660
text id 5661
text id 5662
text id 5663
text id 5664
text id 5665
text id 5666
text id 5667
text id 5668
text id 5669
text id 5670
text id 5671
text id 5672
text id 5673
text id 5674
text id 5675
text id 5676

text id 6227
text id 6228
text id 6229
text id 6230
text id 6231
text id 6232
text id 6233
text id 6234
text id 6235
text id 6236
text id 6237
text id 6238
text id 6239
text id 6240
text id 6241
text id 6242
text id 6243
text id 6244
text id 6245
text id 6246
text id 6247
text id 6248
text id 6249
text id 6250
text id 6251
text id 6252
text id 6253
text id 6254
text id 6255
text id 6256
text id 6257
text id 6258
text id 6259
text id 6260
text id 6261
text id 6262
text id 6263
text id 6264
text id 6265
text id 6266
text id 6267
text id 6268
text id 6269
text id 6270
text id 6271
text id 6272
text id 6273
text id 6274
text id 6275
text id 6276
text id 6277
text id 6278
text id 6279
text id 6280
text id 6281
text id 6282
text id 6283
text id 6284
text id 6285
text id 6286
text id 6287
text id 6288
text id 6289
text id 6290
text id 6291
text id 6292
text id 6293
text id 6294
text id 6295
text id 6296
text id 6297
text id 6298
text id 6299
text id 6300
text id 6301
text id 6302
text id 6303

text id 6853
text id 6854
text id 6855
text id 6856
text id 6857
text id 6858
text id 6859
text id 6860
text id 6861
text id 6862
text id 6863
text id 6864
text id 6865
text id 6866
text id 6867
text id 6868
text id 6869
text id 6870
text id 6871
text id 6872
text id 6873
text id 6874
text id 6875
text id 6876
text id 6877
text id 6878
text id 6879
text id 6880
text id 6881
text id 6882
text id 6883
text id 6884
text id 6885
text id 6886
text id 6887
text id 6888
text id 6889
text id 6890
text id 6891
text id 6892
text id 6893
text id 6894
text id 6895
text id 6896
text id 6897
text id 6898
text id 6899
text id 6900
text id 6901
text id 6902
text id 6903
text id 6904
text id 6905
text id 6906
text id 6907
text id 6908
text id 6909
text id 6910
text id 6911
text id 6912
text id 6913
text id 6914
text id 6915
text id 6916
text id 6917
text id 6918
text id 6919
text id 6920
text id 6921
text id 6922
text id 6923
text id 6924
text id 6925
text id 6926
text id 6927
text id 6928
text id 6929

text id 7479
text id 7480
text id 7481
text id 7482
text id 7483
text id 7484
text id 7485
text id 7486
text id 7487
text id 7488
text id 7489
text id 7490
text id 7491
text id 7492
text id 7493
text id 7494
text id 7495
text id 7496
text id 7497
text id 7498
text id 7499
reconnecting to standford core server ... 
batch 16.0 from 8000
text id 7500
text id 7501
text id 7502
text id 7503
text id 7504
text id 7505
text id 7506
text id 7507
text id 7508
text id 7509
text id 7510
text id 7511
text id 7512
text id 7513
text id 7514
text id 7515
text id 7516
text id 7517
text id 7518
text id 7519
text id 7520
text id 7521
text id 7522
text id 7523
text id 7524
text id 7525
text id 7526
text id 7527
text id 7528
text id 7529
text id 7530
text id 7531
text id 7532
text id 7533
text id 7534
text id 7535
text id 7536
text id 7537
text id 7538
text id 7539
text id 7540
text id 7541
text id 7542
text id 7543
text id 7544
text id 7545
text id 7546
text id 7547
text id 7548
text id 7549
text id 7550


text id 8101
text id 8102
text id 8103
text id 8104
text id 8105
text id 8106
text id 8107
text id 8108
text id 8109
text id 8110
text id 8111
text id 8112
text id 8113
text id 8114
text id 8115
text id 8116
text id 8117
text id 8118
text id 8119
text id 8120
text id 8121
text id 8122
text id 8123
text id 8124
text id 8125
text id 8126
text id 8127
text id 8128
text id 8129
text id 8130
text id 8131
text id 8132
text id 8133
text id 8134
text id 8135
text id 8136
text id 8137
text id 8138
text id 8139
text id 8140
text id 8141
text id 8142
text id 8143
text id 8144
text id 8145
text id 8146
text id 8147
text id 8148
text id 8149
text id 8150
text id 8151
text id 8152
text id 8153
text id 8154
text id 8155
text id 8156
text id 8157
text id 8158
text id 8159
text id 8160
text id 8161
text id 8162
text id 8163
text id 8164
text id 8165
text id 8166
text id 8167
text id 8168
text id 8169
text id 8170
text id 8171
text id 8172
text id 8173
text id 8174
text id 8175
text id 8176
text id 8177

text id 8727
text id 8728
text id 8729
text id 8730
text id 8731
text id 8732
text id 8733
text id 8734
text id 8735
text id 8736
text id 8737
text id 8738
text id 8739
text id 8740
text id 8741
text id 8742
text id 8743
text id 8744
text id 8745
text id 8746
text id 8747
text id 8748
text id 8749
text id 8750
text id 8751
text id 8752
text id 8753
text id 8754
text id 8755
text id 8756
text id 8757
text id 8758
text id 8759
text id 8760
text id 8761
text id 8762
text id 8763
text id 8764
text id 8765
text id 8766
text id 8767
text id 8768
text id 8769
text id 8770
text id 8771
text id 8772
text id 8773
text id 8774
text id 8775
text id 8776
text id 8777
text id 8778
text id 8779
text id 8780
text id 8781
text id 8782
text id 8783
text id 8784
text id 8785
text id 8786
text id 8787
text id 8788
text id 8789
text id 8790
text id 8791
text id 8792
text id 8793
text id 8794
text id 8795
text id 8796
text id 8797
text id 8798
text id 8799
text id 8800
text id 8801
text id 8802
text id 8803

text id 9353
text id 9354
text id 9355
text id 9356
text id 9357
text id 9358
text id 9359
text id 9360
text id 9361
text id 9362
text id 9363
text id 9364
text id 9365
text id 9366
text id 9367
text id 9368
text id 9369
text id 9370
text id 9371
text id 9372
text id 9373
text id 9374
text id 9375
text id 9376
text id 9377
text id 9378
text id 9379
text id 9380
text id 9381
text id 9382
text id 9383
text id 9384
text id 9385
text id 9386
text id 9387
text id 9388
text id 9389
text id 9390
text id 9391
text id 9392
text id 9393
text id 9394
text id 9395
text id 9396
text id 9397
text id 9398
text id 9399
text id 9400
text id 9401
text id 9402
text id 9403
text id 9404
text id 9405
text id 9406
text id 9407
text id 9408
text id 9409
text id 9410
text id 9411
text id 9412
text id 9413
text id 9414
text id 9415
text id 9416
text id 9417
text id 9418
text id 9419
text id 9420
text id 9421
text id 9422
text id 9423
text id 9424
text id 9425
text id 9426
text id 9427
text id 9428
text id 9429

text id 9979
text id 9980
text id 9981
text id 9982
text id 9983
text id 9984
text id 9985
text id 9986
text id 9987
text id 9988
text id 9989
text id 9990
text id 9991
text id 9992
text id 9993
text id 9994
text id 9995
text id 9996
text id 9997
text id 9998
text id 9999
reconnecting to standford core server ... 
batch 21.0 from 10500
text id 10000
text id 10001
text id 10002
text id 10003
text id 10004
text id 10005
text id 10006
text id 10007
text id 10008
text id 10009
text id 10010
text id 10011
text id 10012
text id 10013
text id 10014
text id 10015
text id 10016
text id 10017
text id 10018
text id 10019
text id 10020
text id 10021
text id 10022
text id 10023
text id 10024
text id 10025
text id 10026
text id 10027
text id 10028
text id 10029
text id 10030
text id 10031
text id 10032
text id 10033
text id 10034
text id 10035
text id 10036
text id 10037
text id 10038
text id 10039
text id 10040
text id 10041
text id 10042
text id 10043
text id 10044
text id 10045
text id 10046
text


" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


text id 10201
text id 10202
text id 10203
text id 10204
text id 10205
text id 10206
text id 10207
text id 10208
text id 10209
text id 10210
text id 10211
text id 10212
text id 10213
text id 10214
text id 10215
text id 10216
text id 10217
text id 10218
text id 10219
text id 10220
text id 10221
text id 10222
text id 10223
text id 10224
text id 10225
text id 10226
text id 10227
text id 10228
text id 10229
text id 10230
text id 10231
text id 10232
text id 10233
text id 10234
text id 10235
text id 10236
text id 10237
text id 10238
text id 10239
text id 10240
text id 10241
text id 10242
text id 10243
text id 10244
text id 10245
text id 10246
text id 10247
text id 10248
text id 10249
text id 10250
text id 10251
text id 10252
text id 10253
text id 10254
text id 10255
text id 10256
text id 10257
text id 10258
text id 10259
text id 10260
text id 10261
text id 10262
text id 10263
text id 10264
text id 10265
text id 10266
text id 10267
text id 10268
text id 10269
text id 10270
text id 10271
text i

text id 10783
text id 10784
text id 10785
text id 10786
text id 10787
text id 10788
text id 10789
text id 10790
text id 10791
text id 10792
text id 10793
text id 10794
text id 10795
text id 10796
text id 10797
text id 10798
text id 10799
text id 10800
text id 10801
text id 10802
text id 10803
text id 10804
text id 10805
text id 10806
text id 10807
text id 10808
text id 10809
text id 10810
text id 10811
text id 10812
text id 10813
text id 10814
text id 10815
text id 10816
text id 10817
text id 10818
text id 10819
text id 10820
text id 10821
text id 10822
text id 10823
text id 10824
text id 10825
text id 10826
text id 10827
text id 10828
text id 10829
text id 10830
text id 10831
text id 10832
text id 10833
text id 10834
text id 10835
text id 10836
text id 10837
text id 10838
text id 10839
text id 10840
text id 10841
text id 10842
text id 10843
text id 10844
text id 10845
text id 10846
text id 10847
text id 10848
text id 10849
text id 10850
text id 10851
text id 10852
text id 10853
text i

text id 11364
text id 11365
text id 11366
text id 11367
text id 11368
text id 11369
text id 11370
text id 11371
text id 11372
text id 11373
text id 11374
text id 11375
text id 11376
text id 11377
text id 11378
text id 11379
text id 11380
text id 11381
text id 11382
text id 11383
text id 11384
text id 11385


https://www.youtube.com/watch?v=czcb5dXjlZA

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


text id 11386
text id 11387
text id 11388
text id 11389
text id 11390
text id 11391
text id 11392
text id 11393
text id 11394
text id 11395
text id 11396
text id 11397
text id 11398
text id 11399
text id 11400
text id 11401
text id 11402
text id 11403
text id 11404
text id 11405
text id 11406
text id 11407
text id 11408
text id 11409
text id 11410
text id 11411
text id 11412
text id 11413
text id 11414
text id 11415
text id 11416
text id 11417
text id 11418
text id 11419
text id 11420
text id 11421
text id 11422
text id 11423
text id 11424
text id 11425
text id 11426
text id 11427
text id 11428
text id 11429
text id 11430
text id 11431
text id 11432
text id 11433
text id 11434
text id 11435
text id 11436
text id 11437
text id 11438
text id 11439
text id 11440
text id 11441
text id 11442
text id 11443
text id 11444
text id 11445
text id 11446
text id 11447
text id 11448
text id 11449
text id 11450
text id 11451
text id 11452
text id 11453
text id 11454
text id 11455
text id 11456
text i

text id 11967
text id 11968
text id 11969
text id 11970
text id 11971
text id 11972
text id 11973
text id 11974
text id 11975
text id 11976
text id 11977
text id 11978
text id 11979
text id 11980
text id 11981
text id 11982
text id 11983
text id 11984
text id 11985
text id 11986
text id 11987
text id 11988
text id 11989
text id 11990
text id 11991
text id 11992
text id 11993
text id 11994
text id 11995
text id 11996
text id 11997
text id 11998
text id 11999
reconnecting to standford core server ... 
batch 25.0 from 12500
text id 12000
text id 12001
text id 12002
text id 12003
text id 12004
text id 12005
text id 12006
text id 12007
text id 12008
text id 12009
text id 12010
text id 12011
text id 12012
text id 12013
text id 12014
text id 12015
text id 12016
text id 12017
text id 12018
text id 12019
text id 12020
text id 12021
text id 12022
text id 12023
text id 12024
text id 12025
text id 12026
text id 12027
text id 12028
text id 12029
text id 12030
text id 12031
text id 12032
text id 120

text id 12543
text id 12544
text id 12545
text id 12546
text id 12547
text id 12548
text id 12549
text id 12550
text id 12551
text id 12552
text id 12553
text id 12554
text id 12555
text id 12556
text id 12557
text id 12558
text id 12559
text id 12560
text id 12561
text id 12562
text id 12563
text id 12564
text id 12565
text id 12566
text id 12567
text id 12568
text id 12569
text id 12570
text id 12571
text id 12572
text id 12573
text id 12574
text id 12575
text id 12576
text id 12577
text id 12578
text id 12579
text id 12580
text id 12581
text id 12582
text id 12583
text id 12584
text id 12585
text id 12586
text id 12587
text id 12588
text id 12589
text id 12590
text id 12591
text id 12592
text id 12593
text id 12594
text id 12595
text id 12596
text id 12597
text id 12598
text id 12599
text id 12600
text id 12601
text id 12602
text id 12603
text id 12604
text id 12605
text id 12606
text id 12607
text id 12608
text id 12609
text id 12610
text id 12611
text id 12612
text id 12613
text i



text id 12751
text id 12752
text id 12753
text id 12754
text id 12755
text id 12756
text id 12757
text id 12758
text id 12759
text id 12760
text id 12761
text id 12762
text id 12763
text id 12764
text id 12765
text id 12766
text id 12767
text id 12768
text id 12769
text id 12770
text id 12771
text id 12772
text id 12773
text id 12774
text id 12775
text id 12776
text id 12777
text id 12778
text id 12779
text id 12780
text id 12781
text id 12782
text id 12783
text id 12784
text id 12785
text id 12786
text id 12787
text id 12788
text id 12789
text id 12790
text id 12791
text id 12792
text id 12793
text id 12794
text id 12795
text id 12796
text id 12797
text id 12798
text id 12799
text id 12800
text id 12801
text id 12802
text id 12803
text id 12804
text id 12805
text id 12806
text id 12807
text id 12808
text id 12809
text id 12810
text id 12811
text id 12812
text id 12813
text id 12814
text id 12815
text id 12816
text id 12817
text id 12818
text id 12819
text id 12820
text id 12821
text i

text id 13332
text id 13333
text id 13334
text id 13335
text id 13336
text id 13337
text id 13338
text id 13339
text id 13340
text id 13341
text id 13342
text id 13343
text id 13344
text id 13345
text id 13346
text id 13347
text id 13348
text id 13349
text id 13350
text id 13351
text id 13352
text id 13353
text id 13354
text id 13355
text id 13356
text id 13357
text id 13358
text id 13359
text id 13360
text id 13361
text id 13362
text id 13363
text id 13364
text id 13365
text id 13366
text id 13367
text id 13368
text id 13369
text id 13370
text id 13371
text id 13372
text id 13373
text id 13374
text id 13375
text id 13376
text id 13377
text id 13378
text id 13379
text id 13380
text id 13381
text id 13382
text id 13383
text id 13384
text id 13385
text id 13386
text id 13387
text id 13388
text id 13389
text id 13390
text id 13391
text id 13392
text id 13393
text id 13394
text id 13395
text id 13396
text id 13397
text id 13398
text id 13399
text id 13400
text id 13401
text id 13402
text i

text id 13913
text id 13914
text id 13915
text id 13916
text id 13917
text id 13918
text id 13919
text id 13920
text id 13921
text id 13922
text id 13923
text id 13924
text id 13925
text id 13926
text id 13927
text id 13928
text id 13929
text id 13930
text id 13931
text id 13932
text id 13933
text id 13934
text id 13935
text id 13936
text id 13937
text id 13938
text id 13939
text id 13940
text id 13941
text id 13942
text id 13943
text id 13944
text id 13945
text id 13946
text id 13947
text id 13948
text id 13949
text id 13950
text id 13951
text id 13952
text id 13953
text id 13954
text id 13955
text id 13956
text id 13957
text id 13958
text id 13959
text id 13960
text id 13961
text id 13962
text id 13963
text id 13964
text id 13965
text id 13966
text id 13967
text id 13968
text id 13969
text id 13970
text id 13971
text id 13972
text id 13973
text id 13974
text id 13975
text id 13976
text id 13977
text id 13978
text id 13979
text id 13980
text id 13981
text id 13982
text id 13983
text i

text id 14495
text id 14496
text id 14497
text id 14498
text id 14499
reconnecting to standford core server ... 
batch 30.0 from 15000
text id 14500
text id 14501
text id 14502
text id 14503
text id 14504
text id 14505
text id 14506
text id 14507
text id 14508
text id 14509
text id 14510
text id 14511
text id 14512
text id 14513
text id 14514
text id 14515
text id 14516
text id 14517
text id 14518
text id 14519
text id 14520
text id 14521
text id 14522
text id 14523
text id 14524
text id 14525
text id 14526
text id 14527
text id 14528
text id 14529
text id 14530
text id 14531
text id 14532
text id 14533
text id 14534
text id 14535
text id 14536
text id 14537
text id 14538
text id 14539
text id 14540
text id 14541
text id 14542
text id 14543
text id 14544
text id 14545
text id 14546
text id 14547
text id 14548
text id 14549
text id 14550
text id 14551
text id 14552
text id 14553
text id 14554
text id 14555
text id 14556
text id 14557
text id 14558
text id 14559
text id 14560
text id 145

## Shallow extraction

In [45]:
%%time
result_dict_shallow = process_text_split_sentence_sf(df, False, True)

Number of text 15300
batch 1.0 from 500
text id 0
text id 1
text id 2
text id 3
text id 4
text id 5
text id 6
text id 7
text id 8
text id 9
text id 10
text id 11
text id 12
text id 13
text id 14
text id 15
text id 16
text id 17
text id 18
text id 19
text id 20
text id 21
text id 22
text id 23
text id 24
text id 25
text id 26
text id 27
text id 28
text id 29
text id 30
text id 31
text id 32
text id 33
text id 34
text id 35
text id 36
text id 37
text id 38
text id 39
text id 40
text id 41
text id 42
text id 43
text id 44
text id 45
text id 46
text id 47
text id 48
text id 49
text id 50
text id 51
text id 52
text id 53
text id 54
text id 55
text id 56
text id 57
text id 58
text id 59
text id 60
text id 61
text id 62
text id 63
text id 64
text id 65
text id 66
text id 67
text id 68
text id 69
text id 70
text id 71
text id 72
text id 73
text id 74
text id 75
text id 76
text id 77
text id 78
text id 79
text id 80
text id 81
text id 82
text id 83
text id 84
text id 85
text id 86
text id 87
te

text id 684
text id 685
text id 686
text id 687
text id 688
text id 689
text id 690
text id 691
text id 692
text id 693
text id 694
text id 695
text id 696
text id 697
text id 698
text id 699
text id 700
text id 701
text id 702
text id 703
text id 704
text id 705
text id 706
text id 707
text id 708
text id 709
text id 710
text id 711
text id 712
text id 713
text id 714
text id 715
text id 716
text id 717
text id 718
text id 719
text id 720
text id 721
text id 722
text id 723
text id 724
text id 725
text id 726
text id 727
text id 728
text id 729
text id 730
text id 731
text id 732
text id 733
text id 734
text id 735
text id 736
text id 737
text id 738
text id 739
text id 740
text id 741
text id 742
text id 743
text id 744
text id 745
text id 746
text id 747
text id 748
text id 749
text id 750
text id 751
text id 752
text id 753
text id 754
text id 755
text id 756
text id 757
text id 758
text id 759
text id 760
text id 761
text id 762
text id 763
text id 764
text id 765
text id 766
text

text id 1334
text id 1335
text id 1336
text id 1337
text id 1338
text id 1339
text id 1340
text id 1341
text id 1342
text id 1343
text id 1344
text id 1345
text id 1346
text id 1347
text id 1348
text id 1349
text id 1350
text id 1351
text id 1352
text id 1353
text id 1354
text id 1355
text id 1356
text id 1357
text id 1358
text id 1359
text id 1360
text id 1361
text id 1362
text id 1363
text id 1364
text id 1365
text id 1366
text id 1367
text id 1368
text id 1369
text id 1370
text id 1371
text id 1372
text id 1373
text id 1374
text id 1375
text id 1376
text id 1377
text id 1378
text id 1379
text id 1380
text id 1381
text id 1382
text id 1383
text id 1384
text id 1385
text id 1386
text id 1387
text id 1388
text id 1389
text id 1390
text id 1391
text id 1392
text id 1393
text id 1394
text id 1395
text id 1396
text id 1397
text id 1398
text id 1399
text id 1400
text id 1401
text id 1402
text id 1403
text id 1404
text id 1405
text id 1406
text id 1407
text id 1408
text id 1409
text id 1410

text id 1962
text id 1963
text id 1964
text id 1965
text id 1966
text id 1967
text id 1968
text id 1969
text id 1970
text id 1971
text id 1972
text id 1973
text id 1974
text id 1975
text id 1976
text id 1977
text id 1978
text id 1979
text id 1980
text id 1981
text id 1982
text id 1983
text id 1984
text id 1985
text id 1986
text id 1987
text id 1988
text id 1989
text id 1990
text id 1991
text id 1992
text id 1993
text id 1994
text id 1995
text id 1996
text id 1997
text id 1998
text id 1999
reconnecting to standford core server ... 
batch 5.0 from 2500
text id 2000
text id 2001
text id 2002
text id 2003
text id 2004
text id 2005
text id 2006
text id 2007
text id 2008
text id 2009
text id 2010
text id 2011
text id 2012
text id 2013
text id 2014
text id 2015
text id 2016
text id 2017
text id 2018
text id 2019
text id 2020
text id 2021
text id 2022
text id 2023
text id 2024
text id 2025
text id 2026
text id 2027
text id 2028
text id 2029
text id 2030
text id 2031
text id 2032
text id 2033
t

text id 2585
text id 2586
text id 2587
text id 2588
text id 2589
text id 2590
text id 2591
text id 2592
text id 2593
text id 2594
text id 2595
text id 2596
text id 2597
text id 2598
text id 2599
text id 2600
text id 2601
text id 2602
text id 2603
text id 2604
text id 2605
text id 2606
text id 2607
text id 2608
text id 2609
text id 2610
text id 2611
text id 2612
text id 2613
text id 2614
text id 2615
text id 2616
text id 2617
text id 2618
text id 2619
text id 2620
text id 2621
text id 2622
text id 2623
text id 2624
text id 2625
text id 2626
text id 2627
text id 2628
text id 2629
text id 2630
text id 2631
text id 2632
text id 2633
text id 2634
text id 2635
text id 2636
text id 2637
text id 2638
text id 2639
text id 2640
text id 2641
text id 2642
text id 2643
text id 2644
text id 2645
text id 2646
text id 2647
text id 2648
text id 2649
text id 2650
text id 2651
text id 2652
text id 2653
text id 2654
text id 2655
text id 2656
text id 2657
text id 2658
text id 2659
text id 2660
text id 2661

text id 3211
text id 3212
text id 3213
text id 3214
text id 3215
text id 3216
text id 3217
text id 3218
text id 3219
text id 3220
text id 3221
text id 3222
text id 3223
text id 3224
text id 3225
text id 3226
text id 3227
text id 3228
text id 3229
text id 3230
text id 3231
text id 3232
text id 3233
text id 3234
text id 3235
text id 3236
text id 3237
text id 3238
text id 3239
text id 3240
text id 3241
text id 3242
text id 3243
text id 3244
text id 3245
text id 3246
text id 3247
text id 3248
text id 3249
text id 3250
text id 3251
text id 3252
text id 3253
text id 3254
text id 3255
text id 3256
text id 3257
text id 3258
text id 3259
text id 3260
text id 3261
text id 3262
text id 3263
text id 3264
text id 3265
text id 3266
text id 3267
text id 3268
text id 3269
text id 3270
text id 3271
text id 3272
text id 3273
text id 3274
text id 3275
text id 3276
text id 3277
text id 3278
text id 3279
text id 3280
text id 3281
text id 3282
text id 3283
text id 3284
text id 3285
text id 3286
text id 3287

text id 3839
text id 3840
text id 3841
text id 3842
text id 3843
text id 3844
text id 3845
text id 3846
text id 3847
text id 3848
text id 3849
text id 3850
text id 3851
text id 3852
text id 3853
text id 3854
text id 3855
text id 3856
text id 3857
text id 3858
text id 3859
text id 3860
text id 3861
text id 3862
text id 3863
text id 3864
text id 3865
text id 3866
text id 3867
text id 3868
text id 3869
text id 3870
text id 3871
text id 3872
text id 3873
text id 3874
text id 3875
text id 3876
text id 3877
text id 3878
text id 3879
text id 3880
text id 3881
text id 3882
text id 3883
text id 3884
text id 3885
text id 3886
text id 3887
text id 3888
text id 3889
text id 3890
text id 3891
text id 3892
text id 3893
text id 3894
text id 3895
text id 3896
text id 3897
text id 3898
text id 3899
text id 3900
text id 3901
text id 3902
text id 3903
text id 3904
text id 3905
text id 3906
text id 3907
text id 3908
text id 3909
text id 3910
text id 3911
text id 3912
text id 3913
text id 3914
text id 3915

text id 4465
text id 4466
text id 4467
text id 4468
text id 4469
text id 4470
text id 4471
text id 4472
text id 4473
text id 4474
text id 4475
text id 4476
text id 4477
text id 4478
text id 4479
text id 4480
text id 4481
text id 4482
text id 4483
text id 4484
text id 4485
text id 4486
text id 4487
text id 4488
text id 4489
text id 4490
text id 4491
text id 4492
text id 4493
text id 4494
text id 4495
text id 4496
text id 4497
text id 4498
text id 4499
reconnecting to standford core server ... 
batch 10.0 from 5000
text id 4500
text id 4501
text id 4502
text id 4503
text id 4504
text id 4505
text id 4506
text id 4507
text id 4508
text id 4509
text id 4510
text id 4511
text id 4512
text id 4513
text id 4514
text id 4515
text id 4516
text id 4517
text id 4518
text id 4519
text id 4520
text id 4521
text id 4522
text id 4523
text id 4524
text id 4525
text id 4526
text id 4527
text id 4528
text id 4529
text id 4530
text id 4531
text id 4532
text id 4533
text id 4534
text id 4535
text id 4536


text id 5086
text id 5087
text id 5088
text id 5089
text id 5090
text id 5091
text id 5092
text id 5093
text id 5094
text id 5095
text id 5096
text id 5097
text id 5098
text id 5099
text id 5100
text id 5101
text id 5102
text id 5103
text id 5104
text id 5105
text id 5106
text id 5107
text id 5108
text id 5109
text id 5110
text id 5111
text id 5112
text id 5113
text id 5114
text id 5115
text id 5116
text id 5117
text id 5118
text id 5119
text id 5120
text id 5121
text id 5122
text id 5123
text id 5124
text id 5125
text id 5126
text id 5127
text id 5128
text id 5129
text id 5130
text id 5131
text id 5132
text id 5133
text id 5134
text id 5135
text id 5136
text id 5137
text id 5138
text id 5139
text id 5140
text id 5141
text id 5142
text id 5143
text id 5144
text id 5145
text id 5146
text id 5147
text id 5148
text id 5149
text id 5150
text id 5151
text id 5152
text id 5153
text id 5154
text id 5155
text id 5156
text id 5157
text id 5158
text id 5159
text id 5160
text id 5161
text id 5162

text id 5712
text id 5713
text id 5714
text id 5715
text id 5716
text id 5717
text id 5718
text id 5719
text id 5720
text id 5721
text id 5722
text id 5723
text id 5724
text id 5725
text id 5726
text id 5727
text id 5728
text id 5729
text id 5730
text id 5731
text id 5732
text id 5733
text id 5734
text id 5735
text id 5736
text id 5737
text id 5738
text id 5739
text id 5740
text id 5741
text id 5742
text id 5743
text id 5744
text id 5745
text id 5746
text id 5747
text id 5748
text id 5749
text id 5750
text id 5751
text id 5752
text id 5753
text id 5754
text id 5755
text id 5756
text id 5757
text id 5758
text id 5759
text id 5760
text id 5761
text id 5762
text id 5763
text id 5764
text id 5765
text id 5766
text id 5767
text id 5768
text id 5769
text id 5770
text id 5771
text id 5772
text id 5773
text id 5774
text id 5775
text id 5776
text id 5777
text id 5778
text id 5779
text id 5780
text id 5781
text id 5782
text id 5783
text id 5784
text id 5785
text id 5786
text id 5787
text id 5788

text id 6341
text id 6342
text id 6343
text id 6344
text id 6345
text id 6346
text id 6347
text id 6348
text id 6349
text id 6350
text id 6351
text id 6352
text id 6353
text id 6354
text id 6355
text id 6356
text id 6357
text id 6358
text id 6359
text id 6360
text id 6361
text id 6362
text id 6363
text id 6364
text id 6365
text id 6366
text id 6367
text id 6368
text id 6369
text id 6370
text id 6371
text id 6372
text id 6373
text id 6374
text id 6375
text id 6376
text id 6377
text id 6378
text id 6379
text id 6380
text id 6381
text id 6382
text id 6383
text id 6384
text id 6385
text id 6386
text id 6387
text id 6388
text id 6389
text id 6390
text id 6391
text id 6392
text id 6393
text id 6394
text id 6395
text id 6396
text id 6397
text id 6398
text id 6399
text id 6400
text id 6401
text id 6402
text id 6403
text id 6404
text id 6405
text id 6406
text id 6407
text id 6408
text id 6409
text id 6410
text id 6411
text id 6412
text id 6413
text id 6414
text id 6415
text id 6416
text id 6417

text id 6969
text id 6970
text id 6971
text id 6972
text id 6973
text id 6974
text id 6975
text id 6976
text id 6977
text id 6978
text id 6979
text id 6980
text id 6981
text id 6982
text id 6983
text id 6984
text id 6985
text id 6986
text id 6987
text id 6988
text id 6989
text id 6990
text id 6991
text id 6992
text id 6993
text id 6994
text id 6995
text id 6996
text id 6997
text id 6998
text id 6999
reconnecting to standford core server ... 
batch 15.0 from 7500
text id 7000
text id 7001
text id 7002
text id 7003
text id 7004
text id 7005
text id 7006
text id 7007
text id 7008
text id 7009
text id 7010
text id 7011
text id 7012
text id 7013
text id 7014
text id 7015
text id 7016
text id 7017
text id 7018
text id 7019
text id 7020
text id 7021
text id 7022
text id 7023
text id 7024
text id 7025
text id 7026
text id 7027
text id 7028
text id 7029
text id 7030
text id 7031
text id 7032
text id 7033
text id 7034
text id 7035
text id 7036
text id 7037
text id 7038
text id 7039
text id 7040


text id 7590
text id 7591
text id 7592
text id 7593
text id 7594
text id 7595
text id 7596
text id 7597
text id 7598
text id 7599
text id 7600
text id 7601
text id 7602
text id 7603
text id 7604
text id 7605
text id 7606
text id 7607
text id 7608
text id 7609
text id 7610
text id 7611
text id 7612
text id 7613
text id 7614
text id 7615
text id 7616
text id 7617
text id 7618
text id 7619
text id 7620
text id 7621
text id 7622
text id 7623
text id 7624
text id 7625
text id 7626
text id 7627
text id 7628
text id 7629
text id 7630
text id 7631
text id 7632
text id 7633
text id 7634
text id 7635
text id 7636
text id 7637
text id 7638
text id 7639
text id 7640
text id 7641
text id 7642
text id 7643
text id 7644
text id 7645
text id 7646
text id 7647
text id 7648
text id 7649
text id 7650
text id 7651
text id 7652
text id 7653
text id 7654
text id 7655
text id 7656
text id 7657
text id 7658
text id 7659
text id 7660
text id 7661
text id 7662
text id 7663
text id 7664
text id 7665
text id 7666

text id 8216
text id 8217
text id 8218
text id 8219
text id 8220
text id 8221
text id 8222
text id 8223
text id 8224
text id 8225
text id 8226
text id 8227
text id 8228
text id 8229
text id 8230
text id 8231
text id 8232
text id 8233
text id 8234
text id 8235
text id 8236
text id 8237
text id 8238
text id 8239
text id 8240
text id 8241
text id 8242
text id 8243
text id 8244
text id 8245
text id 8246
text id 8247
text id 8248
text id 8249
text id 8250
text id 8251
text id 8252
text id 8253
text id 8254
text id 8255
text id 8256
text id 8257
text id 8258
text id 8259
text id 8260
text id 8261
text id 8262
text id 8263
text id 8264
text id 8265
text id 8266
text id 8267
text id 8268
text id 8269
text id 8270
text id 8271
text id 8272
text id 8273
text id 8274
text id 8275
text id 8276
text id 8277
text id 8278
text id 8279
text id 8280
text id 8281
text id 8282
text id 8283
text id 8284
text id 8285
text id 8286
text id 8287
text id 8288
text id 8289
text id 8290
text id 8291
text id 8292

text id 8843
text id 8844
text id 8845
text id 8846
text id 8847
text id 8848
text id 8849
text id 8850
text id 8851
text id 8852
text id 8853
text id 8854
text id 8855
text id 8856
text id 8857
text id 8858
text id 8859
text id 8860
text id 8861
text id 8862
text id 8863
text id 8864
text id 8865
text id 8866
text id 8867
text id 8868
text id 8869
text id 8870
text id 8871
text id 8872
text id 8873
text id 8874
text id 8875
text id 8876
text id 8877
text id 8878
text id 8879
text id 8880
text id 8881
text id 8882
text id 8883
text id 8884
text id 8885
text id 8886
text id 8887
text id 8888
text id 8889
text id 8890
text id 8891
text id 8892
text id 8893
text id 8894
text id 8895
text id 8896
text id 8897
text id 8898
text id 8899
text id 8900
text id 8901
text id 8902
text id 8903
text id 8904
text id 8905
text id 8906
text id 8907
text id 8908
text id 8909
text id 8910
text id 8911
text id 8912
text id 8913
text id 8914
text id 8915
text id 8916
text id 8917
text id 8918
text id 8919

text id 9469
text id 9470
text id 9471
text id 9472
text id 9473
text id 9474
text id 9475
text id 9476
text id 9477
text id 9478
text id 9479
text id 9480
text id 9481
text id 9482
text id 9483
text id 9484
text id 9485
text id 9486
text id 9487
text id 9488
text id 9489
text id 9490
text id 9491
text id 9492
text id 9493
text id 9494
text id 9495
text id 9496
text id 9497
text id 9498
text id 9499
reconnecting to standford core server ... 
batch 20.0 from 10000
text id 9500
text id 9501
text id 9502
text id 9503
text id 9504
text id 9505
text id 9506
text id 9507
text id 9508
text id 9509
text id 9510
text id 9511
text id 9512
text id 9513
text id 9514
text id 9515
text id 9516
text id 9517
text id 9518
text id 9519
text id 9520
text id 9521
text id 9522
text id 9523
text id 9524
text id 9525
text id 9526
text id 9527
text id 9528
text id 9529
text id 9530
text id 9531
text id 9532
text id 9533
text id 9534
text id 9535
text id 9536
text id 9537
text id 9538
text id 9539
text id 9540

text id 10083
text id 10084
text id 10085
text id 10086
text id 10087
text id 10088
text id 10089
text id 10090
text id 10091
text id 10092
text id 10093
text id 10094
text id 10095
text id 10096
text id 10097
text id 10098
text id 10099
text id 10100
text id 10101
text id 10102
text id 10103
text id 10104
text id 10105
text id 10106
text id 10107
text id 10108
text id 10109
text id 10110
text id 10111
text id 10112
text id 10113
text id 10114
text id 10115
text id 10116
text id 10117
text id 10118
text id 10119
text id 10120
text id 10121
text id 10122
text id 10123
text id 10124
text id 10125
text id 10126
text id 10127
text id 10128
text id 10129
text id 10130
text id 10131
text id 10132
text id 10133
text id 10134
text id 10135
text id 10136
text id 10137
text id 10138
text id 10139
text id 10140
text id 10141
text id 10142
text id 10143
text id 10144
text id 10145
text id 10146
text id 10147
text id 10148
text id 10149
text id 10150
text id 10151
text id 10152
text id 10153
text i

text id 10665
text id 10666
text id 10667
text id 10668
text id 10669
text id 10670
text id 10671
text id 10672
text id 10673
text id 10674
text id 10675
text id 10676
text id 10677
text id 10678
text id 10679
text id 10680
text id 10681
text id 10682
text id 10683
text id 10684
text id 10685
text id 10686
text id 10687
text id 10688
text id 10689
text id 10690
text id 10691
text id 10692
text id 10693
text id 10694
text id 10695
text id 10696
text id 10697
text id 10698
text id 10699
text id 10700
text id 10701
text id 10702
text id 10703
text id 10704
text id 10705
text id 10706
text id 10707
text id 10708
text id 10709
text id 10710
text id 10711
text id 10712
text id 10713
text id 10714
text id 10715
text id 10716
text id 10717
text id 10718
text id 10719
text id 10720
text id 10721
text id 10722
text id 10723
text id 10724
text id 10725
text id 10726
text id 10727
text id 10728
text id 10729
text id 10730
text id 10731
text id 10732
text id 10733
text id 10734
text id 10735
text i

text id 11247
text id 11248
text id 11249
text id 11250
text id 11251
text id 11252
text id 11253
text id 11254
text id 11255
text id 11256
text id 11257
text id 11258
text id 11259
text id 11260
text id 11261
text id 11262
text id 11263
text id 11264
text id 11265
text id 11266
text id 11267
text id 11268
text id 11269
text id 11270
text id 11271
text id 11272
text id 11273
text id 11274
text id 11275
text id 11276
text id 11277
text id 11278
text id 11279
text id 11280
text id 11281
text id 11282
text id 11283
text id 11284
text id 11285
text id 11286
text id 11287
text id 11288
text id 11289
text id 11290
text id 11291
text id 11292
text id 11293
text id 11294
text id 11295
text id 11296
text id 11297
text id 11298
text id 11299
text id 11300
text id 11301
text id 11302
text id 11303
text id 11304
text id 11305
text id 11306
text id 11307
text id 11308
text id 11309
text id 11310
text id 11311
text id 11312
text id 11313
text id 11314
text id 11315
text id 11316
text id 11317
text i

text id 11828
text id 11829
text id 11830
text id 11831
text id 11832
text id 11833
text id 11834
text id 11835
text id 11836
text id 11837
text id 11838
text id 11839
text id 11840
text id 11841
text id 11842
text id 11843
text id 11844
text id 11845
text id 11846
text id 11847
text id 11848
text id 11849
text id 11850
text id 11851
text id 11852
text id 11853
text id 11854
text id 11855
text id 11856
text id 11857
text id 11858
text id 11859
text id 11860
text id 11861
text id 11862
text id 11863
text id 11864
text id 11865
text id 11866
text id 11867
text id 11868
text id 11869
text id 11870
text id 11871
text id 11872
text id 11873
text id 11874
text id 11875
text id 11876
text id 11877
text id 11878
text id 11879
text id 11880
text id 11881
text id 11882
text id 11883
text id 11884
text id 11885
text id 11886
text id 11887
text id 11888
text id 11889
text id 11890
text id 11891
text id 11892
text id 11893
text id 11894
text id 11895
text id 11896
text id 11897
text id 11898
text i

text id 12409
text id 12410
text id 12411
text id 12412
text id 12413
text id 12414
text id 12415
text id 12416
text id 12417
text id 12418
text id 12419
text id 12420
text id 12421
text id 12422
text id 12423
text id 12424
text id 12425
text id 12426
text id 12427
text id 12428
text id 12429
text id 12430
text id 12431
text id 12432
text id 12433
text id 12434
text id 12435
text id 12436
text id 12437
text id 12438
text id 12439
text id 12440
text id 12441
text id 12442
text id 12443
text id 12444
text id 12445
text id 12446
text id 12447
text id 12448
text id 12449
text id 12450
text id 12451
text id 12452
text id 12453
text id 12454
text id 12455
text id 12456
text id 12457
text id 12458
text id 12459
text id 12460
text id 12461
text id 12462
text id 12463
text id 12464
text id 12465
text id 12466
text id 12467
text id 12468
text id 12469
text id 12470
text id 12471
text id 12472
text id 12473
text id 12474
text id 12475
text id 12476
text id 12477
text id 12478
text id 12479
text i

text id 12991
text id 12992
text id 12993
text id 12994
text id 12995
text id 12996
text id 12997
text id 12998
text id 12999
reconnecting to standford core server ... 
batch 27.0 from 13500
text id 13000
text id 13001
text id 13002
text id 13003
text id 13004
text id 13005
text id 13006
text id 13007
text id 13008
text id 13009
text id 13010
text id 13011
text id 13012
text id 13013
text id 13014
text id 13015
text id 13016
text id 13017
text id 13018
text id 13019
text id 13020
text id 13021
text id 13022
text id 13023
text id 13024
text id 13025
text id 13026
text id 13027
text id 13028
text id 13029
text id 13030
text id 13031
text id 13032
text id 13033
text id 13034
text id 13035
text id 13036
text id 13037
text id 13038
text id 13039
text id 13040
text id 13041
text id 13042
text id 13043
text id 13044
text id 13045
text id 13046
text id 13047
text id 13048
text id 13049
text id 13050
text id 13051
text id 13052
text id 13053
text id 13054
text id 13055
text id 13056
text id 130

text id 13567
text id 13568
text id 13569
text id 13570
text id 13571
text id 13572
text id 13573
text id 13574
text id 13575
text id 13576
text id 13577
text id 13578
text id 13579
text id 13580
text id 13581
text id 13582
text id 13583
text id 13584
text id 13585
text id 13586
text id 13587
text id 13588
text id 13589
text id 13590
text id 13591
text id 13592
text id 13593
text id 13594
text id 13595
text id 13596
text id 13597
text id 13598
text id 13599
text id 13600
text id 13601
text id 13602
text id 13603
text id 13604
text id 13605
text id 13606
text id 13607
text id 13608
text id 13609
text id 13610
text id 13611
text id 13612
text id 13613
text id 13614
text id 13615
text id 13616
text id 13617
text id 13618
text id 13619
text id 13620
text id 13621
text id 13622
text id 13623
text id 13624
text id 13625
text id 13626
text id 13627
text id 13628
text id 13629
text id 13630
text id 13631
text id 13632
text id 13633
text id 13634
text id 13635
text id 13636
text id 13637
text i

text id 14148
text id 14149
text id 14150
text id 14151
text id 14152
text id 14153
text id 14154
text id 14155
text id 14156
text id 14157
text id 14158
text id 14159
text id 14160
text id 14161
text id 14162
text id 14163
text id 14164
text id 14165
text id 14166
text id 14167
text id 14168
text id 14169
text id 14170
text id 14171
text id 14172
text id 14173
text id 14174
text id 14175
text id 14176
text id 14177
text id 14178
text id 14179
text id 14180
text id 14181
text id 14182
text id 14183
text id 14184
text id 14185
text id 14186
text id 14187
text id 14188
text id 14189
text id 14190
text id 14191
text id 14192
text id 14193
text id 14194
text id 14195
text id 14196
text id 14197
text id 14198
text id 14199
text id 14200
text id 14201
text id 14202
text id 14203
text id 14204
text id 14205
text id 14206
text id 14207
text id 14208
text id 14209
text id 14210
text id 14211
text id 14212
text id 14213
text id 14214
text id 14215
text id 14216
text id 14217
text id 14218
text i

text id 14729
text id 14730
text id 14731
text id 14732
text id 14733
text id 14734
text id 14735
text id 14736
text id 14737
text id 14738
text id 14739
text id 14740
text id 14741
text id 14742
text id 14743
text id 14744
text id 14745
text id 14746
text id 14747
text id 14748
text id 14749
text id 14750
text id 14751
text id 14752
text id 14753
text id 14754
text id 14755
text id 14756
text id 14757
text id 14758
text id 14759
text id 14760
text id 14761
text id 14762
text id 14763
text id 14764
text id 14765
text id 14766
text id 14767
text id 14768
text id 14769
text id 14770
text id 14771
text id 14772
text id 14773
text id 14774
text id 14775
text id 14776
text id 14777
text id 14778
text id 14779
text id 14780
text id 14781
text id 14782
text id 14783
text id 14784
text id 14785
text id 14786
text id 14787
text id 14788
text id 14789
text id 14790
text id 14791
text id 14792
text id 14793
text id 14794
text id 14795
text id 14796
text id 14797
text id 14798
text id 14799
text i

# Post processing of adjp from deep extraction

In [159]:
result_dict_deep_p = postprocessing_dict(result_dict_deep)

dict size before processing: 26827
Number of changes: 86
dict size aftter processing: 24191


In [210]:
sort_dict(result_dict_deep_p)

{'much better': 166,
 'so much': 156,
 'so many': 148,
 'not sure': 93,
 'how much': 90,
 'as good': 70,
 'very busy': 69,
 'well worth': 61,
 'super nice': 59,
 'how many': 57,
 'quite good': 56,
 'good too': 56,
 'pretty decent': 55,
 'really great': 54,
 'worth it': 53,
 'very little': 51,
 'very small': 51,
 'highly recommended': 50,
 'even better': 49,
 'too busy': 47,
 'friendly and helpful': 44,
 'great too': 41,
 'how good': 41,
 'too many': 41,
 'just okay': 41,
 'needless to say': 38,
 'really cool': 37,
 'very sweet': 36,
 'very large': 35,
 'very accommodating': 34,
 'very comfortable': 34,
 'way better': 33,
 'too long': 32,
 'so much better': 32,
 'so sweet': 32,
 'a little more': 32,
 'fresh and delicious': 32,
 'really bad': 31,
 'so fresh': 31,
 'consistently good': 31,
 'really friendly': 30,
 'not worth': 30,
 'hot and sour': 30,
 'really busy': 29,
 'most recent': 29,
 'as much': 28,
 'super busy': 28,
 'pretty tasty': 28,
 'fast and friendly': 27,
 'extremely frien

In [215]:
result_deep_df = pd.DataFrame(result_dict_deep_p, index=['frequency']).transpose()
result_deep_df.to_csv('../result/frequency_result_deep_extraction.csv')

# Get frequently-used ADJP deep extraction

In [160]:
frequently_used_adjp_deep = get_frequently_used_adjp(result_dict_deep_p)

0.05
Number of adjp that are top 5% 1210
Count for top 5% 3
Num adjp considered common: 1493


In [221]:
frequently_used_adjp_deep_df = pd.DataFrame(frequently_used_adjp_deep)
frequently_used_adjp_deep_df.to_csv('../result/frequently_used_adjp_deep_extraction.csv')

# post processing of adjp from shallow extraction

In [161]:
result_dict_shallow_p = postprocessing_dict(result_dict_shallow)

dict size before processing: 29437
Number of changes: 86
dict size aftter processing: 27047


In [162]:
len(result_dict_shallow_p)

27047

In [211]:
sort_dict(result_dict_shallow_p)

{'so much': 148,
 'so many': 133,
 'much better': 107,
 'not sure': 81,
 'how much': 78,
 'very busy': 62,
 'worth it': 59,
 'as good': 57,
 'quite good': 53,
 'good too': 53,
 'really great': 53,
 'how many': 52,
 'pretty decent': 50,
 'well worth': 50,
 'highly recommended': 49,
 'very little': 44,
 'too busy': 44,
 'friendly and helpful': 44,
 'very small': 43,
 'super nice': 43,
 'great too': 38,
 'needless to say': 38,
 'too many': 38,
 'just okay': 38,
 'how good': 37,
 'really cool': 36,
 'even better': 33,
 'very sweet': 32,
 'fresh and delicious': 32,
 'very large': 30,
 'hot and sour': 30,
 'really bad': 29,
 'too long': 29,
 'very comfortable': 29,
 'really friendly': 28,
 'a little more': 28,
 'medium rare': 27,
 'really busy': 27,
 'consistently good': 27,
 'very accommodating': 27,
 'pretty tasty': 27,
 'pretty cool': 27,
 'fast and friendly': 26,
 'as much': 26,
 'most recent': 26,
 'so glad': 26,
 'so sweet': 26,
 'super busy': 25,
 'pretty bad': 24,
 'so much better': 

In [214]:
result_shallow_df = pd.DataFrame(result_dict_shallow_p, index=['frequency']).transpose()
result_shallow_df.to_csv('../result/frequency_result_shallow_extraction.csv')

# Get frequently-used ADJP - shallow extraction

In [163]:
def get_frequently_used_adjp(data_dict):
    frequent_count = get_count_for_top_n_percent(data_dict, 5)  # top 5 %
    common_adjp = [k for k, v in data_dict.items() if v >= frequent_count]
    print('Num adjp considered common: {}'.format(len(common_adjp)))
    return common_adjp

In [164]:
frequently_used_adjp_shallow = get_frequently_used_adjp(result_dict_shallow_p)

0.05
Number of adjp that are top 5% 1352
Count for top 5% 2
Num adjp considered common: 2652


In [219]:
frequently_used_adjp_shallow_df = pd.DataFrame(frequently_used_adjp_shallow)
frequently_used_adjp_shallow_df.to_csv('../result/frequently_used_adjp_shallow_extraction.csv')

# Get random biz & extract adjp from their reviews

In [46]:
# random_biz = get_random_business(df)

random_biz = 'VWb8gk_DKUCDKw3Xsdq8Jg'
all_reviews = get_all_reviews_from_biz(random_biz)
adjp_from_biz_shallow_parsing = process_plain_text(all_reviews, True)

Number of reviews: 100
reconnecting to standford core server ... 
Number of text 100
text id 0
text id 1
text id 2
text id 3
text id 4
text id 5
text id 6
text id 7
text id 8
text id 9
text id 10
text id 11
text id 12
text id 13
text id 14
text id 15
text id 16
text id 17
text id 18
text id 19
text id 20
text id 21
text id 22
text id 23
text id 24
text id 25
text id 26
text id 27
text id 28
text id 29
text id 30
text id 31
text id 32
text id 33
text id 34
text id 35
text id 36
text id 37
text id 38
text id 39
text id 40
text id 41
text id 42
text id 43
text id 44
text id 45
text id 46
text id 47
text id 48
text id 49
text id 50
text id 51
text id 52
text id 53
text id 54
text id 55
text id 56
text id 57
text id 58
text id 59
text id 60
text id 61
text id 62
text id 63
text id 64
text id 65
text id 66
text id 67
text id 68
text id 69
text id 70
text id 71
text id 72
text id 73
text id 74
text id 75
text id 76
text id 77
text id 78
text id 79
text id 80
text id 81
text id 82
text id 83
t

## Postprocessing

In [165]:
adjp_from_biz_shallow_parsing_p = postprocessing_list(adjp_from_biz_shallow_parsing)

list size before processing: 354
Number of changes: 354
Number of adj removed: 103
list size aftter processing: 251


# Compare with frequently-used adjp to get indicative adjp

### Deep extraction

In [166]:
indicative_adjp_sf_deep = get_indicative_adjp(frequently_used_adjp_deep, adjp_from_biz_shallow_parsing_p)

In [167]:
len(indicative_adjp_sf_deep)

174

In [168]:
indicative_adjp_sf_deep

['disappointed in my waffle',
 'so long for my waffle',
 'cold out of gate',
 'overly busy',
 'so excited for them to have a move convenient for me location',
 '4 year old',
 'weird with him',
 'disappointed with my experience',
 'plentiful with smiles on their faces even during a rush',
 'open at 8 am for breakfast brunch',
 'absolutely delicious with a nice kick of heat',
 'very slow',
 'light and flavorful',
 'so good',
 'slow to seat us despite there being many empty tables',
 'really one',
 'peppermint ok',
 'better than the service but not necessarily enough',
 'savory to sweet',
 'moldy and rotten',
 'very short with us and very miserable',
 'back to waffles inc',
 'just decent',
 'a bit softer than i like',
 'highly disappointed this time considering my first experience here was great',
 'pleasantly surprised',
 'nice sized',
 'so great',
 'a little small for my preference but the coffee itself is great',
 'fine in summer',
 'too much about the hospitality industry',
 'differen

### Shallow extraction

In [169]:
indicative_adjp_sf_shallow = get_indicative_adjp(frequently_used_adjp_shallow, adjp_from_biz_shallow_parsing_p)

In [170]:
indicative_adjp_sf_shallow

['disappointed in my waffle',
 'so long for my waffle',
 'cold out of gate',
 'overly busy',
 'so excited for them to have a move convenient for me location',
 '4 year old',
 'weird with him',
 'disappointed with my experience',
 'plentiful with smiles on their faces even during a rush',
 'open at 8 am for breakfast brunch',
 'absolutely delicious with a nice kick of heat',
 'very slow',
 'light and flavorful',
 'slow to seat us despite there being many empty tables',
 'really one',
 'peppermint ok',
 'better than the service but not necessarily enough',
 'savory to sweet',
 'moldy and rotten',
 'very short with us and very miserable',
 'back to waffles inc',
 'a bit softer than i like',
 'highly disappointed this time considering my first experience here was great',
 'pleasantly surprised',
 'nice sized',
 'so great',
 'a little small for my preference but the coffee itself is great',
 'fine in summer',
 'too much about the hospitality industry',
 'different than many',
 'watered down

In [171]:
len(indicative_adjp_sf_shallow)

152

## Difference between shallow and deep

In [172]:
in_shallow_not_in_deep, in_deep_not_in_shallow = Diff(indicative_adjp_sf_shallow, indicative_adjp_sf_deep)

In [173]:
in_shallow_not_in_deep

[]

In [174]:
in_deep_not_in_shallow

['pick up',
 'just decent',
 'able to order',
 'really poor',
 'so watery',
 'way too long',
 'quite satisfying',
 'a little runny in the middle',
 'how expensive',
 'thick and tasty',
 'too popular',
 'any more',
 'mediocre at best',
 'hard to decide what to get',
 'hard to decide',
 'thoroughly enjoyable',
 'friendly and informative',
 'so good',
 'patient with us',
 'slightly high',
 'very helpful and accommodating',
 'very well seasoned']

# Is adjective indicative?

Make sure that the phrases do not appear more than 2 times in 200 randomly extracted reviews - if the adjective phrase appears more than 2 times in the 200 reviews, it would be deemed as not indicative and considered as identification error as well.

In [191]:
def get_n_random_reviews_not_in_biz(df, biz, n):
    random_reviews = []
    df_without_biz = df.loc[df['business_id'] != biz]
    text = df_without_biz['text'].astype('str')
    text_list = text.tolist()
    for i in range(n):
        random_reviews.append(random.choice(text_list))
    return random_reviews

In [204]:
random_200_reviews = get_n_random_reviews_not_in_biz(df, random_biz, 200)

# join sentences in list tgt
random_200_reviews = " ".join(random_200_reviews)

random_1000_reviews = get_n_random_reviews_not_in_biz(df, random_biz, 1000)
random_1000_reviews = " ".join(random_1000_reviews)

random_2000_reviews = get_n_random_reviews_not_in_biz(df, random_biz, 2000)
random_2000_reviews = " ".join(random_2000_reviews)

### for deep extraction

In [193]:
def get_adjp_appear_more_than_n_times(random_reviews, adjp_list, n):
    adjp_appear_list = []
    for i in adjp_list:
        if len([m.start() for m in re.finditer(i, random_reviews)]) > n:
            adjp_appear_list.append(i)
    print('Number of adjp that appear more than n times: {}'.format(len(adjp_appear_list)))
    return adjp_appear_list

In [194]:
adjp_list_appear_more_deep = get_adjp_appear_more_than_n_times(random_200_reviews, indicative_adjp_sf_deep, 2)

Number of adjp that appear more than n times: 2


In [195]:
adjp_list_appear_more_deep

['so good', 'pretty good']

In [199]:
adjp_list_appear_more_deep = get_adjp_appear_more_than_n_times(random_1000_reviews, indicative_adjp_sf_deep, 2)

Number of adjp that appear more than n times: 11


In [200]:
adjp_list_appear_more_deep

['very slow',
 'so good',
 'pleasantly surprised',
 'so great',
 'so friendly',
 'just ok',
 'pretty good',
 'pick up',
 'able to order',
 'any more',
 'so hungry']

In [205]:
adjp_list_appear_more_deep = get_adjp_appear_more_than_n_times(random_2000_reviews, indicative_adjp_sf_deep, 2)

Number of adjp that appear more than n times: 13


In [206]:
adjp_list_appear_more_deep

['very slow',
 'so good',
 'pleasantly surprised',
 'so great',
 'so friendly',
 'just ok',
 'pretty good',
 'pick up',
 'able to order',
 'any more',
 'friendly and informative',
 'a little pricey',
 'so hungry']

### for shallow extraction

In [196]:
adjp_list_appear_more_shallow = get_adjp_appear_more_than_n_times(random_200_reviews, indicative_adjp_sf_shallow, 2)

Number of adjp that appear more than n times: 1


In [197]:
adjp_list_appear_more_shallow

['pretty good']

In [201]:
adjp_list_appear_more_shallow = get_adjp_appear_more_than_n_times(random_1000_reviews, indicative_adjp_sf_shallow, 2)

Number of adjp that appear more than n times: 7


In [203]:
adjp_list_appear_more_shallow

['very slow',
 'pleasantly surprised',
 'so great',
 'so friendly',
 'just ok',
 'pretty good',
 'so hungry']

In [207]:
adjp_list_appear_more_shallow = get_adjp_appear_more_than_n_times(random_2000_reviews, indicative_adjp_sf_shallow, 2)

Number of adjp that appear more than n times: 8


In [208]:
adjp_list_appear_more_shallow

['very slow',
 'pleasantly surprised',
 'so great',
 'so friendly',
 'just ok',
 'pretty good',
 'a little pricey',
 'so hungry']

# Output to csv file for manually anaylsis

### Indicative adjp

In [130]:
list_to_csv(indicative_adjp_sf_deep, '../result/deep_extraction_indicative_adjp_result.csv')
list_to_csv(indicative_adjp_sf_shallow, '../result/shallow_extraction_indicative_adjp_result.csv')

### Sentence in biz

In [182]:
# split into sentences for manual review & output to csv
reviews_from_biz_split = split_into_sentence(all_reviews, '../result/biz_review.csv')

0
10
20
30
40
50
60
70
80
90


In [183]:
# Using DataFrame.insert() to add a column
reviews_from_biz_split.insert(0, "Indicative", [False for i in range(len(reviews_from_biz_split))], True)
reviews_from_biz_split.insert(2, "ADJP - deep extraction", ['-' for i in range(len(reviews_from_biz_split))], True)
reviews_from_biz_split.insert(3, "ADJP - shallow extraction", ['-' for i in range(len(reviews_from_biz_split))], True)
reviews_from_biz_split.rename(columns={0: "Sentences in biz"}, inplace=True)
reviews_from_biz_split.head()

Unnamed: 0,Indicative,Sentences in biz,ADJP - deep extraction,ADJP - shallow extraction
0,False,Disappointed in my Waffle.,-,-
1,False,After my third cup of coffee my waffle arrived.,-,-
2,False,It was obviously sitting under some sort of wa...,-,-
3,False,I waited so long for my waffle I didn't have t...,-,-
4,False,I ate less than half and didn't want to eat an...,-,-


In [188]:
deep_indication_str = '\t'.join(indicative_adjp_sf_deep)
for index, row in reviews_from_biz_split.iterrows():
#     print(index)
    for adjp in indicative_adjp_sf_deep:
        if adjp in row["Sentences in biz"].lower():
#             reviews_from_biz_split.loc[index, 'TOC']
            reviews_from_biz_split.loc[index, 'ADJP - deep extraction'] = adjp
            reviews_from_biz_split.loc[index, 'Indicative'] = True
    for adjp in indicative_adjp_sf_shallow:
        if adjp in row["Sentences in biz"].lower():
#             print(adjp)
            reviews_from_biz_split.loc[index, 'ADJP - shallow extraction'] = adjp
            reviews_from_biz_split.loc[index, 'Indicative'] = True




In [189]:
reviews_from_biz_split.head()

Unnamed: 0,Indicative,Sentences in biz,ADJP - deep extraction,ADJP - shallow extraction
0,True,Disappointed in my Waffle.,disappointed in my waffle,disappointed in my waffle
1,False,After my third cup of coffee my waffle arrived.,-,-
2,False,It was obviously sitting under some sort of wa...,-,-
3,True,I waited so long for my waffle I didn't have t...,so long for my waffle,so long for my waffle
4,True,I ate less than half and didn't want to eat an...,any more,-


In [190]:
reviews_from_biz_split.to_csv('../result/business_adjp_analysis.csv')