## Sample MAT Tags

Run sample text through the MAT tagger and find examples for testing our implementation. Build quick datasets for validation purposes.

In [35]:
import re
import stanza
import numpy as np
import pandas as pd

from tqdm import tqdm

# Build an English stanza pipeline with the tokenizer and POS tagger, running on CPU.
# NOTE(review): `pipeline` is not referenced again in the visible part of this notebook.
pipeline = stanza.Pipeline(lang='en', processors='tokenize,pos', use_gpu=False)

2023-02-10 22:32:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-10 22:32:58 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2023-02-10 22:32:58 INFO: Use device: cpu
2023-02-10 22:32:58 INFO: Loading: tokenize
2023-02-10 22:32:58 INFO: Loading: pos
2023-02-10 22:32:59 INFO: Done loading processors!


### Brown Text Tag Samples

In [64]:
# Tagged Brown text file (local path on the author's machine).
fp = '/Users/kalkiek/Documents/MDA Tagger/MAT_MDA Tagger/brown_MAT.txt'

# Read the whole file into memory; the handle is only needed for the read itself.
with open(fp, 'r') as f:
    text = f.read()

# One entry per newline-separated line of the file.
lines = text.split('\n')
print(len(lines))

1181225


In [44]:
# Two hand-written sample lines used to try out the parsing logic in the next cell:
# one with bracketed extra tags after the `token_TAG` head, one with only the head.
s1 = "stocking_NOMZ [WZPRES] [TSUB]"
s2 = "stocking_VBG"

In [48]:
# Scratch check of the parsing logic on the sample line `s1`.
parts = s1.split(' ')
print(parts)

# The first space-separated part is `token_TAG`.
token, pos_tag = parts[0].split('_')
if len(parts) > 1:
    # Raw string literal: '\[' is not a valid escape sequence in a normal string
    # and triggers a warning on recent Python versions.
    print(set(re.findall(r'\[(.*?)\]', s1)))

print(token, pos_tag)

# NOTE(review): `pos_tags` is assigned in a cell further down this notebook, so this
# cell only works once that cell has been executed.
if pos_tag not in pos_tags:
    print(pos_tag)

['stocking_NOMZ', '[WZPRES]', '[TSUB]']
{'TSUB', 'WZPRES'}
stocking NOMZ
NOMZ


In [68]:
# Part-of-speech tags that are NOT collected as tags of interest
# (this looks like the Penn Treebank word-level tag set).
pos_tags = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN',
    'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS',
    'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
    'WP', 'WP$', 'WRB',
]


def parse_token_and_tags(line):
    """Split one tagged line into its token and the set of tags of interest.

    A line is expected to look like ``token_TAG`` optionally followed by
    space-separated bracketed tags, e.g. ``stocking_NOMZ [WZPRES] [TSUB]``.

    Args:
        line: One line of tagged text.

    Returns:
        ``(token, tags)`` where ``tags`` is a set holding
        * the ``TAG`` suffix, if it is not in ``pos_tags`` and is at least
          three characters long, and
        * every bracketed tag found on the line (only looked for when the
          line has more than one space-separated part).

    Raises:
        ValueError: If the first part of the line contains no underscore
            (for example a blank line).
    """
    all_tags = set()
    parts = line.split(' ')

    # Split on the LAST underscore so a token that itself contains '_' still
    # parses instead of failing the two-value unpack.
    token, pos_tag = parts[0].rsplit('_', 1)

    # Suffixes shorter than three characters are ignored; presumably this
    # filters short punctuation tags such as '.' -- TODO confirm.
    if pos_tag not in pos_tags and len(pos_tag) >= 3:
        all_tags.add(pos_tag)

    if len(parts) > 1:
        # Raw string literal: '\[' is not a valid escape in a normal string.
        all_tags.update(re.findall(r'\[(.*?)\]', line))

    return token, all_tags
    
    
def get_surrounding_context(lines, i, n=5):
    """Return the window of lines around index ``i``.

    The window is the half-open slice ``lines[i - n : i + n]`` clamped to the
    bounds of ``lines``: up to ``n`` lines before ``i``, line ``i`` itself,
    and up to ``n - 1`` lines after it.

    Args:
        lines: Sequence of lines.
        i: Index of the line of interest.
        n: Half-width of the window.

    Returns:
        The slice of ``lines`` covering the window.
    """
    total_lines = len(lines)

    # Clamp both ends. The previous code special-cased ``i + n == total_lines``
    # by ending the slice at ``total_lines - 1``, which dropped the final line
    # in exactly that one case.
    before = max(i - n, 0)
    after = min(i + n, total_lines)

    return lines[before:after]


def parse_sentence_from_context(context_lines):
    """Join the tokens of a run of tagged lines into one space-separated string.

    Each line is expected to start with ``token_TAG``; anything after the first
    space (e.g. bracketed tags) is ignored. Lines whose first part has no
    underscore, such as blank lines, are skipped.

    NOTE(review): the previous version called ``parse_token``, which is not
    defined anywhere in this notebook, so it raised NameError on a fresh
    kernel. The token extraction is inlined here; confirm it matches what the
    missing helper did.

    Args:
        context_lines: Iterable of tagged lines.

    Returns:
        The tokens joined by single spaces.
    """
    tokens = []
    for context_line in context_lines:
        head = context_line.split(' ')[0]
        # Split on the last underscore so tokens containing '_' stay intact.
        token, sep, _tag = head.rpartition('_')
        if sep:
            tokens.append(token)
    return " ".join(tokens)


def get_all_unique_tags(lines):
    """Collect every distinct tag reported by ``parse_token_and_tags``.

    Args:
        lines: Iterable of tagged lines; wrapped in ``tqdm`` for a progress bar.

    Returns:
        A set with the union of the tag sets of all lines that could be parsed.
    """
    all_tags = set()
    for line in tqdm(lines):
        try:
            _, tags = parse_token_and_tags(line)
        except ValueError:
            # Raised by the two-value unpack when a line has no `token_TAG`
            # head (e.g. a blank line). Only that case is skipped; any other
            # error is a real bug and should surface instead of being hidden.
            continue
        all_tags.update(tags)
    return all_tags

In [69]:
# Collect the distinct tags found in the currently loaded `lines` and show how many there are.
unique_tags = get_all_unique_tags(lines)
len(unique_tags)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1181225/1181225 [00:00<00:00, 1326938.84it/s]


65

In [70]:
# Display the tag names themselves.
unique_tags

{'-LRB-',
 '-RRB-',
 'AMP',
 'ANDC',
 'BEMA',
 'BYPA',
 'CAUS',
 'CONC',
 'COND',
 'CONJ',
 'CONT',
 'DEMO',
 'DEMP',
 'DPAR',
 'DWNT',
 'EMPH',
 'FPP1',
 'GER',
 'HDG',
 'INPR',
 'NEMD',
 'NOMZ',
 'NULL',
 'OSUB',
 'PASS',
 'PASTP',
 'PEAS',
 'PHC',
 'PIN',
 'PIRE',
 'PIT',
 'PLACE',
 'POMD',
 'PRED',
 'PRESP',
 'PRIV',
 'PRMD',
 'PROD',
 'PUBV',
 'QUAN',
 'QUPR',
 'SERE',
 'SMP',
 'SPAU',
 'SPIN',
 'SPP2',
 'STPR',
 'SUAV',
 'SYNE',
 'THAC',
 'THATD',
 'THVC',
 'TIME',
 'TOBJ',
 'TPP3',
 'TSUB',
 'VPRT',
 'WHCL',
 'WHOBJ',
 'WHQU',
 'WHSUB',
 'WPS',
 'WZPAST',
 'WZPRES',
 'XX0'}

In [71]:
# Build one record per (token, tag) pair found in `lines`, together with the
# text surrounding the token.
rows = []

for i, line in enumerate(lines):
    try:
        token, tags = parse_token_and_tags(line)
        if tags:
            context_lines = get_surrounding_context(lines, i, n=10)
            sentence = parse_sentence_from_context(context_lines)
            for tag in tags:
                rows.append({
                    'keyword': token,
                    'tag': tag,
                    'sentence': sentence
                })
    except ValueError:
        # A line without a `token_TAG` head fails the unpack in
        # parse_token_and_tags; skip it. Anything else (e.g. a NameError from
        # a missing helper) must not be swallowed, otherwise this cell quietly
        # produces an empty frame.
        continue

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,keyword,tag,sentence
0,said,PUBV,The Fulton County Grand Jury said Friday an in...
1,investigation,NOMZ,The Fulton County Grand Jury said Friday an in...
2,of,PIN,The Fulton County Grand Jury said Friday an in...
3,election,NOMZ,Jury said Friday an investigation of Atlanta '...
4,no,SYNE,an investigation of Atlanta 's recent primary ...


In [72]:
# Number of (keyword, tag, sentence) rows built in the previous cell.
len(df)

451399

**Select samples for each tag**

In [73]:
# Write every row to a tab-separated file in the working directory, without the index column.
df.to_csv('brown_tags.tsv', sep='\t', index=False)

In [74]:
# Draw 25 rows per tag; the fixed random_state makes the draw repeatable.
# NOTE(review): presumably this fails if any tag has fewer than 25 rows -- confirm against pandas docs.
sample_df = df.groupby("tag").sample(n=25, random_state=1)
sample_df.head()

Unnamed: 0,keyword,tag,sentence
86776,-LRB-,-LRB-,was he who turned the attention of William Llo...
230118,-LRB-,-LRB-,", And with never a glance at the sky . -LRB- O..."
59588,-LRB-,-LRB-,Sutherland first sang Lucia at Covent Garden i...
112060,-LRB-,-LRB-,. As an aid in the prevention of bacterial dia...
51503,-LRB-,-LRB-,"hand : <The Nation> , Walter Lippmann and othe..."


In [75]:
# Write the per-tag sample to a tab-separated file in the working directory, without the index column.
sample_df.to_csv('brown_sample_tags.tsv', sep='\t', index=False)

### Amazon Reviews Sample Text

In [None]:
# First time setup
fp = '/Users/kalkiek/Downloads/amazon_review_full_csv/test.csv'
out = '/Users/kalkiek/Downloads/amazon_review_full_csv/amazon_test.txt'

amazon_df = pd.read_csv(fp, names=['title', 'text'])

with open(out, 'w') as f:
    for index, row in amazon_df.iterrows():
        f.write(row['text'] + '\n')

In [76]:
# Tagged Amazon review text file (local path on the author's machine).
fp = '/Users/kalkiek/Documents/MDA Tagger/MAT_MDA Tagger/amazon_test_MAT.txt'

# Read the whole file into memory; the handle is only needed for the read itself.
with open(fp, 'r') as f:
    text = f.read()

# One entry per newline-separated line of the file.
lines = text.split('\n')
print(len(lines))

57045406


In [78]:
# Build one record per (token, tag) pair found in `lines`, together with the
# text surrounding the token.
rows = []

# Wrap `lines` (which has a length) in tqdm rather than the enumerate object,
# so the progress bar knows the total and can show percentage / ETA.
for i, line in enumerate(tqdm(lines)):
    try:
        token, tags = parse_token_and_tags(line)
        if tags:
            context_lines = get_surrounding_context(lines, i, n=10)
            sentence = parse_sentence_from_context(context_lines)
            for tag in tags:
                rows.append({
                    'keyword': token,
                    'tag': tag,
                    'sentence': sentence
                })
    except ValueError:
        # A line without a `token_TAG` head fails the unpack in
        # parse_token_and_tags; skip it. Anything else (e.g. a NameError from
        # a missing helper) must not be swallowed, otherwise this cell quietly
        # produces an empty frame.
        continue

df = pd.DataFrame(rows)
df.head()

57045406it [02:30, 378073.51it/s]


Unnamed: 0,keyword,tag,sentence
0,This,DEMO,"This model may be ok for sedentary types , but"
1,may,POMD,"This model may be ok for sedentary types , but..."
2,be,PASS,"This model may be ok for sedentary types , but..."
3,for,PIN,"This model may be ok for sedentary types , but..."
4,I,FPP1,"This model may be ok for sedentary types , but..."


In [79]:
# Number of rows per tag, most frequent first.
df['tag'].value_counts()

PIN      4184355
VPRT     3446983
FPP1     2094875
BEMA     1410347
PIT      1244208
          ...   
WHQU       18300
SPIN       12731
PIRE       10737
WHOBJ       8836
WPS         2319
Name: tag, Length: 65, dtype: int64

In [80]:
# Total number of rows, then the number of distinct tags among them.
print(len(df))
print(len(df['tag'].unique()))

25731593
65


In [81]:
# Write every row to a tab-separated file in the working directory, without the index column.
df.to_csv('amazon_tags.tsv', sep='\t', index=False)

In [82]:
# Draw 25 rows per tag; the fixed random_state makes the draw repeatable.
# NOTE(review): presumably this fails if any tag has fewer than 25 rows -- confirm against pandas docs.
sample_df = df.groupby("tag").sample(n=25, random_state=1)
sample_df.head()

Unnamed: 0,keyword,tag,sentence
4019219,-LRB-,-LRB-,Ties takes numerous nails . It was a lifesaver...
14368694,-LRB-,-LRB-,The DHCP Client Tables would not update and my...
16629196,-LRB-,-LRB-,8:07 -RRB- 2 . Your love is taking me over -LR...
8073761,-LRB-,-LRB-,"the trip , the maps look like what we need -LR..."
24299634,-LRB-,-LRB-,"a religious person , and I 've even gotten int..."


In [83]:
# Write the per-tag sample to a tab-separated file in the working directory, without the index column.
sample_df.to_csv('amazon_sample_tags.tsv', sep='\t', index=False)