# Exercise 2 of Week 1 Assignment - Construct the binary incidence matrix
Construct the binary incidence matrix using the features extracted from the corpus.
The corpus (271 text documents) is available at
https://github.com/Ramaseshanr/anlp/blob/master/corpus/phy_corpus.txt.
It contains questions from the Kinematics class of physics problems,
sourced from the Internet.
- In this assignment, you need to develop a python program that uses
the knowledge related to Kinematics and build a table similar to the
one shown below for all the documents in the corpus.
- The program should be able to read each problem, capture the known
values (such as speed=10m/s, time=5s) and fill the respective cells
in the table. For example, if you find 10 m/s for document 1, fill the
speed with value row for D1 as 1.
- Please note that problems may or may not contain all nine terms
listed.
- The corpus may contain duplicate entries
- You may use NLTK or any equivalent API for this assignment

| Terms                   | D<sub>1</sub> | D<sub>2</sub> | ... | D<sub>271</sub> |
|-------------------------|----|----|-----|------|
| Speed with value        | 1  | 0  | ... | 0    |
| Distance with value     | 0  | 0  | ... | 1    |
| Acceleration with value | 0  | 0  | ... | 0    |
| Time with value         | 0  | 1  | ... | 0    |

In [1]:
import urllib.request
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import RegexpTokenizer # to get Kinematics
from nltk.tokenize import sent_tokenize # to find number of sentences in this corpus
from nltk.tokenize import word_tokenize
import pandas as pd # for table creation

## Download the corpus

In [2]:
url = 'https://raw.githubusercontent.com/Ramaseshanr/anlp/master/corpus/phy_corpus.txt'

# Download the corpus and decode it as UTF-8 text. The context manager closes
# the HTTP response as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')

# Parse the payload once, with an explicit parser (so BeautifulSoup does not
# have to guess one), and keep only the text content.
soup = BeautifulSoup(html, 'html.parser')
raw = soup.get_text()
raw[:20]

# WRITE IT TO THE FILE if required
#with open("text.txt", "w", encoding="utf-8") as file:
#    file.write(raw)

'An airplane accelera'

## Preprocess the corpus
- Each Paragraph is considered as a separate document

In [3]:
# Extract paragraphs.
# The text has indices in front of its lines, so sent_tokenize() cannot be
# used. Instead, split the corpus on newlines and keep every non-empty line as
# one paragraph; the kinematics terms are extracted from each paragraph later.
paragraphs = list(filter(None, raw.split('\n')))

In [4]:
# Sanity check: the number of extracted paragraphs should equal the number of
# documents in the corpus (271 according to the assignment text).
len(paragraphs)

271

## Find the various kinds of units found in the given corpus

In [5]:
# Create a RegEx tokenizer that extracts "<number> <unit>" phrases
# (the kinematics values) from each paragraph.
# Raw strings keep the regex escapes (\d, \s, ...) intact without relying on
# Python passing unknown string escapes through unchanged.
# NOTE: the '.' in pattern1 and pattern3 is unescaped, so it matches any
# character, not only a decimal point (hence "x{anything}xx" below).
pattern1 = r'\d+.[\da-zA-Z]+\s[a-zA-Z]*[\/a-zA-Z\d\^\-]{1,}'  # x{anything}xx X/X
pattern2 = r'\d+\s[a-zA-Z]*\/[a-zA-Z]*[\d]*'  # x X/X
pattern3 = r'\d+.\d+\s[a-zA-Z]+'  # x{anything}xx X
pattern4 = r'\d+\s[a-zA-Z]+'  # x XX
pattern5 = r'\.\d+\s[a-zA-Z]+'  # .x XX
# Alternation order matters: earlier patterns win when several could match at
# the same position.
regex_tokenizer = RegexpTokenizer(
    '|'.join([pattern1, pattern2, pattern3, pattern4, pattern5])
)

para_tokens = []   # para_tokens[i] holds the matches found in paragraphs[i]
final_tokens = []  # every match from every paragraph, in corpus order
for doc_number, paragraph in enumerate(paragraphs, start=1):
    regex_tokens = regex_tokenizer.tokenize(paragraph)
    # Always record an entry (possibly empty) so that positions in para_tokens
    # stay aligned with positions in paragraphs.
    para_tokens.append(list(regex_tokens))
    if regex_tokens:
        final_tokens.extend(regex_tokens)
        print(doc_number, regex_tokens)

total_tokens = len(final_tokens)
print('Total tokens found: ', total_tokens)

1 ['3.20 m/s2', '32.8 s']
2 ['5.21 seconds', '110 m']
3 ['2.60 seconds']
4 ['18.5 m/s', '46.1 m/s', '2.47 seconds']
5 ['1.40 meters', '1.67 m/s2']
6 ['444 m/s', '1.83 seconds']
7 ['7.10 m/s', '35.4 m']
8 ['3 m/s2', '65 m/s']
9 ['22.4 m/s', '2.55 s']
10 ['2.62 m']
11 ['1.29 m']
12 ['521 m/s', '0.840 m']
13 ['6.25 s']
14 ['370 m above']
15 ['367 m/s', '0.0621 m']
16 ['3.41 s']
17 ['290 m in', '3.90 m/s2']
18 ['88.3 m/s', '1365 m to']
19 ['112 m/s', '398 m']
20 ['1 m/s', '2.23 mi/hr', '91.5 m']
21 ['0.5 km', '1 hour later']
22 ['12 m/sec', '36 seconds']
23 ['2 m/s', '12 s he/she']
24 ['50 km traveling', '10 km/hr']
25 ['12 m/s', '3.00 minutes']
26 ['25 min at', '12 m/s']
27 ['3250 m/s', '10 m/s2', '215 km']
28 ['0.6 m/s2', '55 mi/h', '60 mi/h']
29 ['23.7 km/h', '0.92 m/s2', '3.6 s']
30 ['30 degree hill', '3.30 m/s2', '110 m']
31 ['48 m/s', '12 m/s', '5 s']
32 ['24 m/s', '315 m']
33 ['50 km/hr', '90 km/hr', '15 seconds']
34 ['9000 meters in', '12.12 seconds']
35 ['528 meters in', '4 second

In [6]:
def is_number_repl_dot_isdigit(s):
    '''
    Tell whether s looks like an unsigned number.

    Only the first '.' in s is ignored; whatever remains must be
    non-empty and pass str.isdigit(). Signs, exponents and a second
    '.' therefore make the result False.
    https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float
    '''
    # Joining the text on either side of the first '.' drops exactly that dot;
    # when s has no '.', the head is s itself and the tail is empty.
    head, _, tail = s.partition('.')
    digits_only = head + tail
    return digits_only.isdigit()

In [7]:
def get_tokens(final_tokens):
    '''
    Split every string in final_tokens on single spaces and return the
    unique pieces as a set.

    final_tokens: iterable of strings (e.g. ['3.20 m/s2', '32.8 s']).
    Returns: set of the space-separated pieces; an empty input gives an
    empty set. Consecutive spaces produce an empty-string piece, because
    the split is on exactly one space.
    '''
    # A set comprehension builds the result directly instead of running a
    # list comprehension only for its side effects.
    return {piece for token in final_tokens for piece in token.split(' ')}

In [8]:
# Break the matched phrases into unique space-separated words, then discard
# the purely numeric ones so that only candidate unit words remain.
final_tokens = [
    word
    for word in get_tokens(final_tokens)
    if not is_number_repl_dot_isdigit(word)
]
sorted(final_tokens)

['0)above',
 '00-kg',
 '100-m',
 '100-mile',
 '10^14',
 '10^6',
 '2nd',
 '3,000',
 '3.0E8',
 '40,075',
 '5th',
 '7e6',
 '8-km',
 'Columbia',
 'Earth',
 'N',
 'Peachtree',
 'U',
 'Wichita',
 'above',
 'after',
 'ahead',
 'and',
 'angle',
 'at',
 'away',
 'before',
 'cm',
 'dash',
 'days',
 'degree',
 'degrees',
 'down',
 'drop',
 'due',
 'east',
 'feet',
 'foot',
 'for',
 'from',
 'h',
 'he/she',
 'high',
 'hill',
 'hour',
 'hours',
 'if',
 'in',
 'instead',
 'is',
 'it',
 'its',
 'kg',
 'kilograms',
 'km',
 'km/h',
 'km/hr',
 'km/s',
 'kmh',
 'later',
 'long',
 'm',
 'm/s',
 'm/s/s',
 'm/s2',
 'm/s^2',
 'm/sec',
 'meter',
 'meters',
 'meters/second',
 'metersvper',
 'mi',
 'mi/h',
 'mi/hr',
 'mile',
 'mile/minute',
 'miles',
 'min',
 'minutes',
 'more',
 'mph',
 'ms^-1',
 'near',
 'of',
 'off',
 'race',
 'reaches',
 'rest',
 'rock',
 'running',
 's',
 'sec',
 'second',
 'seconds',
 'she',
 'straight',
 't',
 'tall',
 'the',
 'then',
 'threw',
 'to',
 'towns',
 'traveling',
 'trip',
 'u

## Create the units table based on the above findings

In [9]:
# Unit words that mark each kind of kinematics quantity. A document gets a 1
# for a quantity when any of its extracted tokens equals one of these words.

# Speed: distance per time. 'metersvper' is kept exactly as it appears among
# the corpus tokens (presumably a typo for "meters per" - verify in corpus).
speed_with_val = ['mi/hr', 'mi/h', 'mile/minute', 'metersvper', 'meters/second',
                  'm/sec', 'm/s', 'ms^-1', 'mph', 'km/h', 'km/hr', 'km/s', 'kmh']
# Distance: plain length units.
# NOTE(review): '-km', '-mile' and '-m' can never equal a whole token such as
# '8-km' or '100-m', so they only take effect if matching becomes suffix-based.
distance_with_val = ['miles', 'mile', 'mi', 'meters', 'meter', 'm', 'feet', 'foot',
                     'km', 'cm', '-km', '-mile', '-m']
# Acceleration: distance per time squared.
acceleration_with_val = ['m/s2', 'm/s^2', 'm/s/s']
# Time: duration units (minutes belong here, not under speed).
time_with_val = ['hours', 'hour', 'h', 'minutes', 'min', 'seconds', 'second',
                 'sec', 's', 'days']

## Create the table and fill it

In [10]:
# Column labels of the incidence table, one per kinematics term.
column_header = [
    'Speed with value',
    'Distance with value',
    'Acceleration with value',
    'Time with value',
]
# Start from an empty table that has only the column labels; rows are added
# per document afterwards.
df = pd.DataFrame(columns=column_header)

In [11]:
# Build one binary incidence row per paragraph (document): each cell is 1 when
# any unit word of that category is among the paragraph's tokens, else 0.
rows = []
for tokens in para_tokens:
    tt = get_tokens(tokens)
    rows.append([
        int(any(s in tt for s in speed_with_val)),
        int(any(s in tt for s in distance_with_val)),
        int(any(s in tt for s in acceleration_with_val)),
        int(any(s in tt for s in time_with_val)),
    ])

# Create the frame in a single step. DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every row. Rebuilding the frame
# also makes re-running this cell safe (no duplicated rows).
df = pd.DataFrame(rows, columns=df.columns)

In [12]:
# Display the incidence table: one row per document, one column per term.
df

Unnamed: 0,Speed with value,Distance with value,Acceleration with value,Time with value
0,0,0,1,1
1,0,1,0,1
2,0,0,0,1
3,1,0,0,1
4,0,1,1,0
5,1,0,0,1
6,1,1,0,0
7,1,0,1,0
8,1,0,0,1
9,0,1,0,0


In [13]:
# Transposed view: terms as rows and documents as columns, the same
# orientation as the example table in the assignment text.
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,261,262,263,264,265,266,267,268,269,270
Speed with value,0,0,0,1,0,1,1,1,1,0,...,0,0,1,0,1,1,1,1,1,1
Distance with value,0,1,0,0,1,0,1,0,0,1,...,1,1,0,1,0,0,1,1,0,0
Acceleration with value,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
Time with value,1,1,1,1,0,1,0,0,1,0,...,1,0,1,0,0,0,0,0,1,1
