## PRI Lab 1

general imports

In [1]:
import numpy as np, pandas as pd
import os, os.path
from collections import defaultdict

*sklearn*, *nltk* and *whoosh* imports

In [2]:
import sklearn
from sklearn.feature_extraction import text

In [3]:
from whoosh import index, fields, qparser, scoring, matching

In [4]:
import nltk, nltk.stem
from nltk.classify import Senna

Notes:<br>
1) Run these commands if an NLTK call fails because a resource is missing:<br>
nltk.download('punkt')<br>
nltk.download('averaged_perceptron_tagger')<br>
nltk.download('maxent_ne_chunker')<br>
nltk.download('words')<br>
nltk.download('wordnet')<br>
2) Download the SENNA executable if necessary: https://github.com/baojie/senna/tree/master

## Exercise 1

In [5]:
def list_files(path):
    """Return the paths of all files found under ``path``, recursively.

    Each entry is the walked directory joined with the file name, listed in
    the order in which ``os.walk`` visits the directories.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
    ]

In [6]:
# Build the path with os.path.join: a literal backslash is only a path
# separator on Windows, so the hard-coded form finds nothing elsewhere.
list_files(os.path.join('BBC News Summary', 'Summaries'))[0:2]

['BBC News Summary\\Summaries\\business\\001.txt',
 'BBC News Summary\\Summaries\\business\\002.txt']

In [7]:
with open('doc1.txt', 'r') as f: print(f.read())

British hurdler Sarah Claxton was confident she could win her first major medals at European indoor championships in Madrid. For the first time, Claxton has only been preparing for a campaign over the hurdles - which could explain her leap in form.


In [8]:
with open('doc2.txt', 'r') as f: print(f.read())

Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage. Now, the Scotland-born athlete owns the equal fifth-fastest time in the world this year. And at last week's Birmingham Grand Prix, Claxton left European medal favourite Russian Irina Shevchenko trailing in sixth spot.


In [9]:
def token_counts_simple(filename):
    """Count the whitespace-separated tokens of a text file.

    Tokens are used verbatim: case and attached punctuation are kept.
    Returns a ``defaultdict(int)`` mapping token -> number of occurrences.
    """
    counts = defaultdict(int)
    with open(filename, 'r') as handle:
        # split() without arguments treats newlines like any other whitespace,
        # so splitting the whole text yields the same tokens as line by line.
        for word in handle.read().split():
            counts[word] += 1
    return counts

In [10]:
# Shared WordNet lemmatizer, used by token_counts_nltk and boolean_model below.
# Lemmatization itself is an optional normalization step of the exercise.
lemmatizer = nltk.stem.WordNetLemmatizer() #optional

In [11]:
def token_counts_nltk(filename):
    """Count the lemmatized NLTK tokens of a text file.

    The text is split into sentences, each sentence into word tokens, and
    every token is passed through the notebook-level ``lemmatizer``.
    Returns a ``defaultdict(int)`` mapping lemma -> number of occurrences.
    """
    counts = defaultdict(int)
    with open(filename, 'r') as handle:
        content = handle.read()
    for sent in nltk.sent_tokenize(content):
        for raw_token in nltk.word_tokenize(sent):
            lemma = lemmatizer.lemmatize(raw_token)
            counts[lemma] += 1
    return counts

In [12]:
print(token_counts_simple('doc1.txt'))
print(token_counts_nltk('doc1.txt'))

defaultdict(<class 'int'>, {'British': 1, 'hurdler': 1, 'Sarah': 1, 'Claxton': 2, 'was': 1, 'confident': 1, 'she': 1, 'could': 2, 'win': 1, 'her': 2, 'first': 2, 'major': 1, 'medals': 1, 'at': 1, 'European': 1, 'indoor': 1, 'championships': 1, 'in': 2, 'Madrid.': 1, 'For': 1, 'the': 2, 'time,': 1, 'has': 1, 'only': 1, 'been': 1, 'preparing': 1, 'for': 1, 'a': 1, 'campaign': 1, 'over': 1, 'hurdles': 1, '-': 1, 'which': 1, 'explain': 1, 'leap': 1, 'form.': 1})
defaultdict(<class 'int'>, {'British': 1, 'hurdler': 1, 'Sarah': 1, 'Claxton': 2, 'wa': 1, 'confident': 1, 'she': 1, 'could': 2, 'win': 1, 'her': 2, 'first': 2, 'major': 1, 'medal': 1, 'at': 1, 'European': 1, 'indoor': 1, 'championship': 1, 'in': 2, 'Madrid': 1, '.': 2, 'For': 1, 'the': 2, 'time': 1, ',': 1, 'ha': 1, 'only': 1, 'been': 1, 'preparing': 1, 'for': 1, 'a': 1, 'campaign': 1, 'over': 1, 'hurdle': 1, '-': 1, 'which': 1, 'explain': 1, 'leap': 1, 'form': 1})


In [13]:
def count_syntactic_class(filename):
    """Count how often each part-of-speech tag occurs in a text file.

    The text is sentence-split and word-tokenized with NLTK, then tagged
    with ``nltk.pos_tag``. Returns a ``defaultdict(int)`` mapping tag -> count.
    """
    tag_counts = defaultdict(int)
    with open(filename, 'r') as handle:
        sentences = nltk.sent_tokenize(handle.read())
    for sent in sentences:
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for _word, pos in tagged:
            tag_counts[pos] += 1
    return tag_counts

In [14]:
print(count_syntactic_class('doc1.txt'))

defaultdict(<class 'int'>, {'JJ': 6, 'NN': 6, 'NNP': 4, 'VBD': 1, 'PRP': 1, 'MD': 2, 'VB': 2, 'PRP$': 2, 'NNS': 3, 'IN': 6, '.': 2, 'DT': 3, ',': 1, 'VBZ': 1, 'RB': 1, 'VBN': 1, 'VBG': 1, ':': 1, 'WDT': 1})


In [15]:
def senna(filename):
    """Tag a text file with the SENNA pipeline, one line at a time.

    A ``Senna`` tagger is created for the 'senna-master' location with the
    'pos', 'chk' and 'ner' operations. Every line is split on whitespace and
    tagged; the result is a list of ``(word, chk, ner, pos)`` tuples in
    reading order.
    """
    tagger = Senna('senna-master', ['pos','chk','ner'])
    rows = []
    with open(filename, 'r') as handle:
        for line in handle:
            rows.extend(
                (item['word'], item['chk'], item['ner'], item['pos'])
                for item in tagger.tag(line.split())
            )
    return rows

In [16]:
print(senna('doc1.txt'))

[('British', 'B-NP', 'B-MISC', 'JJ'), ('hurdler', 'I-NP', 'O', 'NN'), ('Sarah', 'I-NP', 'B-PER', 'NNP'), ('Claxton', 'I-NP', 'I-PER', 'NNP'), ('was', 'B-VP', 'O', 'VBD'), ('confident', 'B-ADJP', 'O', 'JJ'), ('she', 'B-NP', 'O', 'PRP'), ('could', 'B-VP', 'O', 'MD'), ('win', 'I-VP', 'O', 'VB'), ('her', 'B-NP', 'O', 'PRP$'), ('first', 'I-NP', 'O', 'JJ'), ('major', 'I-NP', 'O', 'JJ'), ('medals', 'I-NP', 'O', 'NNS'), ('at', 'B-PP', 'O', 'IN'), ('European', 'B-NP', 'B-MISC', 'JJ'), ('indoor', 'I-NP', 'O', 'JJ'), ('championships', 'I-NP', 'O', 'NNS'), ('in', 'B-PP', 'O', 'IN'), ('Madrid.', 'B-NP', 'B-MISC', 'NNP'), ('For', 'B-PP', 'I-MISC', 'IN'), ('the', 'B-NP', 'O', 'DT'), ('first', 'I-NP', 'O', 'JJ'), ('time,', 'I-NP', 'O', 'NN'), ('Claxton', 'I-NP', 'B-ORG', 'NNP'), ('has', 'B-VP', 'O', 'VBZ'), ('only', 'I-VP', 'O', 'RB'), ('been', 'I-VP', 'O', 'VBN'), ('preparing', 'I-VP', 'O', 'VBG'), ('for', 'B-PP', 'O', 'IN'), ('a', 'B-NP', 'O', 'DT'), ('campaign', 'I-NP', 'O', 'NN'), ('over', 'B-PP',

## Exercise 2

In [17]:
def sklearn_vectorizer(filenames, tfidf=False):
    """Vectorize the contents of the given files with scikit-learn.

    Uses ``TfidfVectorizer`` when ``tfidf`` is true, ``CountVectorizer``
    otherwise, both with default settings. Returns ``(data, features)``: the
    matrix produced by ``fit_transform`` (one row per file, in input order)
    and the feature names from ``get_feature_names_out``.
    """
    corpus = []
    for name in filenames:
        with open(name, 'r') as handle:
            corpus.append(handle.read())
    if tfidf:
        model = text.TfidfVectorizer()
    else:
        model = text.CountVectorizer()
    matrix = model.fit_transform(corpus)
    return matrix, model.get_feature_names_out()

In [18]:
data, features = sklearn_vectorizer(['doc1.txt','doc2.txt'],False)
pd.DataFrame(np.array(data.toarray()),columns=features)

Unnamed: 0,60m,and,at,athlete,been,birmingham,born,british,but,campaign,...,trailing,translate,was,week,which,win,won,world,year,years
0,0,0,1,0,1,0,0,1,0,1,...,0,0,1,0,1,1,0,0,0,0
1,1,1,1,1,0,1,1,0,1,0,...,1,1,0,1,0,0,1,1,1,1


In [19]:
data, features = sklearn_vectorizer(['doc1.txt','doc2.txt'],True)
pd.DataFrame(np.array(data.toarray()),columns=features)

Unnamed: 0,60m,and,at,athlete,been,birmingham,born,british,but,campaign,...,trailing,translate,was,week,which,win,won,world,year,years
0,0.0,0.0,0.11024,0.0,0.154939,0.0,0.0,0.154939,0.0,0.154939,...,0.0,0.0,0.154939,0.0,0.154939,0.154939,0.0,0.0,0.0,0.0
1,0.118397,0.118397,0.084241,0.118397,0.0,0.118397,0.118397,0.0,0.118397,0.0,...,0.118397,0.118397,0.0,0.118397,0.0,0.0,0.118397,0.118397,0.118397,0.118397


In [20]:
sklearn.metrics.pairwise_distances(data, metric="cosine")

array([[0.        , 0.72139823],
       [0.72139823, 0.        ]])

## Exercise 3

**Important**: the most flexible way to tackle the PRI project is to implement your own inverted index.

In [22]:
def inverted_index(filenames):
    """Build an in-memory inverted index over the given files.

    Each file is tokenized with ``token_counts_nltk``; its position in
    ``filenames`` is used as the document id. Returns a ``defaultdict(list)``
    mapping term -> list of ``(doc_id, count)`` postings, ordered by doc id.
    """
    postings = defaultdict(list)
    for doc_id, path in enumerate(filenames):
        for term, freq in token_counts_nltk(path).items():
            postings[term].append((doc_id, freq))
    return postings

In [23]:
myindex = inverted_index(['doc1.txt','doc2.txt'])
print(myindex)

defaultdict(<class 'list'>, {'British': [(0, 1)], 'hurdler': [(0, 1)], 'Sarah': [(0, 1)], 'Claxton': [(0, 2), (1, 2)], 'wa': [(0, 1)], 'confident': [(0, 1)], 'she': [(0, 1)], 'could': [(0, 2)], 'win': [(0, 1)], 'her': [(0, 2), (1, 1)], 'first': [(0, 2)], 'major': [(0, 1)], 'medal': [(0, 1), (1, 1)], 'at': [(0, 1), (1, 1)], 'European': [(0, 1), (1, 1)], 'indoor': [(0, 1)], 'championship': [(0, 1)], 'in': [(0, 2), (1, 2)], 'Madrid': [(0, 1)], '.': [(0, 2), (1, 3)], 'For': [(0, 1)], 'the': [(0, 2), (1, 6)], 'time': [(0, 1), (1, 1)], ',': [(0, 1), (1, 2)], 'ha': [(0, 1), (1, 2)], 'only': [(0, 1)], 'been': [(0, 1)], 'preparing': [(0, 1)], 'for': [(0, 1), (1, 1)], 'a': [(0, 1)], 'campaign': [(0, 1)], 'over': [(0, 1)], 'hurdle': [(0, 1), (1, 1)], '-': [(0, 1)], 'which': [(0, 1)], 'explain': [(0, 1)], 'leap': [(0, 1)], 'form': [(0, 1)], 'won': [(1, 1)], 'national': [(1, 1)], '60m': [(1, 1)], 'title': [(1, 1)], 'past': [(1, 1)], 'three': [(1, 1)], 'year': [(1, 2)], 'but': [(1, 1)], 'struggled':

## Exercises 4-5

In [27]:
def boolean_model(terms, index, docs=None):
    """Return the ids of the documents that contain every query term (AND).

    Parameters
    ----------
    terms : iterable of str
        Query terms. Each one is lemmatized with the notebook-level
        ``lemmatizer`` before lookup. The iterable is not modified.
    index : mapping
        term -> list of ``(doc_id, count)`` postings.
    docs : set, optional
        Candidate document ids to restrict the result to. ``None`` or an
        empty set means "no restriction" (existing callers pass ``set()``).
        The argument is never mutated.

    Returns
    -------
    set
        A new set with the ids of the documents matching all terms; empty
        when any term matches nothing or when there are no terms and no
        candidates.
    """
    # None marks "no constraint yet". It must stay distinct from an empty set:
    # once an intersection is empty, later terms must not repopulate it.
    matches = set(docs) if docs else None
    for term in terms:
        lemma = lemmatizer.lemmatize(term)
        # .get() avoids inserting empty posting lists into a defaultdict index.
        containing = {posting[0] for posting in index.get(lemma, ())}
        matches = containing if matches is None else matches & containing
        if not matches:
            break  # the conjunction can no longer be satisfied
    return set() if matches is None else matches

In [28]:
print(boolean_model(['medal'], myindex, set()))
print(boolean_model(['medal','spot'], myindex, set()))

{0, 1}
{1}


In [29]:
def TF_model(terms, index):
    """Score documents by the number of query terms they contain.

    Parameters
    ----------
    terms : iterable of str
        Query terms, looked up in ``index`` exactly as given.
    index : mapping
        term -> list of ``(doc_id, count)`` postings.

    Returns
    -------
    defaultdict(int)
        doc_id -> score; each posting of a query term adds 1 to its document.
        Documents matching no term are absent.
    """
    # NOTE(review): the per-document count stored in each posting is ignored;
    # if a true term-frequency score is intended, add posting[1] instead of 1.
    # NOTE(review): unlike boolean_model, terms are not lemmatized here, so
    # inflected query words may miss entries of a lemmatized index - confirm.
    scores = defaultdict(int)
    for term in terms:
        # .get() keeps unknown terms from being inserted into a defaultdict
        # index and also makes plain dict indexes work.
        for posting in index.get(term, ()):
            scores[posting[0]] += 1
    return scores

In [30]:
print(TF_model(['medal','spot'], myindex))

defaultdict(<class 'int'>, {0: 1, 1: 2})


## Exercise 6

**Important**: you can use whoosh for your PRI project, but keep in mind that its API may limit your degrees of freedom.<br>You can rewrite its sources, such as *whoosh/scoring.py*, if you need to customize your vector spaces.

In [31]:
def whoosh_indexing(filenames, index_dir='index_ir'):
    """Create a whoosh index over the given text files.

    Parameters
    ----------
    filenames : iterable of str
        Files to index. Each file's position is stored as its numeric ``id``
        and its full text is indexed in the ``content`` field.
    index_dir : str, optional
        Directory holding the index; created if missing. Defaults to
        'index_ir', the directory this function always used before.

    Returns
    -------
    The index object returned by ``index.create_in``.

    Raises
    ------
    OSError
        If the directory cannot be created or a file cannot be read. In the
        latter case the pending write is cancelled rather than committed.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(index_dir, exist_ok=True)
    schema = fields.Schema(id=fields.NUMERIC(stored=True), content=fields.TEXT)
    ix = index.create_in(index_dir, schema)
    # As a context manager the writer commits on success and cancels on error,
    # so a failed read does not leave the index write lock held.
    with ix.writer() as writer:
        for identifier, path in enumerate(filenames):
            with open(path, 'r') as f:
                writer.add_document(id=identifier, content=f.read())
    return ix

In [32]:
ix = whoosh_indexing(['doc1.txt','doc2.txt'])

In [33]:
with ix.searcher().reader() as ixreader:
    print('#docs:', ixreader.doc_count_all())
    print('CF(has):', ixreader.frequency(fieldname='content',text='has'))
    print('DF(has):', ixreader.doc_frequency(fieldname='content',text='has'))
    for posting in ixreader.postings("content", "has").items_as('frequency'):
        print('TF_{has,'+str(posting[0])+'}:',posting[1])

#docs: 2
CF(has): 3.0
DF(has): 2
TF_{has,0}: 1
TF_{has,1}: 2


In [34]:
def whosh_query(query, index_dir="index_ir"):
    """Run a free-text query against the whoosh index in ``index_dir``.

    Parameters
    ----------
    query : str
        Query text, parsed against the "content" field; terms are combined
        with ``qparser.OrGroup``.
    index_dir : str, optional
        Directory of the index to open. Defaults to "index_ir", the directory
        this function always used before.

    Returns
    -------
    The object returned by ``searcher.search`` for the parsed query.
    """
    ix = index.open_dir(index_dir)
    parser = qparser.QueryParser("content", ix.schema, group=qparser.OrGroup)
    # NOTE(review): the searcher is closed when this ``with`` block exits on
    # return, so the results outlive their searcher. The notebook only reads
    # ``.items()`` from them; confirm before using anything that may need the
    # searcher to still be open (e.g. stored fields of a hit).
    with ix.searcher() as searcher:
        return searcher.search(parser.parse(query))

In [35]:
results = whosh_query("Claxton")
for doc, score in results.items():
    print("ID:",doc," score:",score)

ID: 0  score: 0.85080415159161
ID: 1  score: 0.776043291106279
