# Search Project for CST 495

> CMU Movie Summary Corpus
http://www.cs.cmu.edu/~ark/personas/

We will be using Spark, so the first step is to ensure we have installed the module.

In [1]:
! pip install findspark

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# First we specify the path to spark
import findspark
import os
findspark.init(os.getenv('HOME') + '/spark-1.6.0-bin-hadoop2.6')
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'

# Now we can import pyspark and get the spark context
# - spark context is the entry point to spark for a spark application
import pyspark
try: 
    print(sc)
except NameError:
    sc = pyspark.SparkContext()
    print(sc)

<pyspark.context.SparkContext object at 0x10b95f390>


# Resilient Distributed Dataset (RDD)

From the Spark documentation:

_"A Resilient Distributed Dataset (RDD), the basic abstraction in Spark, represents an immutable, partitioned collection of elements that can be operated on in parallel."_

_"Parallelized collections are created by calling SparkContext’s parallelize method on an existing iterable or collection in your driver program. The elements of the collection are copied to form a distributed dataset that can be operated on in parallel."_ 

In [3]:
# creating an RDD

rdd = sc.textFile(os.getcwd()+'/data/MovieSummaries/plot_summaries.txt')
print(rdd)

rdd.take(3)

MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:-2


[u"23890098\tShlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",
 u'31186339\tThe nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl  between the ages of 12 and 18 selected by lottery  for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker\'s son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special academ

> **Counting words**

In [4]:
words_per_line = rdd.map(lambda s: len(s.split())).filter(lambda x : x > 2)

total_words = words_per_line.reduce(lambda x,y : x+y)

print(total_words)

13187557


> Term Frequency

In [5]:
# use Spark API for optimization
import re

def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())
    #return re.compile(r'\b[a-zA-Z]+\b').split(text.lower())

def toCSVLine(data):
    return ','.join(str(d) for d in data)

rdd = sc.textFile(os.getcwd()+'/data/MovieSummaries/plot_summaries.txt')
rdd.cache()
#words = rdd.flatMap(lambda x: x.split())
words = rdd.flatMap(normalizeWords)
wordCounts = words.countByValue()

wordCounts = words.map(lambda x: (x,1)).reduceByKey(lambda x, y: x + y)
wordCountsSorted = wordCounts.map(lambda (x,y) : (y,x)).sortByKey()
results = wordCountsSorted.collect()

import csv
with open(os.getcwd() + '/data/MovieSummaries/plot_sum.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ')#,
                            #quotechar=' ', quoting=csv.QUOTE_MINIMAL)

    for result in results:
        count = str(result[0])
        word = result[1].encode('ascii', 'ignore')
        
        if(word and int(count)>10000): # (word.isdigit()):# and int(count)<2):
                print word + ":\t\t" + count

                #limit csv filr for now
                spamwriter.writerow([count] + [","] + [word])
          

         
            

if:		10028
because:		10041
friends:		10043
men:		10086
make:		10134
next:		10154
killed:		10181
daughter:		10181
just:		10203
both:		10214
becomes:		10333
money:		10351
woman:		10359
death:		10562
begins:		10564
way:		10683
story:		10770
some:		10812
friend:		10844
no:		11190
what:		11317
more:		11359
old:		11428
night:		11863
son:		11982
now:		12038
tries:		12076
help:		12091
gets:		12210
during:		12351
first:		12642
take:		12676
through:		12775
wife:		12903
young:		12992
away:		13114
down:		13489
himself:		13534
police:		13607
there:		13920
takes:		13941
t:		13987
day:		14085
goes:		14219
go:		14395
over:		14967
mother:		15452
can:		15479
being:		15507
so:		15623
later:		15624
finds:		15738
however:		16051
find:		16141
house:		16180
also:		16376
family:		16573
home:		16950
before:		16971
been:		17044
get:		17211
only:		17348
new:		17374
other:		17377
love:		17449
life:		17823
off:		18238
time:		18280
had:		18321
tells:		19108
man:		19941
film:		21433
father:		21512
will:		21969
two:	

> Testing the Spark DataFrames API with the Data

In [6]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.read.format('com.databricks.spark.csv').options(header='false', inferSchema='true').load(os.getcwd() 
        + '/data/MovieSummaries/plot_sum.csv').selectExpr("C0 as id","C1 as words")

df.show()

+-------+---------+
|     id|    words|
+-------+---------+
|10028.0|       if|
|10041.0|  because|
|10043.0|  friends|
|10086.0|      men|
|10134.0|     make|
|10154.0|     next|
|10181.0|   killed|
|10181.0| daughter|
|10203.0|     just|
|10214.0|     both|
|10333.0|  becomes|
|10351.0|    money|
|10359.0|    woman|
|10562.0|    death|
|10564.0|   begins|
|10683.0|      way|
|10770.0|    story|
|10812.0|     some|
|10844.0|   friend|
|11190.0|       no|
+-------+---------+
only showing top 20 rows



In [7]:
df.schema

StructType(List(StructField(id,DoubleType,true),StructField(words,StringType,true)))

In [8]:
sqlContext.registerDataFrameAsTable(df,'plotTerms')
sqlContext.tableNames()

sqlContext.sql("select * from plotTerms order by id limit 20").show()

+-------+---------+
|     id|    words|
+-------+---------+
|10028.0|       if|
|10041.0|  because|
|10043.0|  friends|
|10086.0|      men|
|10134.0|     make|
|10154.0|     next|
|10181.0|   killed|
|10181.0| daughter|
|10203.0|     just|
|10214.0|     both|
|10333.0|  becomes|
|10351.0|    money|
|10359.0|    woman|
|10562.0|    death|
|10564.0|   begins|
|10683.0|      way|
|10770.0|    story|
|10812.0|     some|
|10844.0|   friend|
|11190.0|       no|
+-------+---------+



In [9]:
sqlContext.tableNames()

[u'plotTerms']

In [10]:
df.take(10)

[Row(id=10028.0, words=u' if'),
 Row(id=10041.0, words=u' because'),
 Row(id=10043.0, words=u' friends'),
 Row(id=10086.0, words=u' men'),
 Row(id=10134.0, words=u' make'),
 Row(id=10154.0, words=u' next'),
 Row(id=10181.0, words=u' killed'),
 Row(id=10181.0, words=u' daughter'),
 Row(id=10203.0, words=u' just'),
 Row(id=10214.0, words=u' both')]

> **Inverted Index**

In [11]:
index = df.flatMap(lambda row : [ ( word, row[0]) for word in row[1].strip().split(' ') ] ) 
index.take(20)

[(u'if', 10028.0),
 (u'because', 10041.0),
 (u'friends', 10043.0),
 (u'men', 10086.0),
 (u'make', 10134.0),
 (u'next', 10154.0),
 (u'killed', 10181.0),
 (u'daughter', 10181.0),
 (u'just', 10203.0),
 (u'both', 10214.0),
 (u'becomes', 10333.0),
 (u'money', 10351.0),
 (u'woman', 10359.0),
 (u'death', 10562.0),
 (u'begins', 10564.0),
 (u'way', 10683.0),
 (u'story', 10770.0),
 (u'some', 10812.0),
 (u'friend', 10844.0),
 (u'no', 11190.0)]

In [12]:
index = df.flatMap(lambda row : [ (word,  row[0]) for word in row[1].split(' ') ] ).groupByKey()
index.take(10)

[(u'', <pyspark.resultiterable.ResultIterable at 0x11553f890>),
 (u'and', <pyspark.resultiterable.ResultIterable at 0x11553fa50>),
 (u'all', <pyspark.resultiterable.ResultIterable at 0x11553fa90>),
 (u'old', <pyspark.resultiterable.ResultIterable at 0x11553fad0>),
 (u'family', <pyspark.resultiterable.ResultIterable at 0x11553fb10>),
 (u'being', <pyspark.resultiterable.ResultIterable at 0x11553fb50>),
 (u'father', <pyspark.resultiterable.ResultIterable at 0x11553fb90>),
 (u'over', <pyspark.resultiterable.ResultIterable at 0x11553fbd0>),
 (u'some', <pyspark.resultiterable.ResultIterable at 0x11553fc10>),
 (u'them', <pyspark.resultiterable.ResultIterable at 0x11553fc50>)]

In [13]:
index = df.flatMap(lambda row : [ (word,  row[0]) for word in row[1].split(' ') ] ).groupByKey().map(lambda x : (x[0], list(x[1])))
index.filter(lambda x : x[0] == 'father').collect()

[(u'father', [21512.0])]

In [14]:
index = df.flatMap(lambda row : [ (word,  row[0]) for word in row[1].split(' ') ] ).groupByKey().map(lambda x : (x[0], list(x[1])))
index.filter(lambda x : x[0] == 'mother').collect()

[(u'mother', [15452.0])]

# Spark DataFrames API

In [15]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
mov_df = sqlContext.read.format('com.databricks.spark.csv').options(delimiter='\t', header='false', inferSchema='true').load(os.getcwd() 
        + '/data/MovieSummaries/movie.metadata.tsv').selectExpr("C0 as wiki_id","C2 as movie_title", "C3 as release_date", "C4 as box_office_rev", "C5 as runtime"
                                                               ,"C6 as languages", "C7 as countries")

mov_df.show()

+--------+--------------------+------------+--------------+-------+--------------------+--------------------+
| wiki_id|         movie_title|release_date|box_office_rev|runtime|           languages|           countries|
+--------+--------------------+------------+--------------+-------+--------------------+--------------------+
|  975900|      Ghosts of Mars|  2001-08-24|      14010832|   98.0|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
| 3196793|Getting Away with...|  2000-02-16|          null|   95.0|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
|28463795|         Brun bitter|        1988|          null|   83.0|{"/m/05f_3": "Nor...|{"/m/05b4w": "Nor...|
| 9363483|    White Of The Eye|        1987|          null|  110.0|{"/m/02h40lc": "E...|{"/m/07ssc": "Uni...|
|  261236|   A Woman in Flames|        1983|          null|  106.0|{"/m/04306rv": "G...|{"/m/0345h": "Ger...|
|13696889|       The Gangsters|  1913-05-29|          null|   35.0|{"/m/06ppq": "Sil...|{"/m/09c7w0": "Un...|
|18998739|

In [16]:
plot_df = sqlContext.read.format('com.databricks.spark.csv').options(delimiter="\t", header='false', inferSchema='true').load(os.getcwd() 
        + '/data/MovieSummaries/plot_summaries.txt').selectExpr("C0 as wiki_id", "C1 as plot")

plot_df.show()

+--------+--------------------+
| wiki_id|                plot|
+--------+--------------------+
|23890098|Shlykov, a hard-w...|
|31186339|The nation of Pan...|
|20663735|Poovalli Induchoo...|
| 2231378|The Lemon Drop Ki...|
|  595909|Seventh-day Adven...|
| 5272176|The president is ...|
| 1952976|{{plot}} The film...|
|24225279|The story begins ...|
| 2462689|Infuriated at bei...|
|20532852|A line of people ...|
|15401493|Lola  attempts to...|
|18188932|Milan and Goran a...|
| 2940516|Bumbling pirate c...|
| 1335380|The film is based...|
| 1480747|{{plot}} Followin...|
|24448645|Despite Lucy's re...|
|15072401|Alan Colby, heir ...|
| 4018288|Debbie's favorite...|
| 4596602|Ashes to Ashes is...|
|15224586|The film follows ...|
+--------+--------------------+
only showing top 20 rows



In [17]:
# turn movie data frame into table
sqlContext.registerDataFrameAsTable(mov_df,'movieMeta')
sqlContext.tableNames()

sqlContext.sql("select * from movieMeta order by release_date desc limit 20").show()

+--------+--------------------+------------+--------------+-------+--------------------+--------------------+
| wiki_id|         movie_title|release_date|box_office_rev|runtime|           languages|           countries|
+--------+--------------------+------------+--------------+-------+--------------------+--------------------+
|23124260|  Jeepers Creepers 4|  2016-06-08|          null|   null|                  {}|{"/m/09c7w0": "Un...|
|27554912|     Kung Fu Panda 3|  2016-03-18|          null|   null|                  {}|                  {}|
| 3139382|        Battle Angel|        2016|          null|   null|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
|24179103|          I, Robot 2|        2015|          null|   null|                  {}|                  {}|
|32416903|       Total Dhamaal|        2015|          null|  138.0|{"/m/03k50": "Hin...|                  {}|
|25813358|            Avatar 2|        2015|          null|   null|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
| 2312141|

In [18]:
sqlContext.registerDataFrameAsTable(plot_df,'plotTerms')
sqlContext.tableNames()

sqlContext.sql("select * from plotTerms order by wiki_id limit 20").show()

+-------+--------------------+
|wiki_id|                plot|
+-------+--------------------+
|    330|In order to prepa...|
|   3217|After being pulle...|
|   3333| The film follows...|
|   3746|{{Hatnote}} In Lo...|
|   3837|In the American O...|
|   3947|Jeffrey Beaumont ...|
|   4227| :By What Means R...|
|   4231|Buffy Summers  is...|
|   4560|In the 13th centu...|
|   4726|As a child, Bruce...|
|   4727|When Batman  and ...|
|   4728|A deformed baby b...|
|   4729| In Gotham City, ...|
|   4730| In Gotham City, ...|
|   5035|Dr. Eric Vornoff ...|
|   5224|Charles Foster Ka...|
|   5313|Li Mu Bai  is an ...|
|   5729|In 1919, Harold A...|
|   7906|Saloon owner Kent...|
|   8481|Kate Miller is a ...|
+-------+--------------------+



In [19]:
sqlContext.sql("select * from movieMeta where wiki_id >4725 and wiki_id <4731 limit 20").show()

+-------+--------------+------------+--------------+-------+--------------------+--------------------+
|wiki_id|   movie_title|release_date|box_office_rev|runtime|           languages|           countries|
+-------+--------------+------------+--------------+-------+--------------------+--------------------+
|   4727|        Batman|  1966-07-30|          null|  104.0|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
|   4729|Batman & Robin|  1997-06-12|     238207122|  124.0|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
|   4728|Batman Returns|  1992-06-16|     266822354|  126.0|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
|   4730|Batman Forever|  1995-06-09|     336529844|  122.0|{"/m/02h40lc": "E...|{"/m/09c7w0": "Un...|
|   4726|        Batman|  1989-06-19|     411348924|  126.0|{"/m/064_8sq": "F...|{"/m/09c7w0": "Un...|
+-------+--------------+------------+--------------+-------+--------------------+--------------------+



In [20]:
new_df = sqlContext.sql("select movie_title, words from movieMeta left outer join plotTerms")

sqlContext.registerDataFrameAsTable(new_df,'titleWord')
sqlContext.tableNames()

AnalysisException: u"cannot resolve 'words' given input columns wiki_id, release_date, movie_title, wiki_id, languages, runtime, box_office_rev, countries, plot;"

In [21]:
rdd = plot_df.rdd

rdd.take(1)

[Row(wiki_id=23890098, plot=u"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.")]

In [22]:
index = rdd.flatMap(lambda row : [ ( word, row[0]) for word in row[1].split(' ') ] ) 
index.take(50)

[(u'Shlykov,', 23890098),
 (u'a', 23890098),
 (u'hard-working', 23890098),
 (u'taxi', 23890098),
 (u'driver', 23890098),
 (u'and', 23890098),
 (u'Lyosha,', 23890098),
 (u'a', 23890098),
 (u'saxophonist,', 23890098),
 (u'develop', 23890098),
 (u'a', 23890098),
 (u'bizarre', 23890098),
 (u'love-hate', 23890098),
 (u'relationship,', 23890098),
 (u'and', 23890098),
 (u'despite', 23890098),
 (u'their', 23890098),
 (u'prejudices,', 23890098),
 (u'realize', 23890098),
 (u'they', 23890098),
 (u"aren't", 23890098),
 (u'so', 23890098),
 (u'different', 23890098),
 (u'after', 23890098),
 (u'all.', 23890098),
 (u'The', 31186339),
 (u'nation', 31186339),
 (u'of', 31186339),
 (u'Panem', 31186339),
 (u'consists', 31186339),
 (u'of', 31186339),
 (u'a', 31186339),
 (u'wealthy', 31186339),
 (u'Capitol', 31186339),
 (u'and', 31186339),
 (u'twelve', 31186339),
 (u'poorer', 31186339),
 (u'districts.', 31186339),
 (u'As', 31186339),
 (u'punishment', 31186339),
 (u'for', 31186339),
 (u'a', 31186339),
 (u'past

In [23]:
index = rdd.flatMap(lambda row : [ (word,  row[0]) for word in row[1].split(' ') ] ) \
            .groupByKey()
index.take(10)

[(u'', <pyspark.resultiterable.ResultIterable at 0x11410a350>),
 (u'considered,', <pyspark.resultiterable.ResultIterable at 0x11410a510>),
 (u'milestone;', <pyspark.resultiterable.ResultIterable at 0x11410a550>),
 (u'"humble,', <pyspark.resultiterable.ResultIterable at 0x11410a590>),
 (u'Keach', <pyspark.resultiterable.ResultIterable at 0x11410a5d0>),
 (u'1,800', <pyspark.resultiterable.ResultIterable at 0x11410a610>),
 (u'grimaces,', <pyspark.resultiterable.ResultIterable at 0x11410a650>),
 (u'transend', <pyspark.resultiterable.ResultIterable at 0x11410a690>),
 (u'|Jimmy', <pyspark.resultiterable.ResultIterable at 0x11410a6d0>),
 (u'machine],', <pyspark.resultiterable.ResultIterable at 0x11410a710>)]

In [None]:
index = rdd.flatMap(lambda row : [ (word,  row[0]) for word in row[1].split(' ') ] ) \
            .groupByKey() \
            .map(lambda x : (x[0], list(x[1]))).cache()

In [None]:
indices = index.filter(lambda x : x[0] == 'green').take(10)
tup = tuple(indices[0][1])

In [None]:
sqlContext.sql("select movie_title from movieMeta where wiki_id in " + str(tup) + " order by wiki_id ").show()

# Let's try this again

>this time we will bring it back to the basics. We will normalise the text by removing unwanted characters and converting to lowercase

In [14]:
import csv
import re

with open("data/MovieSummaries/plot_summaries.tsv") as f:
    r = csv.reader(f, delimiter='\t', quotechar='"')
    tag = re.compile(r'\b[0-9]+\b')
    rgx = re.compile(r'\b[a-zA-Z]+\b')
    #docs = [ (' '.join(re.findall(tag, x[0])).lower(), ' '.join(re.findall(rgx, x[1])).lower()) for i,x in enumerate(r) if r>1 ]
    docs= {}
    for i,x in enumerate(r):
        if i >1:
            docs[' '.join(re.findall(tag, x[0])).lower()] = ' '.join(re.findall(rgx, x[1])).lower()
#print(docs[0][0], docs[0][1])    

#item_t = [ d[0] for d in docs ] # item titles
#tem_d = [ d[1] for d in docs ] # item description
#item_i = range(0 , len(item_t)) # item id





> now to normalize the movie meta data to swap the item titles with index from above

** just the basics for now to get index, possibility to get genre if needed **

In [15]:
import csv
import re

with open("data/MovieSummaries/movie.metadata.tsv") as f:
    r = csv.reader(f, delimiter='\t', quotechar='"')
    tag = re.compile(r'\b[0-9]+\b')
    rgx = re.compile(r'\b[a-zA-Z]+\b')
    #docs2 = [ (' '.join(re.findall(tag, x[0])).lower(), ' '.join(re.findall(rgx, x[2])).lower()) for i,x in enumerate(r) if r>1 ]
    docs2= {}
    for i,x in enumerate(r):
        if i >1:
            docs2[' '.join(re.findall(tag, x[0])).lower()] = ' '.join(re.findall(rgx, x[2])).lower()
            
#print(docs2)

> now is the time to join the docs together

In [16]:
doc = [(docs2.get(x), y) for x, y in docs.items() if docs2.get(x)]



# for testing
# import random
 #print doc[random.randint(0, len(doc)-1)]
print doc[0][0], doc[0][1]

items_t = [ d[0] for d in doc ] # item titles
items_d = [ d[1] for d in doc ] # item description
items_i = range(0 , len(items_t)) # item id



periya idathu penn murugappa is a small time farm labourer who lives with his widowed sister gangamma in a village pillaival is the zamindar of the village and sabapathy and punitha are his children punitha is studying in college in a nearby town while sabapathy is not educated both the father and the children are both arrogant about their wealth and try to rule the villagers murugappa tries to question their authority and this leads to frequent clashes with the zamindar s family pichandi is a wealthy college mate of punitha who is crazy about her sabapathy falls in love with thillaiammal who has been informally enagaged to murugappa for a long time both pillaival and gangamma propose for her on the same day to avoid a direct clash with the zamindar her father says that he took a vow that his daughter would marry the winner of a silambam competition punitha promises to marry pichandi if he dopes a drink which murugappa drinks during the fight sabapathy wins the fight and marries thilak

# term freq

In [17]:
corpus = items_d[0:25]
print corpus

['murugappa is a small time farm labourer who lives with his widowed sister gangamma in a village pillaival is the zamindar of the village and sabapathy and punitha are his children punitha is studying in college in a nearby town while sabapathy is not educated both the father and the children are both arrogant about their wealth and try to rule the villagers murugappa tries to question their authority and this leads to frequent clashes with the zamindar s family pichandi is a wealthy college mate of punitha who is crazy about her sabapathy falls in love with thillaiammal who has been informally enagaged to murugappa for a long time both pillaival and gangamma propose for her on the same day to avoid a direct clash with the zamindar her father says that he took a vow that his daughter would marry the winner of a silambam competition punitha promises to marry pichandi if he dopes a drink which murugappa drinks during the fight sabapathy wins the fight and marries thilakam punitha goes b

>> start by computing frequncy of entire corpus

In [18]:
tf = {}
for doc in corpus:
    for word in doc.split():
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
print(tf)

{'baskar': 1, 'advices': 1, 'demanded': 1, 'protest': 1, 'captain': 3, 'offenses': 1, 'disability': 1, 'pensions': 1, 'bike': 1, 'under': 2, 'teaching': 1, 'merchant': 1, 'lack': 2, 'rise': 1, 'connects': 1, 'every': 1, 'confederate': 1, 'stabbed': 1, 'four': 1, 'school': 2, 'prize': 1, 'skills': 1, 'triumph': 1, 'force': 1, 'warns': 1, 'direct': 1, 'preacher': 1, 'second': 1, 'persuade': 1, 'even': 1, 'ruthless': 1, 'ned': 1, 'beaten': 1, 'corporation': 1, 'new': 8, 'increasing': 1, 'ever': 3, 'told': 2, 'hero': 1, 'whose': 1, 'men': 6, 'met': 2, 'protection': 1, 'china': 1, 'daughter': 7, 'employees': 1, 'pillaival': 8, 'browsing': 1, 'military': 1, 'changes': 2, 'golden': 1, 'secure': 1, 'amirthalingam': 2, 'brought': 1, 'guests': 1, 'tutelage': 1, 'unit': 1, 'sarah': 4, 'would': 3, 'army': 3, 'handedness': 1, 'chooses': 1, 'call': 2, 'survive': 2, 'tell': 1, 'coffins': 1, 'holy': 3, 'successful': 1, 'brings': 1, 'aware': 1, 'warn': 1, 'phone': 1, 'lord': 1, 'must': 4, 'shoot': 1, '

>> now that we have normailised the data we can compute the term frequency


In [19]:
from collections import Counter

def get_tf(corpus):
    tf = Counter()
    for doc in corpus:
        for word in doc.split():
            tf[word] += 1
    return tf

tf = get_tf(corpus)
print(tf)
    

Counter({'the': 332, 'to': 209, 'and': 184, 'a': 161, 'of': 129, 'his': 128, 'is': 120, 'in': 101, 'he': 82, 's': 72, 'with': 55, 'that': 50, 'for': 47, 'who': 35, 'as': 35, 'her': 34, 'at': 30, 'drake': 30, 'him': 30, 'an': 30, 'but': 30, 'their': 29, 'on': 28, 'has': 27, 'by': 26, 'they': 24, 'after': 21, 'it': 20, 'from': 20, 'are': 19, 'doughty': 19, 'when': 18, 'two': 17, 'william': 17, 'be': 16, 'father': 15, 'punitha': 14, 'into': 14, 'rudy': 14, 'while': 14, 'home': 14, 'one': 13, 'family': 13, 'colin': 13, 'find': 13, 'up': 13, 'she': 13, 'other': 13, 'was': 12, 'quantrill': 12, 'yuma': 12, 'wife': 12, 'love': 11, 'only': 11, 'get': 11, 'sister': 11, 'son': 10, 'not': 10, 'jp': 10, 'notre': 10, 'dame': 10, 'about': 10, 'death': 9, 'marcus': 9, 'during': 9, 'new': 8, 'pillaival': 8, 'each': 8, 'down': 8, 'time': 8, 'murugappa': 8, 'goes': 8, 'man': 8, 'falls': 8, 'sabapathy': 8, 'pichandi': 8, 'tries': 8, 'return': 8, 'both': 8, 'village': 8, 'out': 8, 'final': 8, 'which': 8, '

# doc freq
> 

In [20]:
import collections

def get_tf(document):
    tf = Counter()
    for word in document.split():
        tf[word] += 1
    return tf

def get_dtf(corpus):
    dtf = {}
    for i,doc in enumerate(corpus):
        dtf[i]= get_tf(doc)
    return dtf

dtf = get_dtf(items_t)
dtf[342]

Counter({'and': 1, 'give': 1, 'tyke': 1})

> compute dtf for item descriptions

In [21]:
dtf = get_dtf(items_d)
dtf[12]

Counter({'a': 10,
         'ability': 1,
         'accept': 1,
         'affection': 1,
         'after': 1,
         'against': 1,
         'an': 1,
         'and': 6,
         'are': 1,
         'aristocrat': 1,
         'aristocratic': 1,
         'aristocrats': 2,
         'army': 1,
         'as': 3,
         'at': 3,
         'aware': 1,
         'bankrupt': 1,
         'because': 1,
         'becomes': 2,
         'begins': 1,
         'bulgaria': 1,
         'business': 2,
         'but': 3,
         'cinema': 1,
         'cki': 1,
         'com': 1,
         'comes': 1,
         'company': 1,
         'condescended': 1,
         'consents': 1,
         'database': 1,
         'daughter': 1,
         'descendant': 1,
         'devotion': 1,
         'distressed': 1,
         'dreaming': 1,
         'during': 1,
         'edu': 1,
         'end': 1,
         'enterprising': 1,
         'eventual': 1,
         'exile': 2,
         'failed': 1,
         'falling': 1,
         'fam

# term freq matrix

In [22]:
def get_lexicon(corpus):
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split()])
    return list(lexicon)

test_corpus = ['mountain bike', 'road bike carbon', 'bike helmet']
lexicon = get_lexicon(test_corpus)
print lexicon

['mountain', 'helmet', 'bike', 'road', 'carbon']


> with the lexicon we are able to compute the term freq matrix

In [23]:
def get_tfm(corpus):
    
    def get_lexicon(corpus):
        lexicon = set()
        for doc in corpus:
            lexicon.update([word for word in doc.split()])
        return list(lexicon)
    
    lexicon = get_lexicon(corpus)
    
    tfm =[]
    for doc in corpus:
        tfv = [0]*len(lexicon)
        for term in doc.split():
            tfv[lexicon.index(term)] += 1
    
        tfm.append(tfv)
    
    return tfm, lexicon

test_corpus = ['mountain bike', 'road bike carbon', 'bike helmet']
tfm, lexicon = get_tfm(test_corpus)
print lexicon
print tfm


    

['mountain', 'helmet', 'bike', 'road', 'carbon']
[[1, 0, 1, 0, 0], [0, 0, 1, 1, 1], [0, 1, 1, 0, 0]]


# sparsity of term frequency matrix

In [24]:
! pip install bokeh

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [25]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show, vplot

# sparsity as a function of document count
n = []
s = []
for i in range(100,1000,100):
    corpus = items_t[0:i]
    tfm, lexicon = get_tfm(corpus)
    c = [ [x.count(0), x.count(1)] for x in tfm]
    n_zero = sum([ y[0] for y in c])
    n_one = sum( [y[1] for y in c])
    s.append(1.0 - (float(n_one) / (n_one + n_zero)))
    n.append(i)
    
output_notebook(hide_banner=True)
p = figure(x_axis_label='Documents', y_axis_label='Sparsity', plot_width=400, plot_height=400)
p.line(n, s, line_width=2)
p.circle(n, s, fill_color="white", size=8)
show(p)

# boolean search


We are now in a position to write our first ranking function.  Now we have the term frequency matrix we can use it to find documents that contain words included in a user specified query.  We will start by simply returning the documents from the corpus that match any terms in the query and rank by the raw frequency of matching terms. 

More specifically our algorithm for 'boolean search' proceeds as follows:

* Compute the lexicon for the corpus
* Compute the term frequency matrix for the corpus
* Convert query to query vector using the same lexicon 
* Compare each documents term frequncy vector to the query vector - specifically for each document in the corpus:
    * Compute a ranking score for each document by taking the [dot product](https://en.wikipedia.org/wiki/Dot_product) of the document's term frequency vector and the query vector
* Sort the documents by ranking score

In [None]:
test_corpus = ['mountain bike red', 'road bike carbon', 'bike helmet']
print test_corpus

# compute term frequency matrix and lexicon
tfm, lexicon = get_tfm(corpus)

print tfm
print lexicon

# define our query
qry = 'red bike'

# convert query to query vector using lexicon
qrv = [0]*len(lexicon)
for term in qry.split():
    if term in lexicon:
        qrv[lexicon.index(term)] = 1

print qrv

# compare query vector to each term frequency vector
# this is dot product between qrv and each row of tfm
for i,tfv in enumerate(tfm):
    print i, sum([ xy[0] * xy[1] for xy in zip(qrv, tfv) ])

['mountain bike red', 'road bike carbon', 'bike helmet']
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

The function definition get_results_tf() computes the document ranking score for each document in the term frequency matrix

In [None]:
def get_results_tf(qry, tfm, lexicon):
    qrv =[0]*len(lexicon)
    for term in qry.split():
        if term in lexicon:
            qrv[lexicon.index(term)] = 1
            
    results = []
    for i, tfv in enumerate(tfm):
        score = 0
        score = sum([ xy[0] * xy[1] for xy in zip(qrv,tfv)])
        results.append([score, i])
    
    sorted_results = sorted(results, key=lambda t: t[0] * -1)
    return sorted_results


def print_results(results,n, head=True):
    ''' Helper function to print results
    '''
    if head:    
        print('\nTop %d from recall set of %d items:' % (n,len(results)))
        for r in results[:n]:
            print('\t%0.2f - %s'%(r[0],items_t[r[1]]))
    else:
        print('\nBottom %d from recall set of %d items:' % (n,len(results)))
        for r in results[-n:]:
            print('\t%0.2f - %s'%(r[0],items_t[r[1]]))
    

tfm, lexicon = get_tfm(items_d)
results = get_results_tf('kids save friends', tfm , lexicon)
print_results(results,10)