In [1]:
import requests
import json

from ltr.client.solr_client import SolrClient

client = SolrClient()
host = client.get_host()

## Download, reindex...

1. Download the corpus & judgments
2. Rebuild the index from the tmdb solr config
3. Reindex movies loaded from the corpus

In [2]:
from ltr import download

tmdb_corpus='http://es-learn-to-rank.labs.o19s.com/tmdb_ai_pow_search.json'
judgments='http://es-learn-to-rank.labs.o19s.com/title_judgments_binary.txt'
download([tmdb_corpus, judgments], dest='data/');

data/tmdb_ai_pow_search.json already exists
data/title_judgments_binary.txt already exists


In [3]:
from ltr.index import rebuild
from ltr.helpers.movies import indexable_movies
movies=indexable_movies(movies='data/tmdb_ai_pow_search.json')
rebuild(client, index='tmdb', doc_src=movies)

Reconfig from disk...
Deleted index tmdb [Status: 400]
{
  "responseHeader":{
    "status":400,
    "QTime":16},
  "error":{
    "metadata":[
      "error-class","org.apache.solr.common.SolrException",
      "root-error-class","org.apache.solr.common.SolrException"],
    "msg":"Cannot unload non-existent core [tmdb]",
    "code":400}}

Created index tmdb [Status: 200]
Reindexing...
Indexed 0 movies (last The Elusive Corporal)
Indexed 100 movies (last Mom and Dad)
Indexed 200 movies (last The Belle of New York)
Indexed 300 movies (last The Song of Bernadette)
Indexed 400 movies (last That Funny Feeling)
Indexed 500 movies (last Gold Diggers of 1935)
Indexed 600 movies (last WWE WrestleMania IV)
Indexed 700 movies (last The Trespasser)
Indexed 800 movies (last WWE: Hollywood Hulk Hogan - Hulk Still Rules)
Indexed 900 movies (last Bomba On Panther Island)
Indexed 1000 movies (last Redeemer)
Indexed 1100 movies (last Convict Cowboy)
Indexed 1200 movies (last The VelociPastor)
Indexed 1300 

Done [Status: 200]
Indexed 20000 movies (last Dreammaster: The Erotic Invader)
Indexed 20100 movies (last Spider-Man: Into the Spider-Verse)
Indexed 20200 movies (last Hell's Heroes)
Indexed 20300 movies (last Dying of the Light)
Indexed 20400 movies (last Midnight Sun)
Indexed 20500 movies (last The Angel Levine)
Indexed 20600 movies (last The Gamechangers)
Indexed 20700 movies (last The Bray Road Beast)
Indexed 20800 movies (last The Escort)
Indexed 20900 movies (last Darna vs. The Planet Women)
Indexed 21000 movies (last Big Easy Express)
Indexed 21100 movies (last In Security)
Indexed 21200 movies (last Almost There)
Indexed 21300 movies (last Mannequin)
Indexed 21400 movies (last Charlie Chan at the Race Track)
Indexed 21500 movies (last The Return of the Whistler)
Indexed 21600 movies (last Doctor Detroit)
Indexed 21700 movies (last Lil' Pimp)
Indexed 21800 movies (last Captain Horatio Hornblower R.N.)
Indexed 21900 movies (last Mad Mom)
Indexed 22000 movies (last Mean Girls 2)
I

Done [Status: 200]
Indexed 40000 movies (last The Accountant of Auschwitz)
Indexed 40100 movies (last The Photograph)
Indexed 40200 movies (last Banana Split)
Indexed 40300 movies (last Rites of Blood)
Indexed 40400 movies (last Rosie)
Indexed 40500 movies (last The Man with the Iron Fists)
Indexed 40600 movies (last A Wish Come True)
Indexed 40700 movies (last Sudden Death)
Indexed 40800 movies (last Never Trust a Gambler)
Indexed 40900 movies (last Othello)
Indexed 41000 movies (last Rusty: A Dog's Tale)
Indexed 41100 movies (last Kingpin)
Indexed 41200 movies (last The Two Mr. Kissels)
Indexed 41300 movies (last Surrounded)
Indexed 41400 movies (last Now & Later)
Indexed 41500 movies (last Dead Heat on a Merry-Go-Round)
Indexed 41600 movies (last AS:VS Back in Business)
Indexed 41700 movies (last Femme Fatale)
Indexed 41800 movies (last The Full Monty)
Indexed 41900 movies (last The Dentist)
Indexed 42000 movies (last Home Alone 3)
Indexed 42100 movies (last To Live and Die in L.A.)

Done [Status: 200]
Indexed 60000 movies (last Bela Kiss: Prologue)
Indexed 60100 movies (last Space)
Indexed 60200 movies (last Throw Down)
Indexed 60300 movies (last I am Von Höfler Variation on Werther)
Indexed 60400 movies (last Children of Nature)
Indexed 60500 movies (last Mei and the Kittenbus)
Indexed 60600 movies (last The Overcoat)
Indexed 60700 movies (last The Art of Crying)
Indexed 60800 movies (last Hanussen)
Indexed 60900 movies (last Chance Pe Dance)
Indexed 61000 movies (last Under the Sun of Satan)
Indexed 61100 movies (last Made in Britain)
Indexed 61200 movies (last Scandal)
Indexed 61300 movies (last Goofy Movies Number One)
Indexed 61400 movies (last The Human Condition II: Road to Eternity)
Indexed 61500 movies (last Crows and Sparrows)
Indexed 61600 movies (last The Noose)
Indexed 61700 movies (last Ill Gotten Gains)
Indexed 61800 movies (last Asier ETA biok)
Indexed 61900 movies (last Murder by Proxy:  How America Went Postal)
Indexed 62000 movies (last Number O

## Listing 1, View Doc

In [2]:
client.get_doc(index='tmdb', doc_id='37799', fields=['title','tagline','overview','release_year'])

{'title': ['The Social Network'],
 'overview': ['On a fall night in 2003, Harvard undergrad and computer programming genius Mark Zuckerberg sits down at his computer and heatedly begins working on a new idea. In a fury of blogging and programming, what begins in his dorm room as a small site among friends soon becomes a global social network and a revolution in communication. A mere six years and 500 million friends later, Mark Zuckerberg is the youngest billionaire in history... but for this entrepreneur, success leads to both personal and legal complications.'],
 'tagline': ["You don't get to 500 million friends without making a few enemies."],
 'release_year': 2010}

## Manual boosting

One 'generalizable' relevance solution that gets at the long tail is a manually derivved relevance function

In [3]:
q="""title:({keywords})^10
     overview:({keywords})^20
     {{!func}}release_year^0.01"""

q = q.format(keywords='mark zuckerberg college')

solr_q = {'defType': 'edismax',
          'fl': 'title ',
          'q': q}

client.query(index='tmdb', query=solr_q)

[{'title': ['The Social Network']},
 {'title': ['Waxwork']},
 {'title': ['Mark Twain']},
 {'title': ['College Girls']},
 {'title': ['College']},
 {'title': ['Six: The Mark Unleashed']},
 {'title': ['Mark Shoots First']},
 {'title': ['College Swing']},
 {'title': ['The Adventures of Mark Twain']},
 {'title': ['College Humor']}]

In [4]:
q="""title:({keywords})^{ti_bm25_weight}
     overview:({keywords})^{ov_bm25_weight}
     {{!func}}release_year^{release_year_weight}"""

q = q.format(ti_bm25_weight=10,
             ov_bm25_weight=20,
             release_year_weight=0.01,
             keywords='mark zuckerberg college')

solr_q = {'defType': 'edismax',
          'fl': 'title',
          'q': q}

client.query(index='tmdb', query=solr_q)

[{'title': ['The Social Network']},
 {'title': ['Waxwork']},
 {'title': ['Mark Twain']},
 {'title': ['College Girls']},
 {'title': ['College']},
 {'title': ['Six: The Mark Unleashed']},
 {'title': ['Mark Shoots First']},
 {'title': ['College Swing']},
 {'title': ['The Adventures of Mark Twain']},
 {'title': ['College Humor']}]

In [5]:

def judg_csv(judgment):
    from ltr.helpers.movies import get_movie
    judgedMovie = get_movie(movies='data/tmdb_ai_pow_search.json', tmdb_id=judgment.docId)
    return "{grade},'{title}',{keywords}".format(grade=judgment.grade,
                                                 title=judgedMovie['title'],
                                                 keywords=judgment.keywords)

# Make a baby judgment list for book display
from ltr.judgments import judgments_from_file
to_sample={11:[0,1,6,9], # qid->rows in qid to sample
           40:[0,5,12,7,20]}

mini_judg_list=[]

from itertools import groupby
judgment_dict={}
new_qid=1
with open('data/title_judgments_binary.txt') as f:
    for qid, query_judgments in groupby(judgments_from_file(f), key=lambda j: j.qid):
        if qid in to_sample.keys():
            query_judgments = [j for j in query_judgments]
            for row in to_sample[qid]:
                query_judgments[row].qid=new_qid
                mini_judg_list.append(query_judgments[row])
            new_qid+=1

mini_judg_list

Recognizing 65 queries...


[Judgment(grade=1,qid=1,keywords=social network,docId=37799,features=[],weight=1,
 Judgment(grade=0,qid=1,keywords=social network,docId=267752,features=[],weight=1,
 Judgment(grade=0,qid=1,keywords=social network,docId=38408,features=[],weight=1,
 Judgment(grade=0,qid=1,keywords=social network,docId=28303,features=[],weight=1,
 Judgment(grade=1,qid=2,keywords=star wars,docId=11,features=[],weight=1,
 Judgment(grade=1,qid=2,keywords=star wars,docId=1892,features=[],weight=1,
 Judgment(grade=0,qid=2,keywords=star wars,docId=54138,features=[],weight=1,
 Judgment(grade=0,qid=2,keywords=star wars,docId=85783,features=[],weight=1,
 Judgment(grade=0,qid=2,keywords=star wars,docId=325553,features=[],weight=1]

In [6]:
def judg_csv(judgment):
    judgedMovie = get_movie(movies='data/tmdb_ai_pow_search.json', tmdb_id=judgment.docId)
    return "{grade},'{title}',{keywords}".format(grade=judgment.grade,
                                                 title=judgedMovie['title'],
                                                 keywords=judgment.keywords)

### As CSV

In [7]:
for judgment in mini_judg_list:
    print(judg_csv(judgment))

1,'The Social Network',social network
0,'#chicagoGirl',social network
0,'Life As We Know It',social network
0,'The Cheyenne Social Club',social network
1,'Star Wars',star wars
1,'Return of the Jedi',star wars
0,'Star Trek Into Darkness',star wars
0,'The Star',star wars
0,'Battlestar Galactica',star wars


### Dump the file...

In [8]:
from ltr.judgments import judgments_to_file
from io import StringIO

string_f = StringIO()
judgments_to_file(string_f, judgmentsList=mini_judg_list)

print(string_f.getvalue())

# qid:1: social network*1
# qid:2: star wars*1

1	qid:1	 # 37799	social network
0	qid:1	 # 267752	social network
0	qid:1	 # 38408	social network
0	qid:1	 # 28303	social network
1	qid:2	 # 11	star wars
1	qid:2	 # 1892	star wars
0	qid:2	 # 54138	star wars
0	qid:2	 # 85783	star wars
0	qid:2	 # 325553	star wars



### Same plausible features on each

In [9]:
# Setup some features for this dummy dataset
client.reset_ltr(index='tmdb')

ftr_config = [
    #1
    {
      "name" : "title_bm25",
      "store": "dummy",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:(${keywords})"
      }
    },
    #2
    {
      "name" : "overview_bm25",
      "store": "dummy",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:(${keywords})"
      }
    },
    {#3
      "name" : "release_year",
      "store": "dummy",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!func}def(release_year,2000)"
      }
    }

]


judgments_string=string_f.getvalue()
client.create_featureset(index='tmdb', name='dummy', ftr_config=ftr_config)

from ltr.judgments import judgments_reader
from ltr.log import FeatureLogger

ftr_logger=FeatureLogger(client, index='tmdb', feature_set='dummy')
with judgments_reader(StringIO(judgments_string)) as judgments:
    for qid, query_judgments in groupby(judgments, key=lambda j: j.qid):
        ftr_logger.log_for_qid(qid=qid,
                               keywords=judgments.keywords(qid),
                               judgments=query_judgments)

Deleted dummy Featurestore [Status: 200]
Deleted movie Featurestore [Status: 200]
Deleted test Featurestore [Status: 200]
Created dummy feature store under tmdb: [Status: 200]
Recognizing 2 queries...
Searching tmdb [Status: 200]
Discarded 0 Keep 4
Searching tmdb [Status: 200]
Discarded 0 Keep 5


## Dump the training set

In [10]:
from ltr.judgments import judgments_writer
from io import StringIO

string_f = StringIO()
with judgments_writer(string_f) as writer:
    for j in ftr_logger.logged:
        writer.write(j)

print(string_f.getvalue())

# qid:1: social network*1
# qid:2: star wars*1

1	qid:1	1:18.135925	2:8.391596	3:2010.0 # 37799	social network
0	qid:1	1:0.0	2:13.237938	3:2013.0 # 267752	social network
0	qid:1	1:0.0	2:9.576859	3:2010.0 # 38408	social network
0	qid:1	1:7.5430527	2:6.839079	3:1970.0 # 28303	social network
1	qid:2	1:14.951998	2:0.0	3:1977.0 # 11	star wars
1	qid:2	1:0.0	2:4.3300323	3:1983.0 # 1892	star wars
0	qid:2	1:5.377082	2:0.0	3:2013.0 # 54138	star wars
0	qid:2	1:7.01165	2:0.0	3:1952.0 # 85783	star wars
0	qid:2	1:0.0	2:0.0	3:2003.0 # 325553	star wars



In [11]:
import requests

feature_set = [
    {
      "name" : "title_bm25",
      "store": "movie",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : { #q=title:({$keywords})
        "q" : "title:(${keywords})"
      }
    },
    {
      "name" : "overview_bm25",
      "store": "movie",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:(${keywords})"
      }
    },
    {
      "name" : "vote_average",
      "store": "movie",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!func}vote_average"
      }
    }
   
]

requests.put('http://localhost:8983/solr/tmdb/schema/feature-store',
             json=feature_set)

<Response [200]>

In [12]:
logging_solr_query = {
    "fl": "id,title,[features store=movie efi.keywords=\"social network\"]",
    'q': "id:37799 OR id:267752 id:38408 OR id:28303", #social network graded documents
    'rows': 10,
    'wt': 'json'  
}

resp = requests.post('http://localhost:8983/solr/tmdb/select',
                     data=logging_solr_query)
resp.json()

{'responseHeader': {'status': 0,
  'QTime': 1,
  'params': {'q': 'id:37799 OR id:267752 id:38408 OR id:28303',
   'fl': 'id,title,[features store=movie efi.keywords="social network"]',
   'rows': '10',
   'wt': 'json'}},
 'response': {'numFound': 4,
  'start': 0,
  'docs': [{'id': '38408',
    'title': ['Life As We Know It'],
    '[features]': 'title_bm25=0.0,overview_bm25=9.576859,vote_average=6.8'},
   {'id': '28303',
    'title': ['The Cheyenne Social Club'],
    '[features]': 'title_bm25=7.5430527,overview_bm25=6.839079,vote_average=6.4'},
   {'id': '37799',
    'title': ['The Social Network'],
    '[features]': 'title_bm25=18.135925,overview_bm25=8.391596,vote_average=7.3'},
   {'id': '267752',
    'title': ['#chicagoGirl'],
    '[features]': 'title_bm25=0.0,overview_bm25=13.237938,vote_average=7.0'}]}}

In [1]:
from ltr.client.solr_client import SolrClient

client = SolrClient(host='http://localhost:8983/solr')

In [3]:
from ltr.judgments import Judgment

Judgment(qid=1,keywords='social network',doc_id=37799,grade=1)

Judgment(grade=1,qid=1,keywords=social network,doc_id=37799,features=[],weight=1

```
1	qid:1	1:18.135925	2:8.391596	3:2010.0 # 37799	social network
0	qid:1	1:0.0	2:13.237938	3:2013.0 # 267752	social network
0	qid:1	1:0.0	2:9.576859	3:2010.0 # 38408	social network
0	qid:1	1:7.5430527	2:6.839079	3:1970.0 # 28303	social network
1	qid:2	1:14.951998	2:0.0	3:1977.0 # 11	star wars
1	qid:2	1:0.0	2:4.3300323	3:1983.0 # 1892	star wars
0	qid:2	1:5.377082	2:0.0	3:2013.0 # 54138	star wars
0	qid:2	1:7.01165	2:0.0	3:1952.0 # 85783	star wars
0	qid:2	1:0.0	2:0.0	3:2003.0 # 325553	star wars
```


In [8]:
mini_judg_list=[
    Judgment(grade=1, qid=1, keywords='social network', doc_id=37799),
    Judgment(grade=0, qid=1, keywords='social network', doc_id=267752),
    Judgment(grade=0, qid=1, keywords='social network', doc_id=38408),
    Judgment(grade=0, qid=1, keywords='social network', doc_id=28303),
    Judgment(grade=1, qid=2, keywords='star wars', doc_id=11),
    Judgment(grade=1, qid=2, keywords='star wars', doc_id=1892),
    Judgment(grade=0, qid=2, keywords='star wars', doc_id=54138),
    Judgment(grade=0, qid=2, keywords='star wars', doc_id=85783),
    Judgment(grade=0, qid=2, keywords='star wars', doc_id=325553),    
]

from ltr.judgments import judgments_writer
from io import StringIO

string_file = StringIO()
with judgments_writer(string_file) as writer:
    for j in mini_judg_list:
        writer.write(j)

print(string_f.getvalue())




# qid:1: social network*1
# qid:2: star wars*1

1	qid:1	 # 37799	social network
0	qid:1	 # 267752	social network
0	qid:1	 # 38408	social network
0	qid:1	 # 28303	social network
1	qid:2	 # 11	star wars
1	qid:2	 # 1892	star wars
0	qid:2	 # 54138	star wars
0	qid:2	 # 85783	star wars
0	qid:2	 # 325553	star wars



In [10]:
mini_judg_list[0].features

[]

In [14]:
from ltr.log import FeatureLogger
from ltr.judgments import judgments_open
from itertools import groupby

ftr_logger=FeatureLogger(client, index='tmdb', feature_set='movie')
for qid, query_judgments in groupby(mini_judg_list, key=lambda j: j.qid):
    ftr_logger.log_for_qid(judgments=query_judgments, 
                           qid=qid)
        
ftr_logger.logged

Searching tmdb [Status: 200]
Discarded 0 Keep 4
Searching tmdb [Status: 200]
Discarded 0 Keep 5


[Judgment(grade=1,qid=1,keywords=social network,docId=37799,features=[18.135925, 8.391596, 7.3],weight=1,
 Judgment(grade=0,qid=1,keywords=social network,docId=267752,features=[0.0, 13.237938, 7.0],weight=1,
 Judgment(grade=0,qid=1,keywords=social network,docId=38408,features=[0.0, 9.576859, 6.8],weight=1,
 Judgment(grade=0,qid=1,keywords=social network,docId=28303,features=[7.5430527, 6.839079, 6.4],weight=1,
 Judgment(grade=1,qid=2,keywords=star wars,docId=11,features=[14.951998, 0.0, 8.2],weight=1,
 Judgment(grade=1,qid=2,keywords=star wars,docId=1892,features=[0.0, 4.3300323, 8.0],weight=1,
 Judgment(grade=0,qid=2,keywords=star wars,docId=54138,features=[5.377082, 0.0, 7.3],weight=1,
 Judgment(grade=0,qid=2,keywords=star wars,docId=85783,features=[7.01165, 0.0, 7.5],weight=1,
 Judgment(grade=0,qid=2,keywords=star wars,docId=325553,features=[0.0, 0.0, 8.1],weight=1]

In [16]:
from ltr import download
from ltr.log import FeatureLogger
from ltr.judgments import judgments_open
from itertools import groupby

judgments='http://es-learn-to-rank.labs.o19s.com/title_judgments_binary.txt'
download([judgments], dest='data/')

ftr_logger=FeatureLogger(client, index='tmdb', feature_set='movie')

with judgments_open('data/title_judgments.txt') as judgment_list:
    for qid, query_judgments in groupby(judgment_list, key=lambda j: j.qid):
        ftr_logger.log_for_qid(judgments=query_judgments, 
                               qid=qid,
                               keywords=judgment_list.keywords(qid))
        
ftr_logger.logged

data/title_judgments_binary.txt already exists
Recognizing 40 queries...
Searching tmdb [Status: 200]
Discarded 0 Keep 41
Searching tmdb [Status: 200]
Discarded 0 Keep 41
Searching tmdb [Status: 200]
Discarded 0 Keep 39
Searching tmdb [Status: 200]
Discarded 0 Keep 28
Searching tmdb [Status: 200]
Discarded 0 Keep 33
Searching tmdb [Status: 200]
Discarded 0 Keep 39
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 38
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 28
Searching tmdb [Status: 200]
Discarded 0 Keep 34
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 30
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep

[Judgment(grade=4,qid=1,keywords=rambo,docId=7555,features=[13.038148, 11.173398, 6.5],weight=1,
 Judgment(grade=3,qid=1,keywords=rambo,docId=1370,features=[11.056428, 12.652582, 6.0],weight=1,
 Judgment(grade=3,qid=1,keywords=rambo,docId=1369,features=[7.593794, 10.758981, 6.5],weight=1,
 Judgment(grade=2,qid=1,keywords=rambo,docId=13258,features=[0.0, 10.096009, 6.7],weight=1,
 Judgment(grade=4,qid=1,keywords=rambo,docId=1368,features=[0.0, 11.867074, 7.4],weight=1,
 Judgment(grade=1,qid=1,keywords=rambo,docId=31362,features=[0.0, 8.33506, 6.0],weight=1,
 Judgment(grade=1,qid=1,keywords=rambo,docId=61410,features=[0.0, 4.6697874, 6.9],weight=1,
 Judgment(grade=0,qid=1,keywords=rambo,docId=319074,features=[0.0, 0.0, 6.0],weight=1,
 Judgment(grade=0,qid=1,keywords=rambo,docId=10296,features=[0.0, 0.0, 6.4],weight=1,
 Judgment(grade=0,qid=1,keywords=rambo,docId=35868,features=[0.0, 0.0, 5.4],weight=1,
 Judgment(grade=0,qid=1,keywords=rambo,docId=131457,features=[0.0, 0.0, 6.3],weight=1,