In [5]:
import elasticsearch_dsl
from elasticsearch import Elasticsearch

client = Elasticsearch(hosts=['10.0.3.33:9200'])

In [116]:
import uuid
from faker import Faker
import random
fake = Faker()

student = {
    "id": uuid.uuid4(),
    "name": fake.name(),
    "rural": True,
    "underrepresented": True,
    "timezone": random.randint(-8, 4),
    "interestCompanies": ['Microsoft', "Google", fake.company(), fake.company()],
    "interestTags": ["javascript", "java", "python", "php"],
    "requireExtended": False
}

In [7]:
# background_rural = {
#     "term": {
#         "backgroundRural": {
#             "value": True
#         }
#     }
# }
#
# query_body = {
#     "query": {
#         background_rural
#     }
# }
#
# result = client.search(index="mentors_index", body=query_body)
# hits = result["hits"]["hits"

In [127]:
from elasticsearch_dsl import Q, SF
from elasticsearch_dsl.query import MatchNone, MatchAll

s = elasticsearch_dsl.Search(using=client, index='mentors_index').highlight("backgroundRural").extra(explain=True)

# Adds one to all queries in order to be sure that, in the worst case, there are enough responses.
base_value = Q("constant_score", filter=MatchAll())

# Uses a fuzzy query to determine if a person works for a company in the student's list, then if so adds to the score
company_q = None
for company in student["interestCompanies"]:
    if company_q is None:
        company_q = Q("function_score", query=Q("fuzzy", company=company), weight="1", boost_mode="replace")
    else:
        company_q = (company_q | Q("function_score", query=Q("fuzzy", company=company), weight="1", boost_mode="replace"))


# If background_rural matches on mentor and student, then adds one to the query
background_rural = Q("constant_score", filter=Q("term", backgroundRural=student['rural']))

# Adds number of matching tags to score
tags_matching = None
for interest in student['interestTags']:
    if tags_matching is None:
        tags_matching = Q("function_score", query=Q("term", proj_tags=interest), weight=1, boost_mode="replace")
    else:
        tags_matching = (tags_matching | Q("function_score", query=Q("term", proj_tags=interest), weight=1, boost_mode="replace"))

# If student is underrepresented, adds the value of prefer_student_underrep to the query
if student["underrepresented"]:
    prefer_student_underrep = Q({
        "function_score": {
            "field_value_factor" : {
                "field": "preferStudentUnderRep",
                "factor": 1,
                "modifier": "none",
                "missing": 0
            }
        }
    })
else:
    # Adds 0 to query if nothing is found
    prefer_student_underrep = Q("constant_score", filter=MatchNone())

if student["requireExtended"]:
    requireExtended = Q("term", okExtended=True)
    combined_query = ((base_value | tags_matching | company_q | background_rural | prefer_student_underrep) & requireExtended)
else:
    combined_query = (base_value | tags_matching | company_q | background_rural | prefer_student_underrep)


# Timezone - see docstring for python equivalent. Should be multiplied by the final score.
"""
if mentor['okTimezoneDifference']:
    if 16 < student['timezone'] < 22:
        return True
    return false
else:
    if abs(student['timezone'] - mentor['timezone']) < 3:
        return True
    return False
"""
combined_query = Q("function_score", query=combined_query, functions=[SF("script_score", script={"source": """
int student_tz = params.student_tz;
int mentor_tz = 0;
// Null check. Even though timezone is required, somehow some null rows snuck in and bamboozled me
if (doc['timezone'].size() == 0) {
    mentor_tz = 0;
} else {
    mentor_tz = (int)doc['timezone'].value;
}
int diff = student_tz - mentor_tz;

boolean mentor_ok_tz_diff = false;
if (doc['okTimezoneDifference'].size() == 0) {
    mentor_ok_tz_diff = false;
} else {
    mentor_ok_tz_diff = doc['okTimezoneDifference'].value;
}

if (mentor_ok_tz_diff == true) {
    if (student_tz < 22) {
        if (student_tz > 16) {
            // Mentor is OK with the time difference and student has a large time difference
            return 1;
        }
    } else {
        // Mentor is ok with time difference and student has a normal time
        return 0.75;
    }
} else {
    if (diff <= 3) {
        // Mentor is not ok with time difference and student has normal time
        return 1;
    } else {
        // Mentor is not ok with time difference and student has weird time
        return 0;
    }
}
""", "params": {"student_tz": student['timezone']}})], boost_mode="multiply", score_mode="sum")


s = s.query(combined_query)
print(s.to_dict())
resp = s.execute()
hits = resp["hits"]

{'query': {'function_score': {'query': {'bool': {'should': [{'function_score': {'query': {'term': {'proj_tags': 'javascript'}}, 'weight': 1, 'boost_mode': 'replace'}}, {'function_score': {'query': {'term': {'proj_tags': 'java'}}, 'weight': 1, 'boost_mode': 'replace'}}, {'function_score': {'query': {'term': {'proj_tags': 'python'}}, 'weight': 1, 'boost_mode': 'replace'}}, {'function_score': {'query': {'term': {'proj_tags': 'php'}}, 'weight': 1, 'boost_mode': 'replace'}}, {'constant_score': {'filter': {'match_all': {}}}}, {'function_score': {'query': {'fuzzy': {'company': 'Microsoft'}}, 'weight': '1', 'boost_mode': 'replace'}}, {'function_score': {'query': {'fuzzy': {'company': 'Google'}}, 'weight': '1', 'boost_mode': 'replace'}}, {'function_score': {'query': {'fuzzy': {'company': 'Peterson, Bonilla and Washington'}}, 'weight': '1', 'boost_mode': 'replace'}}, {'function_score': {'query': {'fuzzy': {'company': 'Horn Inc'}}, 'weight': '1', 'boost_mode': 'replace'}}, {'constant_score': {'fi