# Quora Question pairs


This notebook contains the functions built by *Jordi Segura*.

---
---

In [3]:
import pandas as pd
import scipy
import numpy as np
import os
import re # To do ReGeX
# !pip install spacy
import spacy # For NER
nlp = spacy.load('en_core_web_sm')
# !pip install fuzzywuzzy
from fuzzywuzzy import fuzz # To compute similiraties.




In [4]:
# Home directory of the current user (not referenced by the cells shown here).
path_data =  os.path.expanduser('~') 

# Training split, read from the current working directory:
# use this to train and VALIDATE your solution
train_df = pd.read_csv("./quora_train_data.csv")

# Test split, read from the current working directory:
# use this to provide the expected generalization results
test_df = pd.read_csv("./quora_test_data.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1


## Text Normalization
Text normalization refers to the process of transforming raw text data into a standardized form, which can include tasks such as converting all text to lowercase, replacing contractions with their expanded form, and replacing common abbreviations or acronyms with their full form.

In [6]:
# Drop (in place) the rows where either question is missing: the cells below
# call string methods such as .split() on both question columns.
train_df.dropna(subset=['question1', 'question2'], inplace=True)

### NER - Named Entity Recognition

We chose these entity types because they are commonly relevant to the types of questions and answers in the Quora competition.

- PERSON: Refers to names of people, which may be important in questions and answers that involve people or personalities.
- GPE (Geo-Political Entity): Refers to names of countries, cities, and other geopolitical entities, which may be important in questions and answers that involve locations or politics.
- LOC (Location): Refers to other location names, which may be important in questions and answers that involve places or travel.
- DATE: Refers to dates, which may be important in questions and answers that involve historical events, schedules, or timeframes.
- TIME: Refers to times, which may be important in questions and answers that involve schedules or specific moments.
- MONEY: Refers to monetary values, which may be important in questions and answers that involve finance or pricing.
- ORG: Refers to names of organizations or companies, which may be important in questions and answers that involve business or industries.

In [7]:
def mask_entities(text, labels=('PERSON', 'GPE', 'LOC', 'DATE', 'TIME', 'MONEY', 'ORG')):
    """
    Masks named entities with their entity label, e.g. 'Paris' -> '<GPE>'.

    Only the exact character spans that the spaCy pipeline (`nlp`) recognised
    as entities are replaced. Other occurrences of the same characters (for
    example inside a longer word) are left untouched.

    Args:
    text (str): The input text to be masked.
    labels (iterable of str): Entity labels to mask. Defaults to PERSON, GPE,
        LOC, DATE, TIME, MONEY and ORG.

    Returns:
    str: The text with the selected named entities replaced by '<LABEL>'.
    """
    doc = nlp(text)
    # Entity offsets refer to the text stored on the Doc.
    source = doc.text
    pieces = []
    cursor = 0
    # Rebuild the string left to right, splicing a placeholder over each
    # selected entity span. This avoids str.replace, which would also rewrite
    # unrelated occurrences of the entity text.
    for ent in doc.ents:
        if ent.label_ in labels:
            pieces.append(source[cursor:ent.start_char])
            pieces.append(f'<{ent.label_}>')
            cursor = ent.end_char
    pieces.append(source[cursor:])
    return ''.join(pieces)

In [12]:
# Sanity check on a single question: print the original next to its masked version.
# Note: masking probably works better on the original casing (i.e. without lowercasing first).
print(f"From:=>\n  {train_df.loc[305]['question2']} <-TO-> {mask_entities(train_df.loc[305]['question2'])}")

From:=>
  Is the Pulsar 200 NS officially discontinued? <-TO-> Is the <ORG> 200 <GPE> officially discontinued?


In [13]:
# Mask the named entities of every first question (one spaCy call per row).
train_df['question1_ner'] = train_df['question1'].apply(mask_entities)

In [None]:
# Mask the named entities of every second question (one spaCy call per row).
train_df['question2_ner'] = train_df['question2'].apply(mask_entities)

### Contractions and abbreviations

In [14]:
# Dictionary of common contractions and their expanded form.
# Keys and values are all lowercase because normalize_text lowercases the
# text before looking for contractions.
contractions_dict = {
    "ain't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Dictionary of common abbreviations and their full form.
# It is passed to normalize_text as its `abbreviations_dict` argument.
# NOTE(review): "e.g." and "i.e." contain '.', which is a regex metacharacter;
# any code that builds a regular expression from these keys must escape them.
abbreviations_dict = {
    "aka": "also known as",
    "asap": "as soon as possible",
    "btw": "by the way",
    "etc": "et cetera",
    "e.g.": "for example",
    "i.e.": "that is",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "thx": "thanks",
    "wtf": "what the fuck"
}

def normalize_text(text, contractions_dict, abbreviations_dict):
    """Lowercases a question and expands contractions and abbreviations.

    Keys of both dictionaries are matched literally (they are regex-escaped)
    and case-insensitively (they are lowercased, like the text), and only as
    whole tokens: a key is not replaced when it is glued to a word character.

    Args:
        text: The raw question. Non-string values (e.g. NaN for a missing
            question) are treated as empty text.
        contractions_dict (dict): Maps a contraction to its expanded form.
        abbreviations_dict (dict): Maps an abbreviation to its full form.

    Returns:
        str: The lowercased text with contractions, then abbreviations,
        replaced by their (lowercased) long form; '' if `text` is not a string.
    """
    if not isinstance(text, str):
        # Missing values (NaN) reach this point as floats.
        return ''

    text = text.lower()

    # Contractions are expanded first, then abbreviations.
    for mapping in (contractions_dict, abbreviations_dict):
        for short_form, long_form in mapping.items():
            # Lookarounds replace \b because \b fails after a key that ends in
            # punctuation (such as "e.g."); re.escape keeps '.' literal so
            # that "e.g." cannot match a word like "edge".
            pattern = r"(?<!\w)" + re.escape(short_form.lower()) + r"(?!\w)"
            replacement = long_form.lower()
            # A callable replacement is inserted verbatim (no backslash escapes).
            text = re.sub(pattern, lambda _match, _rep=replacement: _rep, text)

    return text

In [15]:
# Show the effect of normalization on a single training question.
demo_question = train_df['question1'][71]
demo_normalized = normalize_text(demo_question, contractions_dict, abbreviations_dict)
print(f"From:=>\n {demo_question} <-TO-> {demo_normalized}")

From:=>
 What's the best thing to ever happen to you? <-TO-> what is the best thing to ever happen to you?


In [95]:
# Normalize the entity-masked version of the first question.
train_df['question1_norm'] = train_df['question1_ner'].apply(
    normalize_text, args=(contractions_dict, abbreviations_dict)
)

In [96]:
# Normalize the entity-masked version of the second question.
train_df['question2_norm'] = train_df['question2_ner'].apply(
    normalize_text, args=(contractions_dict, abbreviations_dict)
)

## Feature Engineering

In [42]:
def first_word_equal(row):
    """Computes whether the first word of the two questions are equal.

    Words are obtained by splitting on whitespace; the comparison is
    case-sensitive.

    Args:
        row: A pandas Series containing the 'question1' and 'question2' columns.

    Returns:
        1 if both questions have a first word and those words are equal,
        0 otherwise (including when either question contains no words).
    """
    q1_words = row['question1'].split()
    q2_words = row['question2'].split()
    # An empty or whitespace-only question has no first word to compare.
    if not q1_words or not q2_words:
        return 0
    return int(q1_words[0] == q2_words[0])

def last_word_equal(row):
    """Computes whether the last word of the two questions are equal.

    Words are obtained by splitting on whitespace, so trailing punctuation is
    part of the last word; the comparison is case-sensitive.

    Args:
        row: A pandas Series containing the 'question1' and 'question2' columns.

    Returns:
        1 if both questions have a last word and those words are equal,
        0 otherwise (including when either question contains no words).
    """
    q1_words = row['question1'].split()
    q2_words = row['question2'].split()
    # An empty or whitespace-only question has no last word to compare.
    if not q1_words or not q2_words:
        return 0
    return int(q1_words[-1] == q2_words[-1])

def common_words_count(row):
    """Counts the distinct whitespace-separated words shared by both questions.

    Args:
        row: A pandas Series containing the 'question1' and 'question2' columns.

    Returns:
        An integer: the size of the intersection of the two questions' word sets.
    """
    vocab_q1 = set(row['question1'].split())
    vocab_q2 = set(row['question2'].split())
    return len(vocab_q1 & vocab_q2)

def common_words_ratio(row):
    """Computes the ratio of common words between the two questions to the total number of words in both questions.

    The numerator counts distinct shared words; the denominator counts every
    whitespace-separated word of both questions (repetitions included).

    Args:
        row: A pandas Series containing the 'question1' and 'question2' columns.

    Returns:
        A float value indicating the ratio of common words between the two
        questions to the total number of words in both questions, or 0.0 when
        neither question contains any word.
    """
    q1_words = row['question1'].split()
    q2_words = row['question2'].split()
    total_words = len(q1_words) + len(q2_words)
    # Two empty questions share nothing; avoid dividing by zero.
    if total_words == 0:
        return 0.0
    common_words = set(q1_words) & set(q2_words)
    return len(common_words) / total_words

def fuzz_ratio(row):
    """Scores the similarity of the two questions with fuzzywuzzy's `fuzz.ratio`.

    Args:
        row: A pandas Series containing the 'question1' and 'question2' columns.

    Returns:
        The score returned by `fuzz.ratio` for the two question strings.
    """
    first_question, second_question = row['question1'], row['question2']
    return fuzz.ratio(first_question, second_question)

def longest_substring_ratio(row):
    """Computes the ratio of the length of the longest common substring between the two questions to the length of the shorter question.

    The longest common substring is the longest run of consecutive characters
    that appears in both questions (case-sensitive, spaces included).

    Args:
        row: A pandas Series containing the 'question1' and 'question2' columns.

    Returns:
        A float in [0.0, 1.0]: the length of the longest common substring
        divided by the length of the shorter question. Returns 0.0 when the
        shorter question is empty.
    """
    # Local import keeps this function self-contained in the notebook.
    from difflib import SequenceMatcher

    q1 = row['question1']
    q2 = row['question2']
    shorter_len = min(len(q1), len(q2))
    # An empty question cannot share a substring; also avoids dividing by zero.
    if shorter_len == 0:
        return 0.0
    # autojunk=False disables difflib's "popular element" heuristic, which
    # would otherwise ignore frequent characters in long strings.
    matcher = SequenceMatcher(None, q1, q2, autojunk=False)
    longest = matcher.find_longest_match(0, len(q1), 0, len(q2))
    return longest.size / shorter_len


In [16]:
# `df` is a second name for the same DataFrame object as `train_df` (no copy is
# made), so columns added through `df` also appear on `train_df`.
df = train_df

In [18]:
# Compute the extra features row by row; each new column is named after the
# function that produces it.
cheap_features = {
    'first_word_equal': first_word_equal,
    'last_word_equal': last_word_equal,
    'common_words_count': common_words_count,
    'common_words_ratio': common_words_ratio,
}
for column_name, feature_fn in cheap_features.items():
    df[column_name] = df.apply(feature_fn, axis=1)

# EXPENSIVE
expensive_features = {
    'fuzz_ratio': fuzz_ratio,
    'longest_substring_ratio': longest_substring_ratio,
}
for column_name, feature_fn in expensive_features.items():
    df[column_name] = df.apply(feature_fn, axis=1)
