# OFFER RECOMMENDER PROTOTYPE

This notebook contains a prototype for an offer recommender which:

- Trains a model using the data in the skills data set.
- Uses the trained model to recommend offers from the offers data set.

In [None]:
from collections import Counter

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans

# Settings

Files containing the data sets and other subjective parameters:

**NOTE**: To speed up the process, the `NUMBER_OF_CATEGORIES` and the `NUMBER_OF_CLUSTERS` parameters can be reduced. This, however, leads to worse results.

In [None]:
# Path of the pipe-delimited CSV with the skills used to train the model.
FILE_SKILLS = "data/candidatetest_df_off_sk.csv"
# Path of the pipe-delimited CSV with the offers that can be recommended.
FILE_OFFERS = "data/candidatetest_df_off_fixed.csv"
# Characters stripped from the beginning of every raw skill string.
PATTERNS_BEGINNING = [" ", "+", "?", "#", "$", "-", "(", "*"]
# Characters stripped from the end of every raw skill string.
PATTERNS_END = [" ", ")"]
# Number of most frequent words kept as one-hot features.
NUMBER_OF_CATEGORIES = 100
# Number of clusters requested from the KMeans model.
NUMBER_OF_CLUSTERS = 200
# Maximum number of recommended offers returned for a new offer.
NUMBER_OF_SIMILAR_OFFERS = 10

# Functions

In [None]:
def ReadOffers(file):
    '''
    Read the file containing the fixed offers data set.
    Args:
    - file: File with the fixed offers data set (pipe-delimited, latin1 encoded).
    Returns:
    - Pandas dataframe with tidy offers data set: rows with a broken requirement
      are removed and the "studies" column is text without ";" characters.
    '''
    # Keep "id" as text so identifiers are not reinterpreted as numbers.
    data = pd.read_csv(file, delimiter = "|", encoding = "latin1", dtype = {"id": str})
    # Drop rows whose requirement is the broken placeholder below (presumably a
    # spreadsheet name error as it looks after latin1 decoding - TODO confirm).
    # The explicit copy makes the filtered result an independent dataframe, so
    # the column assignment that follows does not trigger SettingWithCopyWarning.
    data = data.loc[data["requirement"] != "#Â¿NOMBRE?", :].copy()
    # str() is applied to every value, so missing studies become the text "nan".
    data["studies"] = [str(s).replace(";", "") for s in data["studies"]]
    return data

def ReadSkills(file):
    '''
    Load the original skills data set and give its columns friendly names.
    Args:
    - file: Path of the pipe-delimited, latin1 encoded skills file.
    Returns:
    - Pandas dataframe where column "O_ID" is renamed to "id" and column
      "OSK_NOMBRE" is renamed to "skill".
    '''
    columnNames = {"O_ID": "id", "OSK_NOMBRE": "skill"}
    rawSkills = pd.read_csv(file, sep = "|", encoding = "latin1")
    return rawSkills.rename(columns = columnNames)

def CleanSkill(skill, patternsBeginning = None, patternsEnd = None, lowerCase = True, stopWords = None):
    '''
    Perform cleaning to a skill by removing meaningless parts and using only lowercase.
    Args:
    - skill: Skill to clean. It is converted with str(), so any value is accepted.
    - patternsBeginning: List of patterns to remove from the beginning of the skill.
      Every character contained in the patterns is stripped. Defaults to none.
    - patternsEnd: List of patterns to remove from the end of the skill.
      Every character contained in the patterns is stripped. Defaults to none.
    - lowerCase: Whether the skill is converted to lowercase. Defaults to True.
    - stopWords: Collection of words to eliminate (compared after the optional
      lowercase conversion). Defaults to none.
    Returns:
    - String with the skill clean.
    '''
    # None replaces the former mutable "[]" defaults; behaviour is unchanged.
    charsBeginning = "".join(patternsBeginning or [])
    charsEnd = "".join(patternsEnd or [])
    if stopWords is None:
        stopWords = ()
    skillClean = str(skill)
    # Stripping with an empty character set leaves the string untouched.
    skillClean = skillClean.lstrip(charsBeginning)
    skillClean = skillClean.rstrip(charsEnd)
    if lowerCase:
        skillClean = skillClean.lower()
    skillClean = " ".join([x for x in skillClean.split(" ") if x not in stopWords])
    # NOTE(review): str.replace is literal, so this removes a "+" that follows a
    # space; it does not collapse repeated spaces. Kept as is - confirm the intent.
    skillClean = skillClean.replace(" +", " ")
    return skillClean.strip()

def CleanSkills(df, patternsBeginning, patternsEnd):
    '''
    Create a new column to a Pandas dataframe with clean skills.
    The column is added to the received dataframe itself (it is modified in
    place), and that same dataframe is returned.
    Args:
    - df: Pandas dataframe with a column called "skill".
    - patternsBeginning: List of patterns to remove from the beginning of the skill.
    - patternsEnd: List of patterns to remove from the end of the skill.
    Returns:
    - Pandas dataframe with a new column called "skill_clean".
    '''
    # Every word of every skill is tested against the stop words, so they are
    # gathered once into a set: membership tests no longer scan a long list.
    stopWords = frozenset(stopwords.words("spanish")).union(stopwords.words("english"))
    df["skill_clean"] = df["skill"].apply(lambda x: CleanSkill(x,
                                                               patternsBeginning = patternsBeginning,
                                                               patternsEnd = patternsEnd,
                                                               lowerCase = True,
                                                               stopWords = stopWords))
    return df

def CreateBagOfWords(listOfStrings):
    '''
    Create a Pandas dataframe containing the individual words present in the skills and
    the amount of times they appear.
    Args:
    - listOfStrings: List with all the skills.
    Returns:
    - Pandas dataframe with columns "word" and "occurrences", sorted from the most
      to the least frequent word. Words with the same number of occurrences keep
      the order in which they were first seen. Empty words are not counted.
    '''
    counts = Counter()
    for string in listOfStrings:
        for word in string.split(" "):
            word = word.lstrip("(").rstrip(")")
            # Consecutive spaces or fully stripped tokens produce "", which is
            # not a word and must not compete for a place among the categories.
            if word:
                counts[word] += 1
    # most_common() already sorts by descending count with a stable tie order.
    return pd.DataFrame(counts.most_common(), columns = ["word", "occurrences"])

def FeatureEngineering(df, categories):
    '''
    Create one-hot encoded variables based on the provided categories.
    Args:
    - df: Pandas dataframe with columns called "id" and "skill_clean". It is not modified.
    - categories: List of words to create the features.
    Returns:
    - Pandas dataframe with one row per id (sorted by id), an "id" column and one
      column per category holding 1 when any skill of that id contains the word
      and 0 otherwise.
    '''
    categories = list(categories)
    # Split every skill once; whole-word matching (not substring matching).
    wordsPerSkill = [set(skill.split(" ")) for skill in df["skill_clean"]]
    # Build all the feature columns in one go. Only "id" and the categories are
    # kept, so text columns never reach the aggregation and the amount of
    # feature columns always follows the categories actually received.
    features = pd.DataFrame({c: [int(c in words) for words in wordsPerSkill] for c in categories},
                            index = df.index)
    features.insert(0, "id", df["id"].values)
    # The values are 0/1, so the maximum per id tells whether any skill matched.
    dfId = features.groupby("id").max().reset_index()
    return dfId

def PrepareDataForKMeans(df):
    '''
    Convert the features dataframe into the numeric matrix consumed by KMeans.
    Args:
    - df: Pandas dataframe with an "id" column plus all the model features.
    Returns:
    - Numpy float32 array with every column of the dataframe except "id".
    '''
    featuresOnly = df.drop(columns = ["id"])
    return np.array(featuresOnly, dtype = np.float32)

def FitModel(array, numberOfClusters):
    '''
    Build a KMeans model and fit it to the provided observations.
    Args:
    - array: Numpy array with the values of all the features.
    - numberOfClusters: Desired number of clusters to divide the observations in.
    Returns:
    - Fitted KMeans model.
    '''
    return KMeans(n_clusters = numberOfClusters).fit(array)
    
def TrainClusteringModel(df, patternsBeginning, patternsEnd, numberOfCategories, numberOfClusters):
    '''
    Given the skills data set, make all the data transformations and train
    the KMeans model algorithm. The received dataframe is not modified.
    Args:
    - df: Skills data frame.
    - patternsBeginning: List of patterns to remove from the beginning of the skills.
    - patternsEnd: List of patterns to remove from the end of the skills.
    - numberOfCategories: Number of features to create the model.
    - numberOfClusters: Desired number of clusters to divide the observations in.
    Returns:
    - Dictionary with the fitted model ("model") and the parameters needed to
      use it ("patternsBeginning", "patternsEnd" and "categories").
    '''
    skillsClean = CleanSkills(df.copy(), patternsBeginning, patternsEnd)
    wordCounts = CreateBagOfWords(skillsClean["skill_clean"])
    # Label based slice (both ends included) over the most frequent words.
    lastLabel = numberOfCategories - 1
    topWords = wordCounts.loc[0:lastLabel, "word"].tolist()
    features = PrepareDataForKMeans(FeatureEngineering(skillsClean, topWords))
    fittedModel = FitModel(features, numberOfClusters)
    return {
        "model": fittedModel,
        "patternsBeginning": patternsBeginning,
        "patternsEnd": patternsEnd,
        "categories": topWords,
    }

def AssignClusterToOffers(offersDf, modelDict):
    '''
    Turn the requirements of every offer into features and label each offer
    with the cluster predicted by the trained model.
    Args:
    - offersDf: Dataframe with offers (columns "id" and "requirement" are used).
    - modelDict: Dictionary with trained model and parameters used to compute it.
    Returns:
    - Pandas dataframe of offers with features and a "cluster" column.
    '''
    # Requirements play the role of skills, so they get the same column name.
    requirements = offersDf.copy()[["id", "requirement"]]
    requirements = requirements.rename(columns = {"requirement": "skill"})
    cleaned = CleanSkills(requirements, modelDict["patternsBeginning"], modelDict["patternsEnd"])
    offersFeatures = FeatureEngineering(cleaned, modelDict["categories"])
    matrix = PrepareDataForKMeans(offersFeatures)
    offersFeatures["cluster"] = modelDict["model"].predict(matrix)
    return offersFeatures

def AskForOffer():
    '''
    Prompt the user for the skills of a new offer.
    Args:
    - Nothing.
    Returns:
    - String with the skills exactly as typed by the user.
    '''
    return input("Add skills of the new offer separated by commas: ")

def AssignClusterToNewOffer(skillsString, modelDict):
    '''
    Process the new offer to make a prediction and assign a cluster to it.
    The requested skills are also printed on the screen.
    Args:
    - skillsString: Skills string entered by the user, separated by commas
      (with or without spaces around the commas).
    - modelDict: Dictionary with trained model and parameters used to compute it.
    Returns:
    - Pandas dataframe of new offer with features and cluster.
    '''
    # Split on the comma alone and trim each piece, so "a,b" and "a, b" are both
    # understood as two skills.
    skillsInputList = [piece.strip() for piece in skillsString.split(",")]
    # Ignore blanks left by stray commas, but keep one (empty) skill when nothing
    # was entered so that a single row is still available for the prediction.
    skillsInputList = [piece for piece in skillsInputList if piece] or [""]
    print("")
    print("Requested skills:")
    for i in skillsInputList:
        print("- " + i)
    print("")
    # All the skills belong to the same (placeholder) offer id.
    skillsInputDf = pd.DataFrame({"id": 42, "skill": skillsInputList})
    skillsInputDfClean = CleanSkills(skillsInputDf, modelDict["patternsBeginning"], modelDict["patternsEnd"])
    skillsInputDfExtended = FeatureEngineering(skillsInputDfClean, modelDict["categories"])
    array = PrepareDataForKMeans(skillsInputDfExtended)
    skillsInputDfExtended["cluster"] = modelDict["model"].predict(array)
    return skillsInputDfExtended

def CalculateDistances(offersDf, offer):
    '''
    Measure how far every offer of the data set is from a given offer and
    sort the offers from the closest to the farthest one.
    Args:
    - offersDf: Dataframe with offers and all the features. It is not modified.
    - offer: Dataframe with the new offer (its first row is used).
    Returns:
    - Pandas dataframe with the offers sorted by a new "distance" column (sum of
      squared feature differences) and the previous index kept in an "index" column.
    '''
    # Every column of the new offer is a feature except "id" and "cluster".
    featureNames = [name for name in offer.columns if name not in ("id", "cluster")]
    reference = offer[featureNames].values[0]
    ranked = offersDf.copy()
    squaredDifferences = (ranked[featureNames] - reference) ** 2
    ranked["distance"] = squaredDifferences.sum(axis = 1)
    return ranked.sort_values("distance").reset_index()
    
def FindSimilarOffers(offersWithCluster, modelDict, numberOfSimilarOffers):
    '''
    Ask for the skills of a new offer and return the most similar known offers.
    Only offers assigned to the same cluster as the new offer are considered.
    Args:
    - offersWithCluster: Pandas dataframe with offers, all features and cluster.
    - modelDict: Dictionary with trained model and parameters used to compute it.
    - numberOfSimilarOffers: Amount of offers to return.
    Returns:
    - Pandas series with the ids of the offers most similar to the entered one,
      closest first.
    '''
    newOffer = AssignClusterToNewOffer(AskForOffer(), modelDict)
    targetCluster = newOffer["cluster"][0]
    sameCluster = offersWithCluster[offersWithCluster["cluster"] == targetCluster]
    ranked = CalculateDistances(sameCluster, newOffer)
    return ranked["id"].iloc[0:numberOfSimilarOffers]

def ShowSimilarOffers(df, ids):
    '''
    Show in the screen the information about the provided offers.
    Offers are printed in the order of the provided ids; ids that are not
    present in the dataframe are skipped.
    Args:
    - df: Dataframe with offers (columns "id", "name", "requirement" and "studies").
    - ids: List of offer ids to show.
    Returns:
    - Nothing.
    '''
    separator = "------------------------------------------"
    print(separator)
    offersShow = df.loc[df["id"].isin(ids), ]
    for offerId in ids:
        # Build the mask from the filtered dataframe itself instead of relying
        # on the alignment of a mask computed over the whole data set.
        offersId = offersShow.loc[offersShow["id"] == offerId, ]
        if offersId.empty:
            continue
        # An offer may span several rows: name and studies come from the first
        # one, while every row contributes its requirement.
        name = offersId["name"].iloc[0]
        requirements = offersId["requirement"]
        studies = offersId["studies"].iloc[0]
        print("")
        print("Offer name:")
        # str() keeps the report working when a value is missing or not text.
        print(" " + str(name))
        print("Required skills: ")
        for i in requirements:
            print(" " + str(i))
        print("Required level of studies:")
        print(" " + str(studies))
        print("")
        print(separator)

# Data retrieval

In [None]:
# Load both data sets from the paths configured in the settings.
skills = ReadSkills(FILE_SKILLS)
offers = ReadOffers(FILE_OFFERS)

# Model training

In [None]:
# Train on the skills data set. "model" is the dictionary returned by
# TrainClusteringModel: the fitted KMeans plus the cleaning patterns and the
# categories needed to featurize new offers in the same way.
model = TrainClusteringModel(df = skills,
                             patternsBeginning = PATTERNS_BEGINNING,
                             patternsEnd = PATTERNS_END,
                             numberOfCategories = NUMBER_OF_CATEGORIES,
                             numberOfClusters = NUMBER_OF_CLUSTERS)

# Application

## Assign a cluster to each of the offers in the pool

In [None]:
# Featurize every offer and label it with the cluster predicted by the model.
offersWithCluster = AssignClusterToOffers(offers, model)

## Find similar offers to a given one

In [None]:
# Interactive step: asks for the skills of a new offer, then prints the closest
# offers found in the same cluster.
similarOffers = FindSimilarOffers(offersWithCluster, model, NUMBER_OF_SIMILAR_OFFERS)
ShowSimilarOffers(offers, similarOffers)

# Problems

- Some offers are never shown.
- Some skills are not captured so the model does not find suitable offers.

I would get better results by comparing the new offer with the offers used to train the model. When comparing with offers totally unseen by the training algorithm, the results are not bad.

Also, training the model with more categories and more clusters gives better results, but it requires more computational capacity and time.

# More things to do!

- Improve model performance.
- Do feature engineering to find better variables.
- Test results with the unseen offers.
- Brainstorming with team.