In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import KNNBasic
from surprise import accuracy
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def calculate_similarity(test_section,test_description,rating):
    if test_section is None or pd.isnull(test_description):
       return 0
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(
       [test_section,test_description])
    similarity = (tfidf_matrix * 
                  tfidf_matrix.T).A[0, 1] * rating
    return similarity

In [3]:
jobs_df = pd.read_csv('jobs.csv',sep='\t')
ratings_df = pd.read_csv('ratings_section.csv')
sections_df = pd.read_csv('section.csv')


all_combinations = pd.MultiIndex.from_product(
    [sections_df['SectionId'], jobs_df['JobId']], 
    names=['SectionId', 'JobId'])
all_combinations_df = pd.DataFrame(
    index=all_combinations).reset_index()


merged_df = all_combinations_df.merge(
    ratings_df, on='SectionId', how='left')
merged_df = merged_df.merge(
    sections_df, left_on='SectionId',
    right_on='SectionId', how='left')
merged_df = merged_df.merge(
    jobs_df, left_on='JobId', right_on='JobId', how='left')
merged_df['DevelopmentPercentage'].fillna(0, inplace=True)


reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_df[['SectionName', 
    'Description', 'DevelopmentPercentage']], reader)
trainset, testset = train_test_split(
    data,test_size=0.2, random_state=42)
knn_model = KNNBasic(sim_options={'name': 'cosine',
    'user_based': False})
knn_model.fit(trainset)
content_model = SVD()
content_model.fit(trainset)
predictions = []
max_rating = merged_df[
    'DevelopmentPercentage'].max()
min_rating = merged_df[
    'DevelopmentPercentage'].min()

for test_section, test_description, test_rating in testset:
    knn_pred = knn_model.predict(test_section,
                        test_description, test_rating).est
    content_pred = content_model.predict(test_section, 
                        test_description,test_rating).est
    similarity_pred_content = calculate_similarity(test_section,test_description,content_pred)
    similarity_pred_knn = calculate_similarity(test_section,test_description,knn_pred)
    similarity_hybrid_pred = min((similarity_pred_content + similarity_pred_knn) / 2, 1.0)
    similarity_hybrid_pred = round(similarity_hybrid_pred, 1)
    section_rating = merged_df.loc[merged_df['SectionName'] == test_section,'DevelopmentPercentage'].iloc[0]
    normalized_rating = (section_rating - min_rating) / (max_rating - min_rating)
    similarity_hybrid_pred *= normalized_rating
    predictions.append((test_section, test_description, test_rating, similarity_hybrid_pred))

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [None]:
def truncate_text(text, max_words=50):
    words = text.split()
    if len(words) > max_words:
        return ' '.join(words[:max_words]) + '...'
    return text

In [6]:

df_predictions = pd.DataFrame(predictions, columns=['SectionName', 'Description', 'DevelopmentPercentage','Similarity'])
recommendations = merged_df[['JobId', 'JobName', 'URL', 'Location', 'Date', 'Company','Description']].merge(df_predictions, on='Description')
recommendations = recommendations.sort_values('Similarity', ascending=False)[['JobName','Location','Date', 'Company', 'Similarity']]
recommendations = recommendations.drop_duplicates(subset=['JobName'])
recommendations = recommendations.loc[recommendations['Similarity'] != 0.0]
top_10_recommendations = recommendations.head(10).copy()



styles = [
    dict(selector="th", props=[("font-size", "12pt"), ("text-align", "center"), ("font-weight", "bold"), ("color", "#6d6d6d"), ("background-color", "#f7f7f9")]),
    dict(selector="td", props=[("font-size", "11pt"), ("text-align", "left"), ("color", "#6d6d6d")]),
    dict(selector="tr:hover", props=[("background-color", "#f5f5f5")]),
    dict(selector="caption", props=[("caption-side", "bottom")])
]

# Aplicar estilo y mostrar la tabla
styled_table = (top_10_recommendations.style
                .set_table_styles(styles)
                .set_caption("Top 10 Recomendaciones de Trabajo")
                .format({'similarity_pred': '{:.2f}'})
                .hide_index())

# Mostrar la tabla
display(HTML(styled_table.to_html()))

JobName,Location,Date,Company,Similarity
Data Governance Advanced,"La Molina, Lima, Peru",1 week ago,Yape,0.22
Desarrollador full stack,"Lima, Peru",2 weeks ago,Tata Consultancy Services,0.176
Asistente de Analítica - Tarjeta W,"Lima, Lima, Peru",1 day ago,Corporación E. Wong,0.176
Analista Programador JAVA FullStack,"Miraflores, Lima, Peru",19 hours ago,Ventura Soluciones,0.176
Desarrollador Web (Especialista en WIX),"Lima, Lima, Peru",2 weeks ago,Altimea,0.176
Programador Android Nativo,"Magdalena del Mar, Lima, Peru",1 day ago,Retail Custom Solutions Perú,0.133333
Programador full stack,"San Isidro, Lima, Peru",1 day ago,Voila Agency,0.132
Fullstack Java Developer Senior,"Lima, Lima, Peru",1 day ago,VASS LATAM,0.132
[A] Fullstack Java Developer Senior,"Lima, Lima, Peru",2 days ago,VASS LATAM,0.132
Desarrollador Mobile IOS- Android Semi senior,"Miraflores, Lima, Peru",2 days ago,EVOL (TSnet),0.132
