In [80]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from xgboost import XGBRanker

In [45]:
df = pd.read_csv("G:\My Drive\projects\data_science\working\dataset\student_profiles_updated.csv")
df.head()

  df = pd.read_csv("G:\My Drive\projects\data_science\working\dataset\student_profiles_updated.csv")


Unnamed: 0,Name,10th marks,12th course,12th marks,grad course,grad stream,grad marks,skills,languages known,internships taken,additional certifications,DOB,Category
0,YOVITHA,62.9,,,,,,,"Punjabi, English, Hindi",,"Certificate in IFRS, Oracle Java Certification",2001-05-21,MBC
1,SUKUNNAN,93.4,Arts,85.0,BCom,Electrical Engineering,71.8,"Excel, Communication","Urdu, Hindi, Tamil",,,2004-08-18,BC
2,SHOORSEN,65.6,Arts,69.4,BTech,Mechanical Engineering,65.2,,,,,2003-06-30,BC
3,AADALALAGAN,76.2,Commerce,69.4,,,,,"Tamil, Gujarati",,,2001-07-25,BC
4,SKANDAPRASAD,86.2,Science,89.7,,,,,,Finance Intern at Local Firm,,2002-10-06,MBC


In [10]:
"""Internship Title:
Data Science Intern – Predictive Analytics

Requirements:

Must be a final-year B.Tech / B.Sc in Computer Science, IT, or related field

Knowledge of Python, Machine Learning, and Data Analysis

Familiarity with pandas, NumPy, and basic statistics

Understanding of data visualization tools (Matplotlib / Seaborn)

Bonus if you know SQL

Languages Preferred:

English (must)

Hindi or any regional language is a plus

Additional Notes:

Prior internships or certifications in Data Science will be considered an advantage

Location preference: Remote"""

skills = "Python, Machine Learning, Data Analysis, Pandas, Numpy, basic statistics, matplotlib, seaborn, SQL, Data Science"
mandatory_languages = "English"
accepted_languages = "Hindi"
needed_course = ["BTech","BSc", "BE"]
needed_stream = ["Computer Science", "IT", "AI-DS"]

In [46]:
df_filtered = df[df['grad course'].isin(needed_course) & df['grad stream'].isin(needed_stream)]
df_filtered = df_filtered[df_filtered['languages known'].str.contains(mandatory_languages, case=False, na=False)]

In [48]:
df_filtered.shape
#we have 190 rows

(190, 13)

In [49]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df_filtered["skills"] = df_filtered["skills"].fillna("")
candidate_embeddings = model.encode(df_filtered["skills"].tolist(), convert_to_tensor=True)
internship_embeddings = model.encode(skills, convert_to_tensor=True)
similarity_matrix = util.cos_sim(candidate_embeddings, internship_embeddings)
df_filtered["similarity_score"] = similarity_matrix.squeeze().tolist()
df_sorted = df_filtered.sort_values(by="similarity_score", ascending=False)
print(df_sorted[["Name", "skills", "similarity_score"]])

              Name            skills  similarity_score
43851  CHANDRABHAN  Java, Python, ML          0.523768
15153     THIYARAA            Python          0.459777
47502      NAESIKA            Python          0.459777
7431     SHARRVESH            Python          0.459777
30572  POIYAAMOLLI        ML, Python          0.447746
...            ...               ...               ...
13393   ELAYAKUMAR                            0.002421
13414     SRIKUMAR                            0.002421
39897     KAARIKAI   Public speaking         -0.019873
1388    SHREESANTH   Public speaking         -0.019873
11873       AMULYA   Public speaking         -0.019873

[190 rows x 3 columns]


In [50]:
df_filtered["internships taken"] = df_filtered["internships taken"].fillna("")
candidate_embeddings = model.encode(df_filtered["internships taken"].tolist(), convert_to_tensor=True)
internship_embeddings = model.encode(skills, convert_to_tensor=True)
similarity_matrix = util.cos_sim(candidate_embeddings, internship_embeddings)
df_filtered["intern_similarity_score"] = similarity_matrix.squeeze().tolist()

In [51]:
df_filtered["additional certifications"] = df_filtered["additional certifications"].fillna("")
candidate_embeddings = model.encode(df_filtered["additional certifications"].tolist(), convert_to_tensor=True)
internship_embeddings = model.encode(skills, convert_to_tensor=True)
similarity_matrix = util.cos_sim(candidate_embeddings, internship_embeddings)
df_filtered["certif_similarity_score"] = similarity_matrix.squeeze().tolist()

In [53]:
df_filtered = pd.get_dummies(df_filtered, columns=["Category"], prefix="cat")

In [54]:
df_filtered.drop(columns=["Name","12th course", "grad course", "grad stream", "DOB","skills", "internships taken", "additional certifications", "languages known"], inplace = True)

In [57]:
#customary score I was talking about
df_filtered["final_score"] = (
    df_filtered["10th marks"] * 0.10 +
    df_filtered["12th marks"] * 0.15 +
    df_filtered["grad marks"] * 0.20 +
    df_filtered["similarity_score"] * 0.25 +
    df_filtered["intern_similarity_score"] * 0.15 +
    df_filtered["certif_similarity_score"] * 0.15
)

df_filtered = df_filtered.sort_values("final_score", ascending=False).reset_index(drop=True)

In [72]:
total_seats = 20
open_seats = 10

reservation_quota = {
    "SC": 0.15,
    "ST": 0.075,
    "BC": 0.27,    
    "MBC": 0.10    
}

selected_general = df_filtered.head(open_seats)

remaining_candidates = df_filtered.iloc[open_seats:]

selected_reserved = []
for category, quota in reservation_quota.items():
    seats_for_category = int(total_seats * quota)
    category_candidates = remaining_candidates[remaining_candidates[f"cat_{category}"]]
    category_selected = category_candidates.head(seats_for_category)
    selected_reserved.append(category_selected)
    remaining_candidates = remaining_candidates.drop(category_selected.index)

final_selected = pd.concat([selected_general] + selected_reserved).reset_index(drop=True)


In [73]:
df_filtered["selected"] = 0
df_filtered.loc[selected_general.index, "selected"] = 1
for cat_df in selected_reserved:
    df_filtered.loc[cat_df.index, "selected"] = 1
print(df_filtered[["final_score", "selected"]])

     final_score  selected
0      42.669882         1
1      42.274704         1
2      41.687603         1
3      41.374765         1
4      41.363637         1
..           ...       ...
185    29.176748         0
186    29.170762         0
187    29.137248         0
188    29.034142         0
189    28.864758         0

[190 rows x 2 columns]


In [86]:
df_filtered.head()

Unnamed: 0,10th marks,12th marks,grad marks,similarity_score,intern_similarity_score,certif_similarity_score,cat_BC,cat_MBC,cat_OC,cat_SC,cat_ST,final_score,selected,predicted_score
0,86.9,97.7,96.5,0.002421,0.076419,0.085429,False,False,True,False,False,42.669882,1,2.470048
1,89.7,93.7,96.0,0.098984,0.163963,0.002421,False,True,False,False,False,42.274704,1,2.362594
2,87.9,93.7,93.9,0.177142,0.002421,0.119695,False,True,False,False,False,41.687603,1,2.268287
3,92.8,84.6,96.9,0.002421,0.076419,0.084645,False,True,False,False,False,41.374765,1,2.160254
4,97.6,81.5,96.4,0.248542,0.163963,0.079379,True,False,False,False,False,41.363637,1,2.025344


In [87]:
X = df_filtered.drop(columns=["final_score", "selected", "cat_OC", "cat_BC", "cat_MBC", "cat_SC", "cat_ST"])
y = df_filtered["final_score"]
print(X.shape, y.shape)
group = [X.shape[0]] 

(190, 7) (190,)


In [91]:
model = XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    random_state=42
)

model.fit(
    X, 
    y, 
    group=group
)

In [92]:
predicted_scores = model.predict(X)
df_filtered["predicted_score"] = predicted_scores
df_filtered_sorted = df_filtered.sort_values("predicted_score", ascending=False)

In [93]:
df_filtered_sorted.head(20)

Unnamed: 0,10th marks,12th marks,grad marks,similarity_score,intern_similarity_score,certif_similarity_score,cat_BC,cat_MBC,cat_OC,cat_SC,cat_ST,final_score,selected,predicted_score
0,86.9,97.7,96.5,0.002421,0.076419,0.085429,False,False,True,False,False,42.669882,1,2.36973
1,89.7,93.7,96.0,0.098984,0.163963,0.002421,False,True,False,False,False,42.274704,1,2.328938
2,87.9,93.7,93.9,0.177142,0.002421,0.119695,False,True,False,False,False,41.687603,1,2.261112
3,92.8,84.6,96.9,0.002421,0.076419,0.084645,False,True,False,False,False,41.374765,1,2.142753
4,97.6,81.5,96.4,0.248542,0.163963,0.079379,True,False,False,False,False,41.363637,1,2.027412
5,87.5,89.1,96.2,-0.019873,0.002421,-0.003568,True,False,False,False,False,41.34986,1,1.91811
6,89.6,89.1,92.6,-0.019873,0.002421,0.10274,False,False,False,True,False,40.855806,1,1.804209
7,84.1,96.8,88.0,0.392382,0.136296,0.002421,True,False,False,False,False,40.648903,1,1.710572
8,79.1,92.1,94.5,0.002421,0.115951,0.002421,True,False,False,False,False,40.643361,1,1.569245
9,85.7,84.4,94.9,0.133043,0.204078,0.078171,False,True,False,False,False,40.285598,1,1.470006
