<a href="https://colab.research.google.com/github/ghadirN/AI-Bootcamp/blob/main/AI_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI Bootcamp Project

---

**Import Libraries**

In [18]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

#for models:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

**Importing The Dataset**

In [None]:
import kagglehub

# Fetch the latest published version of the LinkedIn compatibility dataset.
# `path` is the local directory kagglehub reports for the downloaded files;
# the loading cell joins file names onto it.
path = kagglehub.dataset_download(
    "likithagedipudi/linkedin-compatibility-dataset-50k-profiles"
)

print("Path to dataset files:", path)
#from google.colab import drive
#drive.mount('/content/drive')

Downloading from https://www.kaggle.com/api/v1/datasets/download/likithagedipudi/linkedin-compatibility-dataset-50k-profiles?dataset_version_number=1...


100%|██████████| 278M/278M [00:02<00:00, 110MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/likithagedipudi/linkedin-compatibility-dataset-50k-profiles/versions/1


In [None]:
# Profiles table: build its location inside the dataset directory, then read it
profiles_path = os.path.join(path, "profiles.csv")
profiles = pd.read_csv(profiles_path)

# Pairwise compatibility table: same directory, separate CSV file
pairs_path = os.path.join(path, "compatibility_pairs.csv")
pairs = pd.read_csv(pairs_path)

print("Datasets loaded sucessfully from Kaggle!")

Datasets loaded sucessfully from Kaggle!


In [None]:
#import pandas as pd

#profiles = pd.read_csv(
 #   '/content/drive/MyDrive/ML_LinkedIn_project/profiles.csv'
#)

#pairs = pd.read_csv(
 #   '/content/drive/MyDrive/ML_LinkedIn_project/compatibility_pairs.csv'
#)

#print(profiles.shape)
#print(pairs.shape)


(50000, 20)
(1048575, 14)


**Check If Data Is Loaded**

In [None]:
# Sanity check: for each table print a label, its first five rows and its
# (rows, columns) shape. sep="\n" puts every item on its own line.
print("Profiles:", profiles.head(), profiles.shape, sep="\n")
print("\nPairs:", pairs.head(), pairs.shape, sep="\n")

print("Data is loaded sucessfully!")

Profiles:
                         profile_id           name  ... remote_preference     source
0  ab04b973af478550ddf247879393df42   Daniel Doyle  ...            remote  synthetic
1  b620e3fa2ec361b1d728115eeabb71af  Jennifer Cole  ...            hybrid  synthetic
2  cfeeb31581a0b3e0515c01691b9dc2b5   Brent Abbott  ...            onsite  synthetic
3  5d54826665a5898662661a96719cc4a7    Corey Jones  ...            onsite  synthetic
4  6ad3c64c6cb4bac60b692f3d5bab271d   Timothy Wong  ...            remote  synthetic

[5 rows x 20 columns]
(50000, 20)

Pairs:
   skill_match_score  ...                      profile_b_id
0           0.000000  ...  fdf3243d1ad97255e0ce313aebc0be79
1           5.555556  ...  371bc2adbdc4ca8f0dd16d373f85f2ae
2           5.555556  ...  d18e7cd91fc4e621fd6879ca5ef6e1b2
3           8.000000  ...  6615dabdd3b5b8f9627ca933dc3d9ae3
4           7.142857  ...  fc73d0e790dea954d20db176f638ab86

[5 rows x 14 columns]
(4999890, 14)
Data is loaded sucessfully!


**Pre-Merge Cleaning**

In [None]:
# Profile attributes retained for modeling; all other profile columns are dropped
cols_to_keep_pro = [
    'profile_id',
    'industry',
    'years_experience',
    'seniority_level',
    'location',
    'skills',
]
profiles = profiles.loc[:, cols_to_keep_pro].copy()

# Pair-level columns retained: both profile keys, the pairwise scores and
# compatibility_score
cols_to_keep_comp = [
    'profile_a_id',
    'profile_b_id',
    'skill_match_score',
    'career_alignment_score',
    'experience_gap',
    'seniority_match',
    'compatibility_score',
]
pairs = pairs.loc[:, cols_to_keep_comp].copy()

print("Columns selected!")

Columns selected!


In [None]:
# Handle nulls and standardize text in one step:
#  - years_experience: missing values are replaced with the column median
#  - industry / location: lower-cased, surrounding whitespace removed
profiles = profiles.assign(
    years_experience=profiles['years_experience'].fillna(
        profiles['years_experience'].median()
    ),
    industry=profiles['industry'].str.lower().str.strip(),
    location=profiles['location'].str.lower().str.strip(),
)

print("Profiles cleaned! Shape: ", profiles.shape)

# Discard any pair whose target (compatibility_score) is missing
pairs = pairs.dropna(subset=['compatibility_score'])

Profiles cleaned! Shape:  (50000, 6)


**The Merging Phase**

In [None]:
# First join: attach profile A's attributes by matching pairs.profile_a_id
# against profiles.profile_id
df_merged = pd.merge(
    pairs,
    profiles,
    how='inner',
    left_on='profile_a_id',
    right_on='profile_id',
)

# Second join: attach profile B's attributes the same way. Column names that
# now appear on both sides receive the _A / _B suffixes.
df_final = pd.merge(
    df_merged,
    profiles,
    how='inner',
    left_on='profile_b_id',
    right_on='profile_id',
    suffixes=('_A', '_B'),
)

# The suffixed profile_id copies duplicate profile_a_id / profile_b_id, so remove them
df_final = df_final.drop(columns=['profile_id_A', 'profile_id_B'])

print("Double Merge Successful! Shape:", df_final.shape)
print("Columns available now:", list(df_final.columns))

Double Merge Successful! Shape: (4999890, 17)
Columns available now: ['profile_a_id', 'profile_b_id', 'skill_match_score', 'career_alignment_score', 'experience_gap', 'seniority_match', 'compatibility_score', 'industry_A', 'years_experience_A', 'seniority_level_A', 'location_A', 'skills_A', 'industry_B', 'years_experience_B', 'seniority_level_B', 'location_B', 'skills_B']


**Post-Merging Phase**

In [None]:
# Drop every row that still has a missing value in any column, then add two
# binary flags (1 = both profiles share the value, 0 = they differ)
df_final = df_final.dropna().assign(
    same_industry=lambda frame: frame['industry_A'].eq(frame['industry_B']).astype(int),
    same_location=lambda frame: frame['location_A'].eq(frame['location_B']).astype(int),
)

#Skill overlap calculation
#Turn the two skill lists of a pair into one number: how many skills they share
def calculate_overlap(row):
  """Count the distinct skills that profile A and profile B have in common.

  Args:
    row: any object indexable by 'skills_A' and 'skills_B' (a DataFrame row
      or a plain dict). Each value is read as a comma-separated list of
      skills, optionally wrapped in square brackets and/or quotes.

  Returns:
    int: number of skills present in both lists, compared case-insensitively.
  """
  def _parse_skills(raw):
    # A missing value (None or float NaN) means "no skills", not a skill called "nan"
    if raw is None or (isinstance(raw, float) and raw != raw):
      return set()
    cleaned = str(raw).lower().replace('[', '').replace(']', '')
    # Strip whitespace and quote characters around every token so that
    # "python, sql" and "sql,python" compare equal, and drop empty tokens so
    # two empty lists do not count as sharing one skill.
    tokens = (token.strip(" \t\r\n'\"") for token in cleaned.split(','))
    return {token for token in tokens if token}

  return len(_parse_skills(row['skills_A']) & _parse_skills(row['skills_B']))

# Compute the overlap for every pair. Iterating over the two skill columns
# directly (and handing calculate_overlap a small dict) avoids
# DataFrame.apply(axis=1), which builds a full Series object for each row
# and is very slow on millions of rows. The resulting values are the same.
df_final['skills_overlap_count'] = [
    calculate_overlap({'skills_A': skills_a, 'skills_B': skills_b})
    for skills_a, skills_b in zip(df_final['skills_A'], df_final['skills_B'])
]

print("Features engineereg! Total rows for modeling:", len(df_final))

#

Features engineereg! Total rows for modeling: 4999890
