In [1]:
import numpy as np
# Peek at the arrays stored in the saved feature matrix.
# The context manager closes the archive as soon as the names are printed: an
# NpzFile that is left open keeps a handle on the file, which can block the
# rewrite of this same path by a later cell (notably on Windows).
with np.load("text/career_features.npz") as data:
    print(data.files)


['indices', 'indptr', 'format', 'shape', 'data']


In [2]:
import pandas as pd
import joblib
from scipy.sparse import save_npz

# Fitted TF-IDF vectorizer (the original note says this is the one app.py uses)
vectorizer = joblib.load("text/tfidf_vectorizer.joblib")

# Cleaned job postings; the text column may be called "descriptions" or "description"
df = pd.read_csv("dataset/datacleanJobstreet.csv")
if "descriptions" in df.columns:
    desc_col = "descriptions"
else:
    desc_col = "description"

# Missing descriptions are vectorized as empty strings
X = vectorizer.transform(df[desc_col].fillna(""))

# Write the sparse matrix in scipy's own .npz layout
save_npz("text/career_features.npz", X)


In [None]:
index.html


<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
  <title>Career Compass | Smart Career Recommendations</title>
  <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap">
  <link rel="stylesheet" href="styles.css"> 
</head>
<body>
  <div class="container">
    <!-- HEADER -->
    <header class="header">
      <div class="logo" onclick="showHomePage()">
        <div class="logo-icon">PH</div>
        <div class="logo-text">PathWise</div>
      </div>
    </header>

    <!-- HOME PAGE -->
    <section id="home-page" class="page-section active">
      <section class="cta">
        <h2 class="cta-title">Start Your Career Journey Today</h2>
        <p class="cta-subtitle">Join thousands of professionals who have found their perfect career match with Career Compass.</p>
        <button class="btn btn-primary" onclick="showUploadPage()">Upload Your Resume</button>
      </section>
    </section>

    <!-- UPLOAD + PROFILE + RESULTS -->
    <section id="upload-page" class="page-section">

      <!-- Upload Section -->
      <div class="card fade-in" id="upload-section">
        <div class="card-header"><div class="card-title"><div class="card-title-icon">1</div>Upload Your Resume</div></div>
        <div class="card-body">
          <div class="upload-area" id="upload-area">
            <h3 class="upload-title">Drag & Drop Your Resume</h3>
            <p class="upload-subtitle">or click to browse files</p>
            <span class="upload-formats">PDF, DOCX, TXT</span>
            <input type="file" id="file-upload" accept=".pdf,.docx,.doc,.txt">
          </div>
          <div class="progress-container" id="progress-container">
            <div class="progress-bar"><div class="progress" id="progress-bar"></div></div>
            <div class="progress-status">
              <span id="progress-status">Processing resume...</span>
              <span id="progress-percentage">0%</span>
            </div>
          </div>
        </div>
      </div>

      <!-- Profile Section -->
      <div class="card fade-in" id="profile-section" style="display: none;">
        <div class="card-header"><div class="card-title"><div class="card-title-icon">2</div>Complete Your Profile</div></div>
        <div class="card-body">
          <!-- Tabs -->
          <div class="tabs">
            <div class="tab active" data-tab="personal">Personal Info</div>
            <div class="tab" data-tab="skills">Skills & Expertise</div>
            <div class="tab" data-tab="experience">Experience</div>
            <div class="tab" data-tab="education">Education</div>
          </div>

          <!-- Tab Contents -->
          <div class="tab-content active" id="personal-tab">
            <div class="form-grid">
              <input class="form-input" id="full-name" placeholder="Full Name">
              <input class="form-input" id="email" placeholder="Email">
              <input class="form-input" id="phone" placeholder="Phone">
              <input class="form-input" id="location" placeholder="Location">
            </div>
          </div>

          <div class="tab-content" id="skills-tab">
            <div class="skills-container" id="skills-container"></div>
            <div class="skill-input-container">
              <input type="text" id="new-skill" class="form-input" placeholder="Add a skill (e.g., Python)">
              <button class="btn btn-primary" id="add-skill-btn">Add</button>
            </div>
            <input type="text" id="languages" class="form-input" placeholder="Languages">
          </div>

          <div class="tab-content" id="experience-tab">
            <textarea id="work-experience" class="form-input" placeholder="Work Experience"></textarea>
            <select id="job-level" class="form-input">
              <option value="Entry">Entry</option>
              <option value="Mid">Mid</option>
              <option value="Senior">Senior</option>
            </select>
            <input type="hidden" id="predicted-level">
          </div>

          <div class="tab-content" id="education-tab">
            <textarea id="education" class="form-input" placeholder="Education"></textarea>
          </div>

          <div class="btn-group">
            <button class="btn btn-secondary" id="back-to-upload-btn">Back</button>
            <button class="btn btn-primary" id="get-recommendations-btn">Get Career Recommendations</button>
          </div>
        </div>
      </div>

      <!-- Loading Section -->
      <div class="loading" id="loading">
        <div class="spinner"></div>
        <div class="loading-text">Analyzing your profile with AI</div>
        <div class="loading-subtext">Using NLP, TF-IDF, and BERT to find your perfect match</div>
      </div>

      <!-- Results Section -->
      <div class="card fade-in" id="results" style="display: none;">
        <div class="card-header"><div class="card-title"><div class="card-title-icon">3</div>Recommended Career Paths</div></div>
        <div class="card-body">
          <div id="job-listings"></div>
          <div class="pagination-wrapper">
            <button class="btn pagination-btn" id="back-to-profile-btn">← Back to Profile</button>
            <div>
              <button class="btn pagination-btn" id="prev-page-btn">Previous</button>
              <button class="btn pagination-btn" id="next-page-btn">Next</button>
            </div>
          </div>
        </div>
      </div>

      <!-- Job Detail View -->
      <div id="job-detail-view" class="card fade-in" style="display: none;">
        <div class="card-header"><div class="card-title"><div class="card-title-icon">J</div><span id="detail-title">Job Title</span></div></div>
        <div class="card-body">
          <p><strong>Company:</strong> <span id="detail-company"></span></p>
          <p><strong>Location:</strong> <span id="detail-location"></span></p>
          <p><strong>Match:</strong> <span id="detail-match"></span>%</p>
          <hr><p><strong>Description:</strong></p><p id="detail-description"></p>
          <div style="text-align:right;"><button class="btn pagination-btn" id="back-to-list-btn">← Back to Job List</button></div>
        </div>
      </div>

    </section>
  </div>

  <!-- JavaScript -->
  <script>
    // Cached references to the main page elements, looked up once when the script loads.
    // NOTE(review): uploadArea is not referenced again in the visible script — the
    // drag-and-drop / click-to-browse behaviour the markup advertises is not wired up here.
    const uploadArea = document.getElementById('upload-area');
    const fileUpload = document.getElementById('file-upload');
    const profileSection = document.getElementById('profile-section');
    const uploadSection = document.getElementById('upload-section');
    const loadingIndicator = document.getElementById('loading');
    const resultsSection = document.getElementById('results');
    const skillsContainer = document.getElementById('skills-container');
    const jobListings = document.getElementById('job-listings');

    // Leave the landing view and activate the upload/profile/results view.
    function showUploadPage() {
      const homeView = document.getElementById("home-page");
      homeView.classList.remove("active");
      const uploadView = document.getElementById("upload-page");
      uploadView.classList.add("active");
    }

    // Return to the landing view (used by the header logo).
    function showHomePage() {
      const uploadView = document.getElementById("upload-page");
      uploadView.classList.remove("active");
      const homeView = document.getElementById("home-page");
      homeView.classList.add("active");
    }

    /**
     * Append a removable skill chip to the skills container.
     *
     * Blank values and skills that already have a chip are ignored. The chip is
     * built from DOM nodes and text, never from an HTML string, because the
     * skill text comes from user input and from the resume-extraction response
     * and must not be interpreted as markup.
     *
     * @param {*} skill - Skill name; converted to a string and trimmed.
     */
    function addSkillTag(skill) {
      const name = skill == null ? '' : String(skill).trim();
      if (!name) return;

      // Chip text is "<name> ×", so dropping the × recovers the stored name.
      const existing = Array.from(skillsContainer.querySelectorAll('.skill-tag'))
        .map(t => t.textContent.replace('×', '').trim());
      if (existing.includes(name)) return;

      const tag = document.createElement('div');
      tag.className = 'skill-tag';
      tag.appendChild(document.createTextNode(name + ' '));

      const remove = document.createElement('span');
      remove.className = 'skill-remove';
      remove.textContent = '×';
      remove.addEventListener('click', () => tag.remove());
      tag.appendChild(remove);

      skillsContainer.appendChild(tag);
    }

    // "Add" button: turn the typed text into a skill chip, then clear the field.
    document.getElementById('add-skill-btn').onclick = function () {
      const field = document.getElementById('new-skill');
      const typed = field.value.trim();
      if (!typed) return;
      addSkillTag(typed);
      field.value = '';
    };

    // Resume selected: upload it to /extract and pre-fill the profile form from the JSON reply.
    // NOTE(review): the Flask app shown alongside this page exposes /upload (form field "file")
    // and /predict only — confirm that an /extract endpoint accepting "resume" exists.
    fileUpload.onchange = e => {
      const file = e.target.files && e.target.files[0];
      if (!file) return;

      const formData = new FormData();
      formData.append('resume', file);

      // Accept either an array or a comma-separated string; return trimmed strings.
      const toSkillList = value => {
        const items = Array.isArray(value) ? value : String(value || '').split(',');
        return items.map(item => (item == null ? '' : String(item).trim()));
      };

      fetch('/extract', { method: 'POST', body: formData })
        .then(res => {
          // fetch() only rejects on network failure, so HTTP errors are surfaced here.
          if (!res.ok) throw new Error(`Resume extraction failed (HTTP ${res.status})`);
          return res.json();
        })
        .then(data => {
          const fill = (id, value) => { document.getElementById(id).value = value || ''; };
          fill('full-name', data.name);
          fill('email', data.email);
          fill('phone', data.phone);
          fill('location', data.location);
          fill('languages', data.languages);
          fill('work-experience', data.workExperience);
          fill('education', data.education);

          // The predicted level seeds both the editable select and the hidden field.
          const level = data.job_level || 'Entry';
          document.getElementById('job-level').value = level;
          document.getElementById('predicted-level').value = level;

          toSkillList(data.skills).forEach(skill => addSkillTag(skill));
          toSkillList(data.softSkills).forEach(skill => addSkillTag(skill));

          uploadSection.style.display = 'none';
          profileSection.style.display = 'block';
        })
        .catch(err => {
          console.error(err);
          // Clear the selection so choosing the same file again fires "change".
          fileUpload.value = '';
          alert("Failed to extract resume.");
        });
    };

    // "Get Career Recommendations": post the profile to /recommend and show the results.
    document.getElementById('get-recommendations-btn').onclick = () => {
      // Skills are sent as one space-separated string built from the chips.
      const skills = Array.from(skillsContainer.querySelectorAll('.skill-tag'))
        .map(t => t.textContent.replace('×', '').trim())
        .join(' ');
      const education = document.getElementById('education').value || '';
      const job_level = document.getElementById('job-level').value || '';
      const softSkills = '';  // always sent empty: soft skills are merged into the chips

      profileSection.style.display = 'none';
      loadingIndicator.style.display = 'block';

      fetch('/recommend', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ skills, softSkills, education, job_level })
      })
      .then(res => {
        // fetch() only rejects on network failure, so HTTP errors are surfaced here.
        if (!res.ok) throw new Error(`Recommendation request failed (HTTP ${res.status})`);
        return res.json();
      })
      .then(jobs => {
        // A fresh result list always starts on its first page.
        currentPage = 1;
        loadingIndicator.style.display = 'none';
        resultsSection.style.display = 'block';
        displayJobRecommendations(jobs);
      })
      .catch(err => {
        console.error(err);
        // Put the user back on the form instead of leaving the spinner up.
        loadingIndicator.style.display = 'none';
        resultsSection.style.display = 'none';
        profileSection.style.display = 'block';
        alert("Recommendation failed.");
      });
    };

    // Back buttons: each hides the card being left and re-shows the previous one.
    document.getElementById('back-to-upload-btn').onclick = function () {
      uploadSection.style.display = 'block';
      profileSection.style.display = 'none';
    };

    document.getElementById('back-to-profile-btn').onclick = function () {
      profileSection.style.display = 'block';
      resultsSection.style.display = 'none';
    };

    // Leaving the detail card also scrolls back to the top of the list.
    document.getElementById('back-to-list-btn').onclick = function () {
      const detailView = document.getElementById('job-detail-view');
      detailView.style.display = 'none';
      resultsSection.style.display = 'block';
      window.scrollTo({ behavior: 'smooth', top: 0 });
    };

    // Pager state: 1-based page currently shown and the number of cards per page.
    let currentPage = 1;
    const jobsPerPage = 10;

    /**
     * Enable or disable the Previous/Next buttons for the current page.
     *
     * Range comparisons are used instead of equality so that an empty result
     * list (zero pages) disables "Next" as well as "Previous".
     *
     * @param {number} totalJobs - Number of jobs in the full (unpaged) list.
     */
    function createPaginationControls(totalJobs) {
      const totalPages = Math.ceil(totalJobs / jobsPerPage);
      document.getElementById('prev-page-btn').disabled = currentPage <= 1;
      document.getElementById('next-page-btn').disabled = currentPage >= totalPages;
    }

    // Previous: step back one page unless already on the first page.
    document.getElementById('prev-page-btn').onclick = function () {
      if (currentPage <= 1) return;
      currentPage -= 1;
      displayJobRecommendations(window.recommendedJobs);
    };

    // Next: step forward one page unless already on the last page.
    document.getElementById('next-page-btn').onclick = function () {
      const lastPage = Math.ceil(window.recommendedJobs.length / jobsPerPage);
      if (currentPage >= lastPage) return;
      currentPage += 1;
      displayJobRecommendations(window.recommendedJobs);
    };

    /**
     * Render the current page of job recommendations as cards.
     *
     * The list is de-duplicated (entries with identical JSON serialisation are
     * collapsed), stored on window.recommendedJobs for the pager and the detail
     * view, and the Previous/Next buttons are refreshed. Cards are built with
     * DOM nodes and textContent so job fields returned by the server are shown
     * as text and never parsed as HTML.
     *
     * @param {Array<Object>} jobs - Recommendation objects; job_title, company
     *   and match_percent are displayed. Anything that is not an array (for
     *   example an error object) is treated as an empty list.
     */
    function displayJobRecommendations(jobs) {
      const source = Array.isArray(jobs) ? jobs : [];
      // Explicit callbacks: passing JSON.stringify/JSON.parse straight to map()
      // would also hand them the element index as a second argument.
      const uniqueJobs = Array.from(new Set(source.map(job => JSON.stringify(job))))
        .map(text => JSON.parse(text));
      window.recommendedJobs = uniqueJobs;

      // Keep the page number inside the valid range for this list.
      const totalPages = Math.max(1, Math.ceil(uniqueJobs.length / jobsPerPage));
      if (currentPage > totalPages) currentPage = totalPages;
      if (currentPage < 1) currentPage = 1;

      const start = (currentPage - 1) * jobsPerPage;
      const paginatedJobs = uniqueJobs.slice(start, start + jobsPerPage);

      jobListings.innerHTML = '';

      if (uniqueJobs.length === 0) {
        const empty = document.createElement('p');
        empty.textContent = 'No matching jobs found.';
        jobListings.appendChild(empty);
      }

      paginatedJobs.forEach((job, index) => {
        const card = document.createElement('div');
        card.className = 'job-card-grid';

        const addLine = (className, text) => {
          const line = document.createElement('div');
          line.className = className;
          line.textContent = text;
          card.appendChild(line);
        };
        addLine('job-title', job.job_title);
        addLine('job-company', job.company);
        addLine('job-match-badge', `${job.match_percent}% Match`);

        const actions = document.createElement('div');
        actions.className = 'btn-group';
        const detailsButton = document.createElement('button');
        detailsButton.className = 'btn btn-secondary';
        detailsButton.textContent = 'View Details';
        // start + index is the job's position in the full de-duplicated list.
        detailsButton.addEventListener('click', () => showJobDetail(start + index));
        actions.appendChild(detailsButton);
        card.appendChild(actions);

        jobListings.appendChild(card);
      });

      createPaginationControls(uniqueJobs.length);
    }

    /**
     * Show the detail card for one recommended job.
     *
     * @param {number} index - Position of the job in window.recommendedJobs.
     */
    function showJobDetail(index) {
      const allJobs = window.recommendedJobs;
      const job = allJobs ? allJobs[index] : undefined;
      if (!job) {
        alert("Job details not available.");
        return;
      }

      // Element id -> value shown in it; written as text, in this order.
      const fields = [
        ['detail-title', job.job_title],
        ['detail-company', job.company],
        ['detail-location', job.location],
        ['detail-match', job.match_percent],
        ['detail-description', job.description],
      ];
      for (const [id, value] of fields) {
        document.getElementById(id).textContent = value;
      }

      resultsSection.style.display = 'none';
      document.getElementById('job-detail-view').style.display = 'block';
      window.scrollTo({ behavior: 'smooth', top: 0 });
    }

    // Profile tabs: clicking a tab marks it and the panel named "<data-tab>-tab"
    // as active and deactivates every other tab and panel.
    document.querySelectorAll(".tab").forEach(function (clicked) {
      clicked.addEventListener("click", function () {
        for (const other of document.querySelectorAll(".tab")) {
          other.classList.remove("active");
        }
        for (const panel of document.querySelectorAll(".tab-content")) {
          panel.classList.remove("active");
        }
        clicked.classList.add("active");
        const panelId = `${clicked.getAttribute("data-tab")}-tab`;
        document.getElementById(panelId).classList.add("active");
      });
    });
  </script>
</body>
</html>


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import joblib
import scipy.sparse
from scipy.sparse import hstack, csr_matrix, save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from spacy.matcher import PhraseMatcher
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK & spaCy setup
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

# Load dataset.
# Read from the project-relative dataset/ folder — the same folder this cell
# later writes datacleanJobstreet.csv to — instead of a machine-specific
# absolute path, so the notebook runs from any checkout of the project.
career_data = pd.read_csv("dataset/jobstreet.csv")

print(f"job data shape: {career_data.shape}")
display(career_data.head())

# Clean text columns

# Drop every row that has a missing value in ANY column
career_data = career_data.dropna()

# Remove newlines
career_data = career_data.replace(r'\n', ' ', regex=True)

# Remove special characters for specific columns.
# NOTE(review): this also deletes the commas in "location" and symbols such as
# "+" and "#" in "descriptions" (e.g. "c++" becomes "c") — confirm that is intended.
columns_to_clean = ["job_title", "company", "descriptions", "location", "category", "subcategory", "type"]
career_data[columns_to_clean] = career_data[columns_to_clean].replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Date and Time (YYYY-MM-DD HH:MM:SS):
career_data["listingDate"] = pd.to_datetime(career_data["listingDate"]).dt.strftime("%Y-%m-%d %H:%M:%S")

# Location Cleaning
def clean_location(loc):
    """Normalise a location value to lowercase letters, commas and single spaces.

    Args:
        loc: A location string, or a list/tuple of parts that is joined with
            ", ". Any other type is treated as missing.

    Returns:
        The cleaned, lowercased location, or "unknown" when the input is
        missing, blank, or contains nothing but removed characters.
    """
    if isinstance(loc, (list, tuple)):
        # str() each part so a stray non-string item cannot break the join
        loc = ", ".join(str(part) for part in loc)
    if not isinstance(loc, str):
        return "unknown"
    loc = re.sub(r'[^a-zA-Z,\s]', '', loc)
    # Trim AFTER removing characters: deleting e.g. a trailing postcode leaves
    # whitespace behind, which would otherwise make "x " and "x" distinct values.
    loc = re.sub(r'\s+', ' ', loc).strip().lower()
    return loc or "unknown"

# Normalise every location value in place
career_data["location"] = career_data["location"].map(clean_location)

print(f"job data shape: {career_data.shape}")
display(career_data.head())

def extract_salary_range(salary_str):
    """Parse a salary range such as "RM 3,000 – RM 4,000 per month".

    Args:
        salary_str: Raw salary text; non-string values are treated as missing.

    Returns:
        ``(min_salary, max_salary)`` as floats when the text splits into
        exactly two numeric parts on an en dash (or, failing that, a hyphen);
        otherwise ``(None, None)``.
    """
    if not isinstance(salary_str, str):
        return None, None

    cleaned = salary_str.replace('RM', '').replace('per month', '').strip()
    cleaned = re.sub(r'\xa0', ' ', cleaned)  # non-breaking spaces -> plain spaces

    separator = '–' if '–' in cleaned else '-'
    parts = cleaned.split(separator)
    if len(parts) != 2:
        return None, None

    try:
        low, high = [float(part.replace(',', '').strip()) for part in parts]
    except ValueError:
        return None, None
    return low, high

# Split the parsed (min, max) pairs into two columns
salary_bounds = career_data["salary"].apply(extract_salary_range)
career_data["salary_min"], career_data["salary_max"] = zip(*salary_bounds)


def _lowercase_if_str(value):
    """Lowercase string cells; return every other value unchanged."""
    return value.lower() if isinstance(value, str) else value


# Lowercase all text columns
career_data = career_data.applymap(_lowercase_if_str)

# Tokenization, Stopwords Removal, Lemmatization
def lemmatize_text(words):
    """Lemmatize a list of tokens with the spaCy pipeline ``nlp``.

    The tokens are joined with spaces and parsed as one document; the lemmas
    of the tokens that are purely alphabetic and not spaCy stop words are
    returned as a list.
    """
    doc = nlp(' '.join(words))
    lemmas = []
    for token in doc:
        if token.is_alpha and not token.is_stop:
            lemmas.append(token.lemma_)
    return lemmas

def _normalize_description(raw):
    """Tokenize, drop NLTK stop words, lemmatize and re-join one description."""
    tokens = re.findall(r'\b\w+\b', raw) if isinstance(raw, str) else []
    kept = [word for word in tokens if word not in stop_words]
    return ' '.join(lemmatize_text(kept))


# Replace each description with its normalized form
career_data["descriptions"] = career_data["descriptions"].apply(_normalize_description)

print(f"job data shape: {career_data.shape}")
display(career_data.head())

# 3. NER (General Entities)
def get_entities_as_text(text):
    """Return the spaCy entities found in ``text`` as one display string.

    Each entity is rendered as "text (LABEL)" and the entries are joined with
    ", ". When spaCy finds no entity the string "No entities found" is returned.
    """
    labelled = []
    for ent in nlp(text).ents:
        labelled.append(f"{ent.text} ({ent.label_})")
    if not labelled:
        return "No entities found"
    return ", ".join(labelled)

# General entities for every (already normalized) description
career_data["NER_Entities"] = career_data["descriptions"].map(get_entities_as_text)

display(career_data[["descriptions", "NER_Entities"]])

# Skill NER (custom PhraseMatcher)
# One skill per line; blank lines are skipped and entries are lowercased.
with open("full_skills_list.txt", "r") as f:
    skills = [entry.lower() for entry in (line.strip() for line in f) if entry]

# Every skill becomes a phrase pattern registered under the "SKILLS" key
matcher = PhraseMatcher(nlp.vocab)
patterns = list(map(nlp, skills))
matcher.add("SKILLS", None, *patterns)

def extract_skills(text):
    """Return the distinct skill phrases that ``matcher`` finds in ``text``.

    The text is lowercased before parsing; the result is a list of the matched
    spans' text with duplicates removed (order is not meaningful).
    """
    doc = nlp(text.lower())
    found = set()
    for _match_id, start, end in matcher(doc):
        found.add(doc[start:end].text)
    return list(found)

career_data["skills_extracted"] = career_data["descriptions"].apply(extract_skills)

career_data[["category", "subcategory"]] = career_data[["category", "subcategory"]].fillna("unknown")

display(career_data[["descriptions", "skills_extracted"]])

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000)
job_desc_tfidf = vectorizer.fit_transform(career_data["descriptions"])

# One-hot encode category, subcategory AND location.
# Location used to enter the matrix as a single LabelEncoder integer column.
# Those integers are arbitrary (alphabetical) codes, so they implied an ordering
# between places and, being far larger than TF-IDF weights, dominated any
# distance or similarity computed on the combined matrix.
encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
categorical_features = encoder.fit_transform(career_data[["category", "subcategory", "location"]])

# The integer location code is still stored as a dataframe column (it is written
# to the cleaned CSV), but it is no longer part of the feature matrix.
label_encoder = LabelEncoder()
career_data["location_encoded"] = label_encoder.fit_transform(career_data["location"])
location_feature = career_data[["location_encoded"]]

# Combine features
final_features = hstack([job_desc_tfidf, categorical_features])
print("✅ Feature matrix shape:", final_features.shape)

# Save outputs.
# NOTE(review): other cells of this notebook load "text/career_features.npz" and
# "text/tfidf_vectorizer.joblib"; these files are written to the working
# directory — confirm how they are meant to reach text/.
save_npz("career_features.npz", final_features)
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
save_npz("job_tfidf_matrix.npz", job_desc_tfidf)

print("✅ TF-IDF model and matrix saved.")


# Whole-word keyword patterns (an optional plural "s" is allowed). Matching whole
# words stops "international"/"internal" from counting as "intern" and
# "leading"/"misleading" from counting as "lead".
_ENTRY_LEVEL_RE = re.compile(r"\b(?:intern|internship|trainee|junior|entry|assistant)s?\b")
_SENIOR_LEVEL_RE = re.compile(r"\b(?:senior|lead|manager)s?\b")


def classify_job_level(description):
    """Assign a seniority bucket to a job description by keyword.

    Entry-level keywords are checked first, so a text that contains both kinds
    (e.g. "assistant manager") is classified as entry level.

    Args:
        description: Job description text; non-string values (such as NaN) are
            treated as empty text.

    Returns:
        "Entry-Level", "Senior-Level", or "Mid-Level" when no keyword matches.
    """
    text = description.lower() if isinstance(description, str) else ""
    if _ENTRY_LEVEL_RE.search(text):
        return "Entry-Level"
    if _SENIOR_LEVEL_RE.search(text):
        return "Senior-Level"
    return "Mid-Level"

# Derive a seniority label for every posting from its description text
career_data["job_level"] = career_data["descriptions"].map(classify_job_level)

# Print the results
print(career_data[["descriptions", "job_level"]].head())

# Encode the job_level labels (this rebinds label_encoder to a fresh encoder)
label_encoder = LabelEncoder()
career_data["job_level_encoded"] = label_encoder.fit_transform(career_data["job_level"])
level_codes = label_encoder.transform(label_encoder.classes_)
print("🎯 Job level mapping:", dict(zip(label_encoder.classes_, level_codes)))

# Visualization
plt.figure(figsize=(8, 6))
sns.countplot(data=career_data, x='job_level', palette='Set3')
plt.title("Job Level Distribution")
plt.show()

print("✅ All preprocessing completed.")

# Save cleaned dataset
career_data.to_csv("dataset/datacleanJobstreet.csv", index=False)


In [None]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.optim.lr_scheduler import StepLR
import time
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import pandas as pd

# Cleaned postings produced by the preprocessing cell
career_data = pd.read_csv("dataset/datacleanJobstreet.csv")

# Check class distribution
class_counts = Counter(career_data['job_level_encoded'])
print("Original class distribution:", class_counts)

# Feature and label separation.
# astype(str) also turns missing descriptions into the literal text "nan".
y = career_data["job_level_encoded"]
X = career_data["descriptions"].astype(str)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define custom Dataset for BERT
class CareerDataset(Dataset):
    """Torch dataset that tokenizes job descriptions on access.

    Args:
        descriptions: pandas Series of description texts; missing values are
            replaced with "No description available".
        labels: pandas Series of integer class ids, aligned by position.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BertTokenizer).
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels
        self.descriptions = descriptions.fillna("No description available")

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, item):
        # Positional lookup, so the Series index does not have to be 0..n-1
        text = str(self.descriptions.iloc[item])
        target = self.labels.iloc[item]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns (1, max_len) tensors; flatten drops the batch axis
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def run_bert_undersampling(X, y, train_size, test_size, tokenizer, max_len=128, batch_size=16,
                           epochs=5, save_dir=None, seed=42):
    """Fine-tune bert-base-uncased on an undersampled training split and report test metrics.

    The data is split with stratification, the training part is balanced with
    RandomUnderSampler, a BertForSequenceClassification model is trained, and
    accuracy, the classification report and the confusion matrix for the
    untouched test part are printed.

    Args:
        X: pandas Series of description texts.
        y: pandas Series of integer class labels aligned with ``X``.
        train_size: Fraction (or count) of samples for training.
        test_size: Fraction (or count) of samples for testing.
        tokenizer: Tokenizer passed to CareerDataset.
        max_len: Token length each sample is padded/truncated to.
        batch_size: Batch size for both data loaders.
        epochs: Number of training epochs.
        save_dir: Optional directory; when given, the fine-tuned model and the
            tokenizer are saved there with ``save_pretrained`` so they can be
            reloaded with ``from_pretrained``. Nothing is saved by default.
        seed: Seed for the split, the undersampler and torch's RNG.

    Returns:
        None. Results are printed.
    """
    # Seed torch too: the classifier head initialisation and the shuffling of
    # the training loader are otherwise different on every run.
    torch.manual_seed(seed)

    # Split with stratify to maintain class distribution in splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, random_state=seed, stratify=y)

    # Undersample training data (the sampler needs a 2-D feature array)
    rus = RandomUnderSampler(random_state=seed)
    X_train_array = X_train.values.reshape(-1, 1)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train_array, y_train)

    # Convert back to pandas Series for the Dataset
    X_train_resampled = pd.Series(X_train_resampled.flatten())
    y_train_resampled = pd.Series(y_train_resampled)

    # Create Dataset and DataLoader for training and testing
    train_dataset = CareerDataset(X_train_resampled, y_train_resampled, tokenizer, max_len)
    test_dataset = CareerDataset(X_test, y_test, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Load pretrained BERT model for classification
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(y.unique()))
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that class's defaults so optimisation is unchanged.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
    # NOTE(review): the learning rate is divided by 10 after every epoch, so it
    # is 1e-9 by the fifth epoch — confirm this schedule is intended.
    scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        scheduler.step()
        print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation on test set
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = torch.argmax(outputs.logits, dim=1)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    print("\nUndersampling Results:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

    # Persist the fine-tuned weights on request; without this the trained model
    # is lost when the function returns.
    if save_dir is not None:
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)

# Train and evaluate on a 90/10 split with an undersampled training set
run_bert_undersampling(
    X,
    y,
    train_size=0.9,
    test_size=0.1,
    tokenizer=tokenizer,
)

app.py

In [None]:
from flask import Flask, request, jsonify
import pandas as pd
import torch
import spacy
from spacy.matcher import PhraseMatcher
from transformers import BertTokenizer, BertForSequenceClassification
import fitz  # PyMuPDF
import csv
import os

app = Flask(__name__)

# === Load spaCy and skill matcher ===
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

# One skill per line; blank lines are skipped and entries are lowercased.
with open("full_skills_list.txt", "r") as f:
    skills = [entry.lower() for entry in (line.strip() for line in f) if entry]

# Every skill becomes a phrase pattern registered under the "SKILLS" key
patterns = list(map(nlp, skills))
matcher.add("SKILLS", None, *patterns)

def extract_skills(text):
    """Return the distinct skill phrases that ``matcher`` finds in ``text``.

    The text is lowercased before parsing; the result is a list of the matched
    spans' text with duplicates removed (order is not meaningful).
    """
    doc = nlp(text.lower())
    found = set()
    for _match_id, start, end in matcher(doc):
        found.add(doc[start:end].text)
    return list(found)

# === Load job data ===
jobs = pd.read_csv("dataset/datacleanJobstreet.csv")

# === Load best performing BERT model ===
# Tokenizer and classifier are both read from the same saved directory;
# eval() switches the model to inference mode.
model_path = "bert_model_90/final_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Class id -> human-readable level name
label_map = dict(enumerate(("Entry-Level", "Mid-Level", "Senior-Level")))

def predict_job_level(text):
    """Classify ``text`` with the loaded BERT model and return the level name.

    The text is tokenized with truncation, the highest-scoring class id is
    looked up in ``label_map``, and "Unknown" is returned for an id that is
    not in the map.
    """
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits
    class_id = torch.argmax(logits, dim=1).item()
    if class_id in label_map:
        return label_map[class_id]
    return "Unknown"

# Lazily-built cache: one skill set per row of `jobs`, in row order. The job
# table is loaded once at start-up, so its skills only need extracting once.
_job_skill_sets = None


def _get_job_skill_sets():
    """Return the per-job skill sets, running spaCy over the jobs on first use only."""
    global _job_skill_sets
    if _job_skill_sets is None:
        # An empty description is read back from CSV as NaN; treat it as empty
        # text so extract_skills never receives a float.
        descriptions = jobs["descriptions"].fillna("").astype(str)
        _job_skill_sets = [set(extract_skills(text)) for text in descriptions]
    return _job_skill_sets


def match_jobs(resume_skills):
    """Return the five jobs that share the most skills with a resume.

    Args:
        resume_skills: Iterable of skill strings extracted from the resume.

    Returns:
        Up to five dicts with the keys "job_title", "company", "location" and
        "match_score" (number of shared skills), best match first. Jobs that
        share no skill are left out.
    """
    wanted = set(resume_skills)
    if not wanted:
        return []

    results = []
    for (_, row), job_skills in zip(jobs.iterrows(), _get_job_skill_sets()):
        match_score = len(wanted & job_skills)
        if match_score > 0:
            results.append({
                "job_title": row["job_title"],
                "company": row["company"],
                "location": row["location"],
                "match_score": match_score
            })
    results.sort(key=lambda x: x["match_score"], reverse=True)
    return results[:5]

@app.route("/predict", methods=["POST"])
def predict():
    """POST /predict: classify resume text sent in the form field "resume_text".

    Responds with the predicted level, the skills found in the text and the
    top job matches, or with HTTP 400 when the field is missing or empty.
    """
    resume_text = request.form.get("resume_text")
    if resume_text:
        level = predict_job_level(resume_text)
        found_skills = extract_skills(resume_text)
        payload = {
            "predicted_level": level,
            "resume_skills": found_skills,
            "top_matches": match_jobs(found_skills),
        }
        return jsonify(payload)

    return jsonify({"error": "Missing resume_text"}), 400

@app.route("/upload", methods=["POST"])
def upload_pdf():
    """POST /upload: analyse a PDF resume sent in the multipart field "file".

    Extracts the text of every page, predicts the job level, extracts skills,
    finds the top job matches, appends one row to logs/prediction_logs.csv and
    returns the results as JSON. Responds with HTTP 400 when no file is sent,
    no file is selected, or the upload cannot be read as a PDF.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No file selected"}), 400

    # Extract text from PDF. A corrupt or non-PDF upload is a client error, not
    # a server crash, and the document is always closed once the text is read.
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        try:
            resume_text = " ".join([page.get_text() for page in doc])
        finally:
            doc.close()
    except (RuntimeError, ValueError):
        return jsonify({"error": "Could not read the uploaded file as a PDF"}), 400

    # Predict and extract
    predicted_level = predict_job_level(resume_text)
    extracted_skills = extract_skills(resume_text)
    matched_jobs = match_jobs(extracted_skills)

    # Save to CSV log (one row per upload: filename, level, skills)
    os.makedirs("logs", exist_ok=True)
    with open("logs/prediction_logs.csv", "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([file.filename, predicted_level, ", ".join(extracted_skills)])

    return jsonify({
        "predicted_level": predicted_level,
        "resume_skills": extracted_skills,
        "top_matches": matched_jobs
    })

if __name__ == "__main__":
    # Debug mode enables the interactive Werkzeug debugger, which lets anyone who
    # can reach the server execute code, so it is opt-in: set FLASK_DEBUG=1
    # (or true/yes/on) for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)
