In [1]:
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib

In [2]:
random.seed(42)

In [3]:
df = pd.read_csv('skills.csv')

In [4]:
X = df['skills']
y = df['profession']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
vectorizer = TfidfVectorizer(token_pattern=r'[^,]+', lowercase=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [7]:
model = RandomForestClassifier(
    n_estimators=100,    
    max_depth=25,        
    random_state=42,
    n_jobs=-1           
)

In [8]:
model.fit(X_train_tfidf, y_train)

In [9]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

In [10]:
accuracy

0.947

In [11]:
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [12]:
user_input = "HTML, CSS"

# 3. –û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º —Ç–µ–∫—Å—Ç –∏ –¥–µ–ª–∞–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
X_input = vectorizer.transform([user_input])
predicted_profession = model.predict(X_input)[0]

print(f"–ü–æ–¥—Ö–æ–¥—è—â–∞—è –ø—Ä–æ—Ñ–µ—Å—Å–∏—è: {predicted_profession}")

–ü–æ–¥—Ö–æ–¥—è—â–∞—è –ø—Ä–æ—Ñ–µ—Å—Å–∏—è: –§—Ä–æ–Ω—Ç–µ–Ω–¥-—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫


In [13]:
def smart_predict(skills_text, top1_threshold=0.3, min_prob_threshold=0.027):
    # –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è
    X_vec = vectorizer.transform([skills_text])
    
    # –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π
    proba = model.predict_proba(X_vec)[0]
    classes = model.classes_
    
    # –°–æ—Ä—Ç–∏—Ä—É–µ–º –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏ –ø–æ —É–±—ã–≤–∞–Ω–∏—é
    top_indices = proba.argsort()[::-1]
    
    top1_prob = proba[top_indices[0]]
    
    if top1_prob >= top1_threshold:
        # –ï—Å–ª–∏ –º–æ–¥–µ–ª—å –æ—á–µ–Ω—å —É–≤–µ—Ä–µ–Ω–∞ ‚Äî –æ—Ç–¥–∞–µ–º —Ç–æ–ª—å–∫–æ –æ–¥–Ω—É –ø—Ä–æ—Ñ–µ—Å—Å–∏—é
        result = [(classes[top_indices[0]], round(top1_prob * 100, 2))]
    else:
        # –ò–Ω–∞—á–µ –æ—Ç–¥–∞–µ–º –≤—Å–µ –ø—Ä–æ—Ñ–µ—Å—Å–∏–∏ —Å –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å—é –≤—ã—à–µ –º–∏–Ω–∏–º–∞–ª—å–Ω–æ–≥–æ –ø–æ—Ä–æ–≥–∞
        result = []
        for idx in top_indices:
            if proba[idx] >= min_prob_threshold:
                result.append((classes[idx], round(proba[idx] * 100, 2)))
                
    if not result:
        result.append((classes[top_indices[0]], round(proba[top_indices[0]] * 100, 2)))
        result.append((classes[top_indices[1]], round(proba[top_indices[0]] * 100, 2)))
        result.append((classes[top_indices[2]], round(proba[top_indices[0]] * 100, 2)))
    
    return result

# ----- –ü—Ä–∏–º–µ—Ä –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è -----

# –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π –≤–≤–æ–¥ –Ω–∞–≤—ã–∫–æ–≤
user_skills = "Python, –ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ, –ê–Ω–∞–ª–∏–∑ –¥–∞–Ω–Ω—ã—Ö, –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞, –†–∞–±–æ—Ç–∞ –≤ –∫–æ–º–∞–Ω–¥–µ, –ö—Ä–µ–∞—Ç–∏–≤–Ω–æ—Å—Ç—å"

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
predictions = smart_predict(user_skills)

print("\nüéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤–≤–µ–¥–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:")
for profession, probability in predictions:
    print(f"- {profession} ({probability}%)")


üéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤–≤–µ–¥–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:
- –ò–Ω–∂–µ–Ω–µ—Ä –º–∞—à–∏–Ω–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è (7.38%)
- Data Scientist (3.67%)
- –ü—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –º–µ–Ω–µ–¥–∂–µ—Ä (IT) (3.14%)


In [14]:
user_skills = "Python, –ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ, Pandas, –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞,"

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
predictions = smart_predict(user_skills)

print("\nüéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤–≤–µ–¥–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:")
for profession, probability in predictions:
    print(f"- {profession} ({probability}%)")


üéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤–≤–µ–¥–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:
- –ò–Ω–∂–µ–Ω–µ—Ä –º–∞—à–∏–Ω–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è (6.35%)
- Data Scientist (6.08%)
- –ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö (3.57%)


In [15]:
user_skills = "Kotlin, Android Studio, Jetpack Compose"

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
predictions = smart_predict(user_skills)

print("\nüéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤–≤–µ–¥–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:")
for profession, probability in predictions:
    print(f"- {profession} ({probability}%)")


üéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤–≤–µ–¥–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:
- –ú–æ–±–∏–ª—å–Ω—ã–π —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ (3.71%)


In [16]:
model