In [6]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib as mpl

df = pd.read_csv("ai_job_dataset.csv")


In [7]:
print(df.head())

    job_id              job_title  salary_usd salary_currency  \
0  AI00001  AI Research Scientist       90376             USD   
1  AI00002   AI Software Engineer       61895             USD   
2  AI00003          AI Specialist      152626             USD   
3  AI00004           NLP Engineer       80215             USD   
4  AI00005          AI Consultant       54624             EUR   

  experience_level employment_type company_location company_size  \
0               SE              CT            China            M   
1               EN              CT           Canada            M   
2               MI              FL      Switzerland            L   
3               SE              FL            India            M   
4               EN              PT           France            S   

  employee_residence  remote_ratio  \
0              China            50   
1            Ireland           100   
2        South Korea             0   
3              India            50   
4         

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

output_dir = "/mnt/data/final_proje_gorseller"
os.makedirs(output_dir, exist_ok=True)

plt.figure(figsize=(10, 5))
sns.histplot(df['salary_usd'], bins=40, kde=True)
plt.title("Maaş Dağılımı (USD)")
plt.xlabel("Maaş")
plt.ylabel("Frekans")
plt.tight_layout()
salary_path = os.path.join(output_dir, "1_maas_dagilimi.png")
plt.savefig(salary_path)
plt.close()

plt.figure(figsize=(8, 4))
sns.countplot(x='education_required', data=df, order=df['education_required'].value_counts().index)
plt.title("Eğitim Seviyesi Dağılımı")
plt.xlabel("Eğitim")
plt.ylabel("Adet")
plt.tight_layout()
edu_path = os.path.join(output_dir, "2_egitim_dagilimi.png")
plt.savefig(edu_path)
plt.close()

plt.figure(figsize=(8, 4))
sns.countplot(x='experience_level', data=df, order=df['experience_level'].value_counts().index)
plt.title("Deneyim Seviyesi Dağılımı")
plt.xlabel("Seviye")
plt.ylabel("Adet")
plt.tight_layout()
exp_path = os.path.join(output_dir, "3_deneyim_dagilimi.png")
plt.savefig(exp_path)
plt.close()

plt.figure(figsize=(12, 6))
industry_salary = df.groupby('industry')['salary_usd'].mean().sort_values(ascending=False).head(10)
sns.barplot(x=industry_salary.values, y=industry_salary.index, palette='magma')
plt.title("Endüstri Bazında Ortalama Maaş (İlk 10)")
plt.xlabel("Ortalama Maaş (USD)")
plt.ylabel("Endüstri")
plt.tight_layout()
industry_path = os.path.join(output_dir, "4_endustri_maas.png")
plt.savefig(industry_path)
plt.close()

salary_path, edu_path, exp_path, industry_path


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=industry_salary.values, y=industry_salary.index, palette='magma')


('/mnt/data/final_proje_gorseller\\1_maas_dagilimi.png',
 '/mnt/data/final_proje_gorseller\\2_egitim_dagilimi.png',
 '/mnt/data/final_proje_gorseller\\3_deneyim_dagilimi.png',
 '/mnt/data/final_proje_gorseller\\4_endustri_maas.png')

In [10]:


# salary_usd'nin ortalamasını al
salary_mean = df['salary_usd'].mean()

# Yeni hedef değişken: high_salary (1: yüksek maaş, 0: düşük maaş)
df['high_salary'] = df['salary_usd'].apply(lambda x: 1 if x >= salary_mean else 0)

# Sınıf dağılımını görselleştir
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(6, 4))
sns.countplot(x='high_salary', data=df)
plt.title("Yüksek/Düşük Maaş Dağılımı")
plt.xlabel("0 = Düşük Maaş, 1 = Yüksek Maaş")
plt.ylabel("Adet")
plt.tight_layout()

# Görseli kaydet
target_path = "/mnt/data/final_proje_gorseller/5_maas_etiket_dagilimi.png"
plt.savefig(target_path)
plt.close()

# Sınıf dağılımını döndür
df['high_salary'].value_counts(), target_path


(high_salary
 0    8995
 1    6005
 Name: count, dtype: int64,
 '/mnt/data/final_proje_gorseller/5_maas_etiket_dagilimi.png')

In [11]:
from sklearn.preprocessing import LabelEncoder

# Kullanılacak sütunlar
selected_features = [
    'experience_level',
    'employment_type',
    'company_location',
    'employee_residence',
    'remote_ratio',
    'education_required',
    'years_experience',
    'industry',
    'job_description_length',
    'benefits_score'
]

# Model için gerekli veri çerçevesi
df_model = df[selected_features + ['high_salary']].copy()

# Eksik verileri kaldır
df_model.dropna(inplace=True)

# Kategorik sütunları sayısal verilere çevirelim
label_enc_cols = ['experience_level', 'employment_type', 'company_location',
                  'employee_residence', 'education_required', 'industry']

le = LabelEncoder()
for col in label_enc_cols:
    df_model[col] = le.fit_transform(df_model[col])

# Veri şekline bakalım
df_model.head()


Unnamed: 0,experience_level,employment_type,company_location,employee_residence,remote_ratio,education_required,years_experience,industry,job_description_length,benefits_score,high_salary
0,3,0,3,3,50,1,9,0,1076,5.9,0
1,0,0,2,9,100,2,1,9,1268,5.2,0
2,2,1,17,15,0,0,2,2,1974,9.4,1
3,3,1,8,8,50,3,7,1,1345,8.6,0
4,0,3,6,14,100,2,0,9,1989,6.6,0


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Özellikler ve hedefi ayır
X = df_model.drop("high_salary", axis=1)
y = df_model["high_salary"]

# Veriyi eğitim ve test olarak ayır
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modeli oluştur ve eğit
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Test verisiyle tahmin yap
y_pred = rf_model.predict(X_test)

# Başarı skorlarını hesapla
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix görselini çiz ve kaydet
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Düşük", "Yüksek"], yticklabels=["Düşük", "Yüksek"])
plt.title("Confusion Matrix")
plt.xlabel("Tahmin Edilen")
plt.ylabel("Gerçek Değer")
conf_matrix_path = "/mnt/data/final_proje_gorseller/6_confusion_matrix.png"
plt.tight_layout()
plt.savefig(conf_matrix_path)
plt.close()

accuracy, report['0'], report['1'], conf_matrix_path


(0.9036666666666666,
 {'precision': 0.9166203446359088,
  'recall': 0.9222595078299777,
  'f1-score': 0.919431279620853,
  'support': 1788.0},
 {'precision': 0.884263114071607,
  'recall': 0.8762376237623762,
  'f1-score': 0.8802320762536262,
  'support': 1212.0},
 '/mnt/data/final_proje_gorseller/6_confusion_matrix.png')

In [None]:


# Özelliklerin önem derecelerini al
feature_importances = rf_model.feature_importances_
feature_names = X.columns

# DataFrame olarak sırala
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Görselleştir ve kaydet
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title("Modelin En Çok Dikkate Aldığı Özellikler")
plt.xlabel("Önem Düzeyi")
plt.ylabel("Özellik")
plt.tight_layout()
importance_path = "/mnt/data/final_proje_gorseller/7_feature_importance.png"
plt.savefig(importance_path)
plt.close()

importance_df, importance_path



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')


(                  Feature  Importance
 6        years_experience    0.383275
 0        experience_level    0.173730
 2        company_location    0.131528
 3      employee_residence    0.072778
 8  job_description_length    0.071768
 9          benefits_score    0.058691
 7                industry    0.044110
 1         employment_type    0.023262
 5      education_required    0.023087
 4            remote_ratio    0.017771,
 '/mnt/data/final_proje_gorseller/7_feature_importance.png')