In [39]:
# 0) 라이브러리 불러오기
import pandas as pd
#
import numpy as np

# sklearn(사이킷런) - 머신러닝 라이브러리

from sklearn.model_selection import train_test_split
# 분할 - 데이터셋을 8:2 또는 7:3으로 나눈 뒤, 큰 쪽으로 기계학습을 하고 작은 쪽으로 스스로 검사(정확도 판별)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [40]:
# 1) Load the raw job-postings CSV into a DataFrame.
# NOTE(review): "/content/..." is an absolute path (looks like a Colab upload
# location — confirm); this cell fails on a fresh kernel if the file is absent.

df = pd.read_csv("/content/ai_job_market.csv")

# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0,job_id,company_name,industry,job_title,skills_required,experience_level,employment_type,location,salary_range_usd,posted_date,company_size,tools_preferred
0,1,Foster and Sons,Healthcare,Data Analyst,"NumPy, Reinforcement Learning, PyTorch, Scikit...",Mid,Full-time,"Tracybury, AR",92860-109598,2025-08-20,Large,"KDB+, LangChain"
1,2,"Boyd, Myers and Ramirez",Tech,Computer Vision Engineer,"Scikit-learn, CUDA, SQL, Pandas",Senior,Full-time,"Lake Scott, CU",78523-144875,2024-03-22,Large,"FastAPI, KDB+, TensorFlow"
2,3,King Inc,Tech,Quant Researcher,"MLflow, FastAPI, Azure, PyTorch, SQL, GCP",Entry,Full-time,"East Paige, CM",124496-217204,2025-09-18,Large,"BigQuery, PyTorch, Scikit-learn"
3,4,"Cooper, Archer and Lynch",Tech,AI Product Manager,"Scikit-learn, C++, Pandas, LangChain, AWS, R",Mid,Full-time,"Perezview, FI",50908-123743,2024-05-08,Large,"TensorFlow, BigQuery, MLflow"
4,5,Hall LLC,Finance,Data Scientist,"Excel, Keras, SQL, Hugging Face",Senior,Contract,"North Desireeland, NE",98694-135413,2025-02-24,Large,"PyTorch, LangChain"


In [41]:
# 2) Keep only the columns used in the analysis.
#    - target   : salary_range_usd (only the leading number of the range is
#                 used, treated as the starting salary)
#    - features : industry, job_title, skills_required, experience_level,
#                 employment_type, location (country part only), posted_date,
#                 company_size
use_cols = [
    "salary_range_usd",
    "industry",
    "job_title",
    "skills_required",
    "experience_level",
    "employment_type",
    "location",
    "posted_date",
    "company_size",
]
# .copy() detaches the selection from the frame that was read from disk.
df = df.loc[:, use_cols].copy()

In [42]:
# Preview the frame after restricting it to the selected columns.
df.head()

Unnamed: 0,salary_range_usd,industry,job_title,skills_required,experience_level,employment_type,location,posted_date,company_size
0,92860-109598,Healthcare,Data Analyst,"NumPy, Reinforcement Learning, PyTorch, Scikit...",Mid,Full-time,"Tracybury, AR",2025-08-20,Large
1,78523-144875,Tech,Computer Vision Engineer,"Scikit-learn, CUDA, SQL, Pandas",Senior,Full-time,"Lake Scott, CU",2024-03-22,Large
2,124496-217204,Tech,Quant Researcher,"MLflow, FastAPI, Azure, PyTorch, SQL, GCP",Entry,Full-time,"East Paige, CM",2025-09-18,Large
3,50908-123743,Tech,AI Product Manager,"Scikit-learn, C++, Pandas, LangChain, AWS, R",Mid,Full-time,"Perezview, FI",2024-05-08,Large
4,98694-135413,Finance,Data Scientist,"Excel, Keras, SQL, Hugging Face",Senior,Contract,"North Desireeland, NE",2025-02-24,Large


In [43]:
# 전처리 작업

In [44]:
# 3) 결측치 처리 (현재는 없으므로 패스)

In [45]:
# Show per-column non-null counts and dtypes (used here to check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   salary_range_usd  2000 non-null   object
 1   industry          2000 non-null   object
 2   job_title         2000 non-null   object
 3   skills_required   2000 non-null   object
 4   experience_level  2000 non-null   object
 5   employment_type   2000 non-null   object
 6   location          2000 non-null   object
 7   posted_date       2000 non-null   object
 8   company_size      2000 non-null   object
dtypes: object(9)
memory usage: 140.8+ KB


In [46]:
# One-hot encode experience_level.
# drop_first=True omits the first level's column, so that level becomes the
# all-False baseline (avoids the dummy-variable trap).
experience_level_dum = pd.get_dummies(
    df["experience_level"],
    prefix="experience_level",
    drop_first=True,
)
# Swap the original text column for its indicator columns (appended on the right).
df = df.drop(columns=["experience_level"]).join(experience_level_dum)

In [47]:
# Display the frame to check the new experience_level_* indicator columns.
df

Unnamed: 0,salary_range_usd,industry,job_title,skills_required,employment_type,location,posted_date,company_size,experience_level_Mid,experience_level_Senior
0,92860-109598,Healthcare,Data Analyst,"NumPy, Reinforcement Learning, PyTorch, Scikit...",Full-time,"Tracybury, AR",2025-08-20,Large,True,False
1,78523-144875,Tech,Computer Vision Engineer,"Scikit-learn, CUDA, SQL, Pandas",Full-time,"Lake Scott, CU",2024-03-22,Large,False,True
2,124496-217204,Tech,Quant Researcher,"MLflow, FastAPI, Azure, PyTorch, SQL, GCP",Full-time,"East Paige, CM",2025-09-18,Large,False,False
3,50908-123743,Tech,AI Product Manager,"Scikit-learn, C++, Pandas, LangChain, AWS, R",Full-time,"Perezview, FI",2024-05-08,Large,True,False
4,98694-135413,Finance,Data Scientist,"Excel, Keras, SQL, Hugging Face",Contract,"North Desireeland, NE",2025-02-24,Large,False,True
...,...,...,...,...,...,...,...,...,...,...
1995,90382-110126,Finance,NLP Engineer,"Flask, FastAPI, Power BI",Internship,"Washingtonmouth, SD",2024-04-22,Large,False,True
1996,47848-137195,Automotive,AI Product Manager,"R, Flask, Excel, C++, CUDA, Scikit-learn",Remote,"Joshuafort, ZA",2023-12-02,Large,True,False
1997,134994-180108,Education,Data Analyst,"Hugging Face, Excel, Scikit-learn, R, MLflow",Contract,"West Brittanyburgh, CG",2023-10-29,Large,False,False
1998,62388-82539,Education,Quant Researcher,"AWS, Python, Scikit-learn",Contract,"Anthonyshire, OM",2024-08-10,Large,False,True


In [48]:
def _one_hot_drop_first(frame, column):
    """Replace ``column`` in ``frame`` with its one-hot indicator columns.

    The first level is dropped (``drop_first=True``) so it becomes the
    baseline and the indicators are not perfectly collinear.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to encode; also used as the prefix of
        the generated indicator columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``frame`` without ``column`` and with the indicators appended on the
        right, and the indicator block on its own.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``frame``.
    """
    dummies = pd.get_dummies(frame[column], prefix=column, drop_first=True)
    encoded = pd.concat([frame.drop(columns=[column]), dummies], axis=1)
    return encoded, dummies


# One-hot encode the remaining low-cardinality categoricals with the same
# recipe. The order of the calls fixes the order of the appended columns.
df, employment_type_dum = _one_hot_drop_first(df, "employment_type")
df, company_size_dum = _one_hot_drop_first(df, "company_size")
df, industry_dum = _one_hot_drop_first(df, "industry")
df, job_title_dum = _one_hot_drop_first(df, "job_title")

In [49]:
# Display the frame after encoding employment_type, company_size, industry and job_title.
df

Unnamed: 0,salary_range_usd,skills_required,location,posted_date,experience_level_Mid,experience_level_Senior,employment_type_Full-time,employment_type_Internship,employment_type_Remote,company_size_Mid,...,industry_Healthcare,industry_Retail,industry_Tech,job_title_AI Researcher,job_title_Computer Vision Engineer,job_title_Data Analyst,job_title_Data Scientist,job_title_ML Engineer,job_title_NLP Engineer,job_title_Quant Researcher
0,92860-109598,"NumPy, Reinforcement Learning, PyTorch, Scikit...","Tracybury, AR",2025-08-20,True,False,True,False,False,False,...,True,False,False,False,False,True,False,False,False,False
1,78523-144875,"Scikit-learn, CUDA, SQL, Pandas","Lake Scott, CU",2024-03-22,False,True,True,False,False,False,...,False,False,True,False,True,False,False,False,False,False
2,124496-217204,"MLflow, FastAPI, Azure, PyTorch, SQL, GCP","East Paige, CM",2025-09-18,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,50908-123743,"Scikit-learn, C++, Pandas, LangChain, AWS, R","Perezview, FI",2024-05-08,True,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,98694-135413,"Excel, Keras, SQL, Hugging Face","North Desireeland, NE",2025-02-24,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,90382-110126,"Flask, FastAPI, Power BI","Washingtonmouth, SD",2024-04-22,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
1996,47848-137195,"R, Flask, Excel, C++, CUDA, Scikit-learn","Joshuafort, ZA",2023-12-02,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1997,134994-180108,"Hugging Face, Excel, Scikit-learn, R, MLflow","West Brittanyburgh, CG",2023-10-29,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1998,62388-82539,"AWS, Python, Scikit-learn","Anthonyshire, OM",2024-08-10,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [50]:
# Multi-hot encode the comma-separated skills_required strings,
# e.g. "Python, SQL, Excel" -> Python=1, SQL=1, Excel=1, every other skill 0.

# 1) Split each string into a list of trimmed, non-empty skill names.
#    Missing values are treated as "no skills".
df['skills_list'] = df['skills_required'].fillna('').apply(
    lambda x: [s.strip() for s in x.split(',') if s.strip() != '']
)

# 2) Collect the distinct skill names in sorted order.
#    A set comprehension visits every entry once; sum(list_of_lists, [])
#    copies the growing accumulator for each row (quadratic in total length).
all_skills = sorted({skill for skills in df['skills_list'] for skill in skills})

print(f"총 {len(all_skills)}개 스킬 발견")

# 3) Add one 0/1 indicator column per skill.
#    NOTE(review): the columns are named after the bare skill (no prefix), so
#    a skill whose name equals an existing column would overwrite it.
for skill in all_skills:
    df[skill] = df['skills_list'].apply(lambda skills: int(skill in skills))

# 4) Drop the original string column and the temporary list column.
df = df.drop(columns=['skills_required', 'skills_list'])

# 5) Check the result.
print(df[all_skills].head())

총 22개 스킬 발견
   AWS  Azure  C++  CUDA  Excel  FastAPI  Flask  GCP  Hugging Face  Keras  \
0    0      0    0     0      0        1      0    1             0      0   
1    0      0    0     1      0        0      0    0             0      0   
2    0      1    0     0      0        1      0    1             0      0   
3    1      0    1     0      0        0      0    0             0      0   
4    0      0    0     0      1        0      0    0             1      1   

   ...  NumPy  Pandas  Power BI  PyTorch  Python  R  Reinforcement Learning  \
0  ...      1       0         0        1       0  0                       1   
1  ...      0       1         0        0       0  0                       0   
2  ...      0       0         0        1       0  0                       0   
3  ...      0       1         0        0       0  1                       0   
4  ...      0       0         0        0       0  0                       0   

   SQL  Scikit-learn  TensorFlow  
0    0         

In [51]:
# Display the frame to check the per-skill 0/1 columns.
df

Unnamed: 0,salary_range_usd,location,posted_date,experience_level_Mid,experience_level_Senior,employment_type_Full-time,employment_type_Internship,employment_type_Remote,company_size_Mid,company_size_Startup,...,NumPy,Pandas,Power BI,PyTorch,Python,R,Reinforcement Learning,SQL,Scikit-learn,TensorFlow
0,92860-109598,"Tracybury, AR",2025-08-20,True,False,True,False,False,False,False,...,1,0,0,1,0,0,1,0,1,0
1,78523-144875,"Lake Scott, CU",2024-03-22,False,True,True,False,False,False,False,...,0,1,0,0,0,0,0,1,1,0
2,124496-217204,"East Paige, CM",2025-09-18,False,False,True,False,False,False,False,...,0,0,0,1,0,0,0,1,0,0
3,50908-123743,"Perezview, FI",2024-05-08,True,False,True,False,False,False,False,...,0,1,0,0,0,1,0,0,1,0
4,98694-135413,"North Desireeland, NE",2025-02-24,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,90382-110126,"Washingtonmouth, SD",2024-04-22,False,True,False,True,False,False,False,...,0,0,1,0,0,0,0,0,0,0
1996,47848-137195,"Joshuafort, ZA",2023-12-02,True,False,False,False,True,False,False,...,0,0,0,0,0,1,0,0,1,0
1997,134994-180108,"West Brittanyburgh, CG",2023-10-29,False,False,False,False,False,False,False,...,0,0,0,0,0,1,0,0,1,0
1998,62388-82539,"Anthonyshire, OM",2024-08-10,False,True,False,False,False,False,False,...,0,0,0,0,1,0,0,0,1,0


In [52]:
# Parse the posting date; values that cannot be parsed become NaT instead of raising.
posted = pd.to_datetime(df['posted_date'], errors='coerce')

# Keep the year and month as numeric features and discard the date column itself.
df = df.drop(columns=['posted_date']).assign(
    posted_year=posted.dt.year,
    posted_month=posted.dt.month,
)

In [53]:
# Display the frame to check the posted_year / posted_month columns.
df

Unnamed: 0,salary_range_usd,location,experience_level_Mid,experience_level_Senior,employment_type_Full-time,employment_type_Internship,employment_type_Remote,company_size_Mid,company_size_Startup,industry_E-commerce,...,Power BI,PyTorch,Python,R,Reinforcement Learning,SQL,Scikit-learn,TensorFlow,posted_year,posted_month
0,92860-109598,"Tracybury, AR",True,False,True,False,False,False,False,False,...,0,1,0,0,1,0,1,0,2025,8
1,78523-144875,"Lake Scott, CU",False,True,True,False,False,False,False,False,...,0,0,0,0,0,1,1,0,2024,3
2,124496-217204,"East Paige, CM",False,False,True,False,False,False,False,False,...,0,1,0,0,0,1,0,0,2025,9
3,50908-123743,"Perezview, FI",True,False,True,False,False,False,False,False,...,0,0,0,1,0,0,1,0,2024,5
4,98694-135413,"North Desireeland, NE",False,True,False,False,False,False,False,False,...,0,0,0,0,0,1,0,0,2025,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,90382-110126,"Washingtonmouth, SD",False,True,False,True,False,False,False,False,...,1,0,0,0,0,0,0,0,2024,4
1996,47848-137195,"Joshuafort, ZA",True,False,False,False,True,False,False,False,...,0,0,0,1,0,0,1,0,2023,12
1997,134994-180108,"West Brittanyburgh, CG",False,False,False,False,False,False,False,False,...,0,0,0,1,0,0,1,0,2023,10
1998,62388-82539,"Anthonyshire, OM",False,True,False,False,False,False,False,False,...,0,0,1,0,0,0,1,0,2024,8


In [54]:
# Reduce each "low-high" salary string to its lower bound.
# partition('-') yields the text before the first '-', or the whole string
# when there is no '-'.
lower_bound = df['salary_range_usd'].astype(str).map(
    lambda text: text.partition('-')[0].strip()
)

# Convert to numbers; anything that is not numeric becomes NaN.
df['salary_range_usd'] = pd.to_numeric(lower_bound, errors='coerce')

# Check the result.
print(df[['salary_range_usd']].head(10))

   salary_range_usd
0             92860
1             78523
2            124496
3             50908
4             98694
5             92632
6             70575
7             63032
8            134239
9             79361


In [55]:
# Display the frame to check that salary_range_usd is now a single number.
df

Unnamed: 0,salary_range_usd,location,experience_level_Mid,experience_level_Senior,employment_type_Full-time,employment_type_Internship,employment_type_Remote,company_size_Mid,company_size_Startup,industry_E-commerce,...,Power BI,PyTorch,Python,R,Reinforcement Learning,SQL,Scikit-learn,TensorFlow,posted_year,posted_month
0,92860,"Tracybury, AR",True,False,True,False,False,False,False,False,...,0,1,0,0,1,0,1,0,2025,8
1,78523,"Lake Scott, CU",False,True,True,False,False,False,False,False,...,0,0,0,0,0,1,1,0,2024,3
2,124496,"East Paige, CM",False,False,True,False,False,False,False,False,...,0,1,0,0,0,1,0,0,2025,9
3,50908,"Perezview, FI",True,False,True,False,False,False,False,False,...,0,0,0,1,0,0,1,0,2024,5
4,98694,"North Desireeland, NE",False,True,False,False,False,False,False,False,...,0,0,0,0,0,1,0,0,2025,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,90382,"Washingtonmouth, SD",False,True,False,True,False,False,False,False,...,1,0,0,0,0,0,0,0,2024,4
1996,47848,"Joshuafort, ZA",True,False,False,False,True,False,False,False,...,0,0,0,1,0,0,1,0,2023,12
1997,134994,"West Brittanyburgh, CG",False,False,False,False,False,False,False,False,...,0,0,0,1,0,0,1,0,2023,10
1998,62388,"Anthonyshire, OM",False,True,False,False,False,False,False,False,...,0,0,1,0,0,0,1,0,2024,8


In [60]:
# Take the text after the last comma of location as the country code;
# a location without any comma gets NaN.
locations = df['location'].astype(str)
df['country_code'] = locations.map(
    lambda loc: loc.rsplit(',', 1)[-1].strip() if ',' in loc else np.nan
)

print(df[['location', 'country_code']].head(10))

# Drop the free-text location and expand the code into one indicator column
# per country (no level is dropped here).
df = pd.get_dummies(
    df.drop(columns=['location']),
    columns=['country_code'],
    prefix='country',
)

                location country_code
0          Tracybury, AR           AR
1         Lake Scott, CU           CU
2         East Paige, CM           CM
3          Perezview, FI           FI
4  North Desireeland, NE           NE
5        South Kevin, TZ           TZ
6         West Shawn, LR           LR
7        Port Hailey, RU           RU
8        Butlermouth, GB           GB
9        Nicoleshire, BA           BA


In [61]:
# Display the frame to check the country_* indicator columns.
df

Unnamed: 0,salary_range_usd,experience_level_Mid,experience_level_Senior,employment_type_Full-time,employment_type_Internship,employment_type_Remote,company_size_Mid,company_size_Startup,industry_E-commerce,industry_Education,...,country_VA,country_VC,country_VE,country_VN,country_VU,country_WS,country_YE,country_ZA,country_ZM,country_ZW
0,92860,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,78523,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,124496,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,50908,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,98694,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,90382,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1996,47848,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1997,134994,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1998,62388,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [64]:
# Summarise the fully encoded frame (column count, dtypes, memory), then preview it.
# NOTE(review): this cell's execution count (64) is higher than that of the
# CSV-export cell below it (63), so the cells were run out of order — confirm
# the notebook still works with Restart Kernel -> Run All.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 630 entries, salary_range_usd to country_ZW
dtypes: bool(605), int32(2), int64(23)
memory usage: 1.5 MB


Unnamed: 0,salary_range_usd,experience_level_Mid,experience_level_Senior,employment_type_Full-time,employment_type_Internship,employment_type_Remote,company_size_Mid,company_size_Startup,industry_E-commerce,industry_Education,...,country_VA,country_VC,country_VE,country_VN,country_VU,country_WS,country_YE,country_ZA,country_ZM,country_ZW
0,92860,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,78523,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,124496,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,50908,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,98694,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [63]:
# Save the current df to a CSV file in the working directory, without the index column.
# The "utf-8-sig" codec writes a UTF-8 byte-order mark at the start of the file.
df.to_csv("ai_job_market_preprocessed.csv", index=False, encoding="utf-8-sig")


In [65]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train a random-forest regressor on the encoded features and evaluate it on
# a 20% hold-out split.

# Separate features and target. The model is fit on log1p(salary); the
# original comment gives stabilisation as the reason for the transform.
X = df.drop(columns=['salary_range_usd'])
y = np.log1p(df['salary_range_usd'])

# 80/20 split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest using all available cores; seeded for repeatability.
rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Predictions are in log1p space, like y.
y_pred = rf.predict(X_test)

# Metrics in log1p space (the scale the model was trained on).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Back-transform with expm1 (the inverse of log1p) so the error can also be
# read in dollars; the log-space RMSE alone is not interpretable as USD.
rmse_usd = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))

print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")
print(f"RMSE (USD): {rmse_usd:,.0f}")


RMSE: 0.379
R²: -0.027
