In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  ##多行显示

plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

pd.set_option('display.max_columns',50)

from scipy import stats
import time
import itertools

## 模型预测的
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

## 数据降维处理的
from sklearn import decomposition

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import StratifiedKFold,KFold

In [2]:
# Directory that holds the competition CSV files; every file name below is
# appended to this prefix.
path = "D://学习文件/个人/大三下/4 知识发现与数据挖掘/期末/比赛数据集/"

# Load the training and test splits from the same directory.
df_train, df_test = (
    pd.read_csv(path + file_name)
    for file_name in ("fake_job_postings_train.csv", "fake_job_postings_test.csv")
)

In [3]:
# Stack train on top of test and renumber the rows 0..n-1 so that positional
# and label-based indexing coincide in the cells below.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [4]:
# Integer-encode job_id once and reuse the codes twice:
#   * data_id keeps the (code, original job_id) pairs for building the submission
#   * data.job_id is overwritten with the codes
job_id_codes = preprocessing.LabelEncoder().fit_transform(data.job_id)
data_id = pd.DataFrame(job_id_codes, columns=["id"])
data_id["job_id"] = data["job_id"]  # captured before the column is replaced below
data.job_id = job_id_codes

In [5]:
def change_category(X):
    """Frequency-encode a Series.

    Missing entries are first replaced by the empty string, then every entry
    is replaced by the number of times its value occurs in the Series.

    Parameters
    ----------
    X : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Series aligned with ``X`` holding the occurrence count of each entry.
    """
    filled = X.fillna("")
    occurrences = filled.value_counts().to_dict()
    return filled.map(occurrences)

In [6]:
# Frequency-encode the high-cardinality text columns: each value is replaced
# by how often it occurs. `title` itself is kept (a new `title_info` column is
# added) because the raw text is vectorised later; `location` and `department`
# are overwritten in place.
data["title_info"] = change_category(data.title)
data.location = change_category(data.location)
data.department = change_category(data.department)
# Only the first run of digits in salary_range is kept before counting.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence that Python flags with a warning.
data.salary_range = change_category(data.salary_range.str.extract(r'(\d+)')[0]).astype(float)

In [7]:
# Replace each categorical column by integer codes; a fresh LabelEncoder is
# fitted per column, so codes are independent between columns.
for column in ["employment_type", "required_experience", "required_education", "industry", "function"]:
    data[column] = preprocessing.LabelEncoder().fit_transform(data[column])

In [8]:
# Columns to one-hot encode. They are cast to str first so that get_dummies
# treats the (already numeric) codes as categories.
dummy_list = [
    "location",
    "department",
    "salary_range",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
]
data_part1 = pd.get_dummies(data[dummy_list].astype(str))

In [9]:
# Flag columns that are passed through to the model unchanged.
data_part2 = data[[
    "telecommuting",
    "has_company_logo",
    "has_questions",
]]

In [1]:
def get_feautures(X,threshold,k,n_topics=30,random_state=None):
    """Turn a text column into dense low-dimensional features.

    The column is vectorised with TF-IDF and the resulting matrix is reduced
    four times (TruncatedSVD, NMF, FactorAnalysis, LatentDirichletAllocation);
    the four reductions are concatenated column-wise.

    Parameters
    ----------
    X : pandas.Series
        Text column; missing entries are treated as empty strings. ``X.name``
        is used as the prefix of the generated column names.
    threshold : float or int
        Forwarded to ``TfidfVectorizer(min_df=...)``.
    k : int
        Number of TruncatedSVD components.
    n_topics : int, default 30
        Number of components for NMF, FactorAnalysis and LDA. The default
        reproduces the previously hard-coded width.
    random_state : int or None, default None
        Optional seed for the stochastic decompositions. With ``None`` every
        estimator is constructed exactly as before.

    Returns
    -------
    (pandas.DataFrame, int)
        The feature frame with ``k + 3 * n_topics`` columns named
        ``<X.name>_<SVD|NMF|FA|LDA><i>``, and the number of TF-IDF columns.
    """
    X_data = X.fillna("")
    print("计算TF-IDF...")
    vectorizer = TfidfVectorizer(strip_accents='unicode',stop_words="english",min_df=threshold)
    # Dense on purpose: FactorAnalysis below does not accept sparse input.
    ans = vectorizer.fit_transform(X_data).toarray()
    columns_count = ans.shape[1]

    # FactorAnalysis has its own non-None default seed in scikit-learn, so it
    # only receives random_state when the caller asked for one.
    fa_options = {"n_components": n_topics}
    if random_state is not None:
        fa_options["random_state"] = random_state

    reducers = [
        ("SVD", decomposition.TruncatedSVD(n_components=k,random_state=random_state)),
        ("NMF", decomposition.NMF(n_components=n_topics,random_state=random_state)),
        ("FA", decomposition.FactorAnalysis(**fa_options)),
        ("LDA", decomposition.LatentDirichletAllocation(n_components=n_topics,n_jobs=-1,random_state=random_state)),
    ]

    parts = []
    for tag, reducer in reducers:
        print("计算" + tag + "...")
        reduced = pd.DataFrame(reducer.fit_transform(ans))
        reduced.columns = [X.name + "_" + tag + str(i) for i in range(reduced.shape[1])]
        parts.append(reduced)

    X_feauture = pd.concat(parts,axis=1)
    return X_feauture,columns_count

In [None]:
# Build the reduced text features for every free-text column.
# get_feautures returns (feature_frame, tfidf_column_count); only the frame is
# needed here, so the tuple is indexed instead of unpacked. (`~` is not a
# valid assignment target in Python and made this cell a SyntaxError.)
ans1 = get_feautures(data["title"],threshold=0.001,k=50)[0]
ans2 = get_feautures(data["company_profile"],threshold=0.001,k=50)[0]
ans3 = get_feautures(data["description"],threshold=0.001,k=50)[0]
ans4 = get_feautures(data["requirements"],threshold=0.001,k=50)[0]
ans5 = get_feautures(data["benefits"],threshold=0.001,k=50)[0]

In [None]:
# Build one combined TF-IDF matrix over all five text columns and reduce it
# with the same four decompositions used per column elsewhere in the notebook.
# Each column gets its own vectorizer; note that company_profile and
# description use min_df=0.01 while the other three use min_df=0.001.
title_data = data["title"].fillna("")
D1 = TfidfVectorizer(strip_accents='unicode',stop_words="english",min_df=0.001).fit_transform(title_data)
company_profile_data = data["company_profile"].fillna("")
D2 = TfidfVectorizer(strip_accents='unicode',stop_words="english",min_df=0.01).fit_transform(company_profile_data)
description_data = data["description"].fillna("")
D3 = TfidfVectorizer(strip_accents='unicode',stop_words="english",min_df=0.01).fit_transform(description_data)
requirements_data = data["requirements"].fillna("")
D4 = TfidfVectorizer(strip_accents='unicode',stop_words="english",min_df=0.001).fit_transform(requirements_data)
benefits_data = data["benefits"].fillna("")
D5 = TfidfVectorizer(strip_accents='unicode',stop_words="english",min_df=0.001).fit_transform(benefits_data)
# Densify every block and place them side by side: one row per posting,
# one column per retained term of any of the five vectorizers.
D = np.hstack((D1.toarray(),D2.toarray(),D3.toarray(),D4.toarray(),D5.toarray()))

df_D = pd.DataFrame(D)
# Number of components requested from each of the four decompositions below.
k=50
print("计算SVD...")
SVD_ans = decomposition.TruncatedSVD(n_components=k).fit_transform(df_D)
X_part1 = pd.DataFrame(SVD_ans)
X_part1.columns = ["SVDt"+str(i) for i in range(len(X_part1.columns))]

print("计算NMF...")
NMF_ans = decomposition.NMF(n_components=k).fit_transform(df_D)
X_part2 = pd.DataFrame(NMF_ans)
X_part2.columns = ["NMFt"+str(i) for i in range(len(X_part2.columns))]

print("计算FA...")
FA_ans = decomposition.FactorAnalysis(n_components=k).fit_transform(df_D)
X_part3 = pd.DataFrame(FA_ans)
X_part3.columns = ["FAt"+str(i) for i in range(len(X_part3.columns))]

print("计算LDA...")
LDA_ans = decomposition.LatentDirichletAllocation(n_components=k,n_jobs=-1).fit_transform(df_D)
X_part4 = pd.DataFrame(LDA_ans)
X_part4.columns = ["LDAt"+str(i) for i in range(len(X_part4.columns))]

# Column-wise concatenation of the four reductions (4 * k columns); the bare
# expression on the last line displays the resulting frame as cell output.
ans = pd.concat([X_part1,X_part2,X_part3,X_part4],axis=1)
ans

In [None]:
# Label column; it is appended last to the final feature table.
data_target = data.loc[:, "fraudulent"]

In [None]:
# The encoded categorical columns themselves, kept alongside their one-hot
# expansion.
data_part3 = data[[
    "location",
    "department",
    "salary_range",
    "employment_type",
    "required_experience",
    "required_education",
    "industry",
    "function",
]]

In [None]:
# Assemble the modelling table column-wise. The target must stay the LAST
# column: later cells separate features from label with iloc[..., :-1] / -1.
final_data = pd.concat(
    [
        data_part1, data_part2, data_part3,   # tabular features
        ans1, ans2, ans3, ans4, ans5,         # per-column text features
        ans,                                  # combined-corpus text features
        data_target,                          # label
    ],
    axis=1,
)

In [None]:
# `data` was built as concat([df_train, df_test]) with a fresh 0..n-1 index,
# so the first len(df_train) rows of final_data are the training rows and the
# rest are the test rows. Deriving the boundary from the frame removes the
# hard-coded offsets, which disagreed with each other (train rows were cut at
# 9998 while predictions started at 9999).
n_train = len(df_train)
df_train1 = final_data.iloc[:n_train,:-1]
y_train1 = final_data.iloc[:n_train,-1]
df_test1 = final_data.iloc[n_train:,:-1]
y_test1 = final_data.iloc[n_train:,-1]

# Model hyper-parameters
model= lgb.LGBMRegressor(
        n_estimators=20000,
        learning_rate=0.01,
        boosting_type= 'gbdt',
        objective = 'binary',
        max_depth = -1,
        num_leaves=31,
        feature_fraction = 0.9,
        bagging_freq = 1,
        bagging_fraction = 0.9,
        metric="auc",
        is_unbalance = True
    )

n_splits = 5
Score_list = []                                # best validation AUC of every fold
sub_ans = np.zeros([1,df_test1.shape[0]])      # fold-averaged test predictions (kept 2-D)
KF=KFold(n_splits=n_splits,shuffle=True)
for k,(train_index,test_index) in enumerate(KF.split(df_train1),start=1):
    # KFold yields positional indices, so both X and y are sliced with iloc.
    X_train,X_test=df_train1.iloc[train_index,:],df_train1.iloc[test_index,:]
    Y_train,Y_test=y_train1.iloc[train_index],y_train1.iloc[test_index]
    t1 = time.time()
    # Train on this fold, monitoring AUC on the held-out part.
    # NOTE(review): passing early_stopping_rounds / verbose to fit() depends on
    # the installed LightGBM version (newer releases expect callbacks) — confirm.
    model.fit(X_train,
              Y_train,
              eval_set=[(X_test, Y_test)],   # validation data
              eval_metric = 'auc',
              early_stopping_rounds=1000,    # stop after this many rounds without improvement
              verbose=100,                   # log every 100 iterations
              )
    t2 = time.time()
    print("运行时间：",str(t2-t1))

    # Draw on an explicit Axes: plot_importance otherwise opens its own figure
    # and a separately created plt.figure() stays empty.
    fig, ax = plt.subplots(figsize=(12,6))
    lgb.plot_importance(model, ax=ax, max_num_features=20)
    ax.set_title("Feature importances")
    plt.show()

    # Every fold contributes equally to the final prediction. Dividing by the
    # running fold number instead would weight the folds 1, 1/2, 1/3, ... and
    # not produce an average.
    final_ans = model.predict(df_test1)
    sub_ans = sub_ans + final_ans/n_splits

    Score_list.append(model.best_score_['valid_0']['auc'])

In [None]:
# Pair each prediction with its original job_id. The test rows are the tail of
# data_id, so the ids are taken from the end using the number of predictions
# in sub_ans rather than a hard-coded row offset; the two columns therefore
# always have the same length.
n_test = sub_ans.shape[1]
test_job_ids = data_id["job_id"].values[len(data_id) - n_test:]
answer = pd.DataFrame({"job_id":test_job_ids,"pred":sub_ans[0]})
print("Final LightGBM Score:   ",np.mean(Score_list))
answer

In [None]:
# Write the submission next to the input data, without the row index.
answer.to_csv(path + "submit_answer.csv", sep=",", index=False)