In [1]:
import PyPDF2
import glob 
from tqdm import tqdm_notebook
import numpy as np

def extract_feature_from_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = '\n'.join([reader.pages[idx].extract_text() for idx in range(len(reader.pages))])
    feat = [
        len(reader.pages),
        len(text),
        np.mean([len(x) for x in text.split('\n')]),
        np.max([len(x) for x in text.split('\n')]),
        np.std([len(x) for x in text.split('\n')]),

        len(set(text)),
        len(text) - len(set(text)),
        len(set(text)) / (len(text) + 1),
        
        len(text.split()),
        len(text.split('\n')),
        text.count('x'),
        text.count('xxx'),
        sum([text.count(x) for x in '0123456789']),
        text.count('@'),
        
        text.lower().count('java'),
        text.lower().count('jvm'),
        text.lower().count('j'),
        text.lower().count('j'),
        text.lower().count('spring'),

    ]
    return feat

train_paths = glob.glob('./校招简历应聘岗位与项目技能匹配检测训练集/*/*.pdf')
train_label = ['正样本' in x for x in train_paths]
train_feat = [extract_feature_from_pdf(x) for x in tqdm_notebook(train_paths)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  train_feat = [extract_feature_from_pdf(x) for x in tqdm_notebook(train_paths)]


  0%|          | 0/723 [00:00<?, ?it/s]

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [3]:
val_pred = cross_val_predict(
    LGBMClassifier(),
    np.array(train_feat),
    train_label
)

print(
    classification_report(train_label, val_pred, digits=4)
)

[LightGBM] [Info] Number of positive: 206, number of negative: 372
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1533
[LightGBM] [Info] Number of data points in the train set: 578, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356401 -> initscore=-0.591018
[LightGBM] [Info] Start training from score -0.591018
[LightGBM] [Info] Number of positive: 206, number of negative: 372
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1537
[LightGBM] [Info] Number of data points in the train set: 578, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356401 -> initscore=-0.591018
[LightGBM] [Info] Start training from score -0.591018
[LightGBM] [Info] Number of positive: 206, number of negative: 372
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1532
[LightGBM] [Info] Number of data points in the train set: 578, number of 

In [4]:
m = LGBMClassifier().fit(
    np.array(train_feat),
    train_label
)
import joblib
joblib.dump(m, 'lgb.pkl')

[LightGBM] [Info] Number of positive: 258, number of negative: 465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 723, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356846 -> initscore=-0.589078
[LightGBM] [Info] Start training from score -0.589078


['lgb.pkl']

In [5]:
joblib.load('lgb.pkl')

如下为run.py内容

```python
import PyPDF2
import glob 
import numpy as np
import joblib
import pandas as pd
from lightgbm import LGBMClassifier

def extract_feature_from_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = '\n'.join([reader.pages[idx].extract_text() for idx in range(len(reader.pages))])
    feat = [
        len(reader.pages),
        len(text),
        np.mean([len(x) for x in text.split('\n')]),
        np.max([len(x) for x in text.split('\n')]),
        np.std([len(x) for x in text.split('\n')]),

        len(set(text)),
        len(text) - len(set(text)),
        len(set(text)) / (len(text) + 1),
        
        len(text.split()),
        len(text.split('\n')),
        text.count('x'),
        text.count('xxx'),
        sum([text.count(x) for x in '0123456789']),
        text.count('@'),
        
        text.lower().count('java'),
        text.lower().count('jvm'),
        text.lower().count('j'),
        text.lower().count('j'),
        text.lower().count('spring'),

    ]
    return feat

test_paths = glob.glob('/work/data/resume-skill-matching-test-set/*.pdf')[:]
test_feat = [extract_feature_from_pdf(x) for x in test_paths]

m = joblib.load('lgb.pkl')

pd.DataFrame({
    'ResumeID': [x.split('/')[-1] for x in test_paths],
    'label': m.predict(test_feat).astype(int)
}).to_csv('/work/output/result.csv', index=None)
```

In [6]:
!tar -cvzf skill-matching.tar.gz skill-matching/

skill-matching/
skill-matching/lgb.pkl
skill-matching/.ipynb_checkpoints/
skill-matching/.ipynb_checkpoints/run-checkpoint.py
skill-matching/run.py


In [7]:
!s3cmd put skill-matching.tar.gz s3://ai-competition/0fs76epw/skill-matching.tar.gz

upload: 'skill-matching.tar.gz' -> 's3://ai-competition/0fs76epw/skill-matching.tar.gz'  [1 of 1]
 130842 of 130842   100% in    0s  1007.57 kB/s  done
