In [54]:
import PyPDF2
import glob 
from tqdm import tqdm_notebook
import numpy as np

def extract_feature_from_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = '\n'.join([reader.pages[idx].extract_text() for idx in range(len(reader.pages))])
    feat = [
        len(reader.pages),
        len(text),
        np.mean([len(x) for x in text.split('\n')]),
        np.max([len(x) for x in text.split('\n')]),
        np.std([len(x) for x in text.split('\n')]),

        len(set(text)),
        len(text) - len(set(text)),
        len(set(text)) / (len(text) + 1),
        
        len(text.split()),
        len(text.split('\n')),
        text.count('x'),
        text.count('xxx'),
        sum([text.count(x) for x in '0123456789']),
        text.count('@'),
    ]
    return feat

train_paths = glob.glob('./校招简历信息完整性检测训练集/*/*.pdf')
train_label = ['正样本' in x for x in train_paths]
train_feat = [extract_feature_from_pdf(x) for x in tqdm_notebook(train_paths)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  train_feat = [extract_feature_from_pdf(x) for x in tqdm_notebook(train_paths)]


  0%|          | 0/815 [00:00<?, ?it/s]

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [58]:
val_pred = cross_val_predict(
    LGBMClassifier(),
    np.array(train_feat),
    train_label
)

print(
    classification_report(train_label, val_pred, digits=4)
)

[LightGBM] [Info] Number of positive: 559, number of negative: 93
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1658
[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.857362 -> initscore=1.793550
[LightGBM] [Info] Start training from score 1.793550
[LightGBM] [Info] Number of positive: 559, number of negative: 93
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.857362 -> initscore=1.793550
[LightGBM] [Info] Start training from score 1.793550
[LightGBM] [Info] Number of positive: 558, number of negative: 94
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1664
[LightGBM] [Info] Number of data points in the train set: 652, number of used fe

In [63]:
m = LGBMClassifier().fit(
    np.array(train_feat),
    train_label
)
import joblib
joblib.dump(m, 'lgb.pkl')

[LightGBM] [Info] Number of positive: 698, number of negative: 117
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1912
[LightGBM] [Info] Number of data points in the train set: 815, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.856442 -> initscore=1.786045
[LightGBM] [Info] Start training from score 1.786045


['lgb.pkl']

In [64]:
joblib.load('lgb.pkl')

如下为run.py内容

```python
import PyPDF2
import glob 
import numpy as np
import joblib
import pandas as pd
from lightgbm import LGBMClassifier

def extract_feature_from_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = '\n'.join([reader.pages[idx].extract_text() for idx in range(len(reader.pages))])
    feat = [
        len(reader.pages),
        len(text),
        np.mean([len(x) for x in text.split('\n')]),
        np.max([len(x) for x in text.split('\n')]),
        np.std([len(x) for x in text.split('\n')]),

        len(set(text)),
        len(text) - len(set(text)),
        len(set(text)) / (len(text) + 1),
        
        len(text.split()),
        len(text.split('\n')),
        text.count('x'),
        text.count('xxx'),
        sum([text.count(x) for x in '0123456789']),
        text.count('@'),
    ]
    return feat

test_paths = glob.glob('/work/data/integrity-check-of-resume-test-set/*.pdf')[:]
test_feat = [extract_feature_from_pdf(x) for x in test_paths]

m = joblib.load('lgb.pkl')

pd.DataFrame({
    'ResumeID': [x.split('/')[-1] for x in test_paths],
    'label': m.predict(test_feat).astype(int)
}).to_csv('/work/output/result.csv', index=None)
```

In [65]:
!tar -cvzf information-integrity.tar.gz information-integrity/

information-integrity/
information-integrity/lgb.pkl
information-integrity/.ipynb_checkpoints/
information-integrity/.ipynb_checkpoints/run-checkpoint.py
information-integrity/run.py


In [66]:
!s3cmd put information-integrity.tar.gz s3://ai-competition/0fs76epw/information-integrity.tar.gz

upload: 'information-integrity.tar.gz' -> 's3://ai-competition/0fs76epw/information-integrity.tar.gz'  [1 of 1]
 134511 of 134511   100% in    0s  1074.60 kB/s  done
