In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before
# running each cell, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
!echo '172.19.153.41  nlp-utils' >> /etc/hosts

In [3]:
!pip3 install kubernetes panda

Looking in indexes: https://k8s:****@nlp-utils/repository/pypi/simple


In [4]:
import json
import pandas as pd
from kubernetes import client, config

# Lift pandas' display limits (rows, columns, line width) by setting each to None,
# so the pod tables below are rendered in full.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

In [13]:
class RecrawlerState(object):
    """Collect a per-pod summary (latest logged fields + phase) from a Kubernetes namespace.

    The constructor loads the ``nlp-crawler`` context from the local
    ``kube_config`` file and creates a ``CoreV1Api`` client stored in ``self.v1``.
    """

    def __init__(self):
        config.load_kube_config(config_file='kube_config', context='nlp-crawler')
        self.v1 = client.CoreV1Api()

    @staticmethod
    def simplify_pod(pod):
        """Reduce a pod object to a small plain dict.

        Args:
            pod: object exposing ``metadata.to_dict()``, ``status.container_statuses``
                (an iterable of objects with ``to_dict()``, or ``None``) and
                ``status.phase``.

        Returns:
            dict with whichever of ``name`` / ``namespace`` / ``creation_timestamp``
            the metadata contains, plus ``containers`` (list of dicts limited to
            ``image`` / ``name`` / ``ready`` / ``state``) and ``status_phrase``
            (the pod phase).
        """
        # Tuples (not sets) keep the key order of the produced dicts deterministic.
        columns = ('name', 'namespace', 'creation_timestamp')
        c_columns = ('image', 'name', 'ready', 'state')

        metadata = pod.metadata.to_dict()
        result = {k: metadata[k] for k in columns if k in metadata}

        # container_statuses can be None when no container status has been
        # reported for the pod yet; treat that as "no containers" instead of
        # failing with "'NoneType' object is not iterable".
        c_statuses = pod.status.container_statuses or []

        status = []
        for c_st in c_statuses:
            dict_item = c_st.to_dict()
            status.append({k: dict_item[k] for k in c_columns if k in dict_item})

        result['containers'] = status
        # NOTE: the key is spelled 'status_phrase' (not 'phase'); kept as-is
        # because batch() and possibly other callers read it under this name.
        result['status_phrase'] = pod.status.phase

        return result

    def get_pod_logs(self, namespace, pod_name, container, tail_lines=100):
        """Return the last ``tail_lines`` non-blank log lines of one container.

        Args:
            namespace: namespace the pod lives in.
            pod_name: name of the pod.
            container: name of the container whose log is read.
            tail_lines: how many lines from the end of the log to request
                (default 100, the previously hard-coded value).

        Returns:
            list of str, with empty / whitespace-only lines removed.
        """
        pod_logs = self.v1.read_namespaced_pod_log(
            namespace=namespace,
            name=pod_name,
            container=container,
            tail_lines=tail_lines,
        ).split('\n')

        return [line for line in pod_logs if line.strip() != '']

    @staticmethod
    def _extract_log_fields(pod_logs, fields=('date', 'category')):
        """Pick the requested fields out of JSON-formatted log lines.

        Lines are visited in reverse lexicographic order; a later line overwrites
        a field set by an earlier one, and scanning stops as soon as no field is
        left empty. Fields never seen stay ``''``.
        """
        rows = {field: '' for field in fields}

        # NOTE(review): this is a plain string sort, not a chronological one —
        # presumably the JSON lines sort by date as text; confirm against the
        # crawler's log format.
        for line in sorted(pod_logs, reverse=True):
            try:
                entry = json.loads(line)
            except ValueError:
                # Not JSON (json.JSONDecodeError is a ValueError): ignore the line.
                continue

            if not isinstance(entry, dict):
                # Valid JSON but not an object (e.g. a bare number or string):
                # it cannot carry the fields we are looking for.
                continue

            for col in rows:
                if col in entry:
                    rows[col] = entry[col]

            if '' not in rows.values():
                break

        return rows

    def batch(self, namespace='recrawler', container='default'):
        """Summarise every pod of a namespace.

        Args:
            namespace: namespace to list pods from (default ``'recrawler'``).
            container: container whose log is inspected in each pod
                (default ``'default'``).

        Returns:
            list of dicts, one per pod, with keys ``date`` and ``category``
            (taken from the pod's JSON log lines, ``''`` when not found),
            ``name`` (pod name) and ``state`` (pod phase).
        """
        pods = self.v1.list_namespaced_pod(namespace=namespace)

        pod_list = {po.metadata.name: self.simplify_pod(pod=po) for po in pods.items}

        result = []
        for pod_name, pod_info in pod_list.items():
            pod_logs = self.get_pod_logs(namespace=namespace, pod_name=pod_name, container=container)

            rows = self._extract_log_fields(pod_logs)
            rows['name'] = pod_name
            rows['state'] = pod_info['status_phrase']

            result.append(rows)

        return result

In [25]:
# One summary dict per pod (requires access to the cluster configured in RecrawlerState).
recrawler_state = RecrawlerState().batch()

# Passing `columns=` selects and orders the fields in one step and still
# produces an empty, correctly labelled frame when no pods were returned;
# indexing an empty frame with the column list would raise KeyError.
df = pd.DataFrame(recrawler_state, columns=['name', 'category', 'state', 'date'])

df

Unnamed: 0,name,category,state,date
0,daum-culture-2020-0,문화/공연전시,Succeeded,2020-01-02
1,daum-culture-2020-1,문화/책,Succeeded,2020-01-02
2,daum-culture-2020-10,문화/날씨,Succeeded,2020-01-01
3,daum-culture-2020-2,문화/뷰티패션,Succeeded,2020-01-03
4,daum-culture-2020-3,문화/음식맛집,Succeeded,2020-01-08
5,daum-culture-2020-4,문화/건강,Succeeded,2020-01-02
6,daum-culture-2020-5,문화/가정육아,Succeeded,2020-01-07
7,daum-culture-2020-6,문화/여행레저,Succeeded,2020-01-02
8,daum-culture-2020-7,문화/생활정보,Succeeded,2020-01-02
9,daum-culture-2020-8,문화/생활일반,Succeeded,2020-01-01


In [28]:
# Pods that are still running, whose name contains 'naver-' but not '-terms-'.
pod_name = df['name']
not_terms_job = pod_name.str.find('-terms-') < 0
is_naver_job = pod_name.str.find('naver-') >= 0
is_running = df['state'] == 'Running'

df[not_terms_job & is_naver_job & is_running]

Unnamed: 0,name,category,state,date
115,naver-economy-2020-1,경제/금융,Running,2020-01-06
117,naver-economy-2020-3,경제/산업/재계,Running,2020-06-30
119,naver-economy-2020-5,경제/경제 일반,Running,2020-07-20
153,naver-society-2020-0,사회/사건사고,Running,2020-07-16
154,naver-society-2020-1,사회/교육,Running,2020-08-25
155,naver-society-2020-2,사회/노동,Running,2020-09-12
156,naver-society-2020-3,사회/언론,Running,2020-09-29
157,naver-society-2020-4,사회/환경,Running,2020-09-06
158,naver-society-2020-5,사회/인권복지,Running,2020-08-09
159,naver-society-2020-6,사회/식품의료,Running,2020-07-14
