# Import

In [29]:
import re
from pprint import pprint
from collections import defaultdict
# from urllib.parse import urlsplit

import pandas as pd

from db.mongo import MyMongo

# DB Fetch

In [30]:
# Original Data: from scraper
with MyMongo() as db:
    cursor_in_page = db.find('public_website', 'pages')

with MyMongo() as db:
    js_files = db.find('public_website', 'a5_js_code_external')

<--Mongo Connected.
Mongo Connection Closed.-->


# Functions

In [76]:
def remove_comment(script):
    script = re.sub(r'(\/\/.*\n)', '', script)
    script = re.sub(r'(\/\*[\s\S]*?\*\/)', '', script)
    return script


def get_comment(script):
    one_line = re.findall(r'(\/\/.*\n)', script)
    multi_line = re.findall(r'(\/\*[\s\S]*?\*\/)', script)

    # return string
    return '\n'.join(one_line) + '\n' + '\n'.join(multi_line)


def get_active_x(script):
    return re.findall(r'ActiveXObject\((.*?)\)', script)


def get_exe(script):
    return re.findall(r'\/(\S*\.exe)[^c]', script)
#     return re.findall(r'\/(.*\.exe)[^c]', script)


def extend_active_x(ax_list, script):
    new_list = get_active_x(script)
    if new_list:
        ax_list.extend(new_list)


def extend_exe(exe_list, script):
    new_list = get_exe(script)
    if new_list:
        exe_list.extend(new_list)

        
def cursor2dict_in_page(cur):
    result = defaultdict(dict)

    for u in cur:
        ax_cmt = []
        ax_srt = []
        exe_cmt = []
        exe_srt = []
        for script in u['jsScript']:
            cmt = get_comment(script)
            extend_active_x(ax_cmt, cmt)
            extend_exe(exe_cmt, cmt)

            srt = remove_comment(script)
            extend_active_x(ax_srt, srt)
            extend_exe(exe_srt, srt)

        result[u['url']]['ax_cmt'] = ax_cmt
        result[u['url']]['ax_srt'] = ax_srt
        result[u['url']]['exe_cmt'] = exe_cmt
        result[u['url']]['exe_srt'] = exe_srt
    
    return result


def cursor2dict_external(cur):
    result = defaultdict(dict)

    for u in cur:
        ax_cmt = []
        ax_srt = []
        exe_cmt = []
        exe_srt = []
        
        script = u['jsSource']

        cmt = get_comment(script)
        extend_active_x(ax_cmt, cmt)
        extend_exe(exe_cmt, cmt)

        srt = remove_comment(script)
        extend_active_x(ax_srt, srt)
        extend_exe(exe_srt, srt)

        result[u['webPath']]['ax_cmt'] = ax_cmt
        result[u['webPath']]['ax_srt'] = ax_srt
        result[u['webPath']]['exe_cmt'] = exe_cmt
        result[u['webPath']]['exe_srt'] = exe_srt
    
    return result


def dict2df(d):
    df_0 = []
    df_1 = []
    df_2 = []
    df_3 = []

    for url, script in d.items():
        for name in script['ax_cmt']:
            df_0.append((url, name.strip('\'\"')))
        for name in script['ax_srt']:
            df_1.append((url, name.strip('\'\"')))
        for name in script['exe_cmt']:
            remove_garbage = name.split('/')[-1]
            df_2.append((url, remove_garbage.strip('\'\"')))
        for name in script['exe_srt']:
            remove_garbage = name.split('/')[-1]
            df_3.append((url, remove_garbage.strip('\'\"')))
    df_ax_cmt = pd.DataFrame(columns=['url', 'ax_cmt'], data=df_0)
    df_ax_srt = pd.DataFrame(columns=['url', 'ax_srt'], data=df_1)
    df_exe_cmt = pd.DataFrame(columns=['url', 'exe_cmt'], data=df_2)
    df_exe_srt = pd.DataFrame(columns=['url', 'exe_srt'], data=df_3)

    return df_ax_cmt, df_ax_srt, df_exe_cmt, df_exe_srt

# Javascript

## In-Page JS

### Cursor to Dict

In [33]:
in_page_js = cursor2dict_in_page(cursor_in_page)

i = 0
for k, v in in_page_js.items():
    if i == 4:
        break
    i += 1
    print(k, v)

http://lms.khealth.or.kr {'ax_cmt': [], 'ax_srt': [], 'exe_cmt': [], 'exe_srt': []}
http://kmtp.medicalkorea.or.kr {'ax_cmt': [], 'ax_srt': [], 'exe_cmt': [], 'exe_srt': []}
http://www.khealth.or.kr/healthplan {'ax_cmt': [], 'ax_srt': [], 'exe_cmt': [], 'exe_srt': []}
http://haspa.khidi.or.kr/ {'ax_cmt': [], 'ax_srt': [], 'exe_cmt': [], 'exe_srt': []}


### Dict to DF

In [54]:
df_ax_cmt, df_ax_srt, df_exe_cmt, df_exe_srt = dict2df(in_page_js)

### Merge to df

In [55]:
df_in_page = pd.merge(df_ax_cmt, df_ax_srt, how='outer', on='url').fillna(''); df_in_page
df_in_page = df_in_page.merge(df_exe_cmt, how='outer', on='url').fillna(''); df_in_page
df_in_page = df_in_page.merge(df_exe_srt, how='outer', on='url').fillna(''); df_in_page

Unnamed: 0,url,ax_cmt,ax_srt,exe_cmt,exe_srt
0,https://nosmk.khealth.or.kr/nsk/ntcc/index.do,Msxml2.XMLHTTP,,,
1,http://ncpms.rda.go.kr/npms/Main.np,Scripting.FileSystemObject,Scripting.FileSystemObject,,
2,http://ncpms.rda.go.kr/npms/MberTyChoiceR.np,Scripting.FileSystemObject,Scripting.FileSystemObject,,
3,https://ncpms.rda.go.kr/npms/LoginM.np,Scripting.FileSystemObject,Scripting.FileSystemObject,,
4,http://yestv.or.kr/,Microsoft.XMLHTTP,Microsoft.XMLDOM,,
5,https://www.swit.or.kr/IS/web/index.jsp,AxCrossCert.AxCrossCert,,,
6,http://www.fris.go.kr/install.html,Msxml2.DOMDocument.4.0,,,
7,https://www.gokams.or.kr:442/07_member/join01_...,Msxml2.XMLHTTP,Msxml2.XMLHTTP,,
8,https://www.gokams.or.kr:442/07_member/join01_...,Msxml2.XMLHTTP,Microsoft.XMLHTTP,,
9,https://www.gokams.or.kr:442/07_member/join01_...,Microsoft.XMLHTTP,Msxml2.XMLHTTP,,


---

## External JS File

### Cursor to Dict

In [77]:
external_js = cursor2dict_external(js_files)

### Dict to DF

In [78]:
df_ax_cmt, df_ax_srt, df_exe_cmt, df_exe_srt = dict2df(external_js)

### Merge to DF

In [84]:
df_external = pd.merge(df_ax_cmt, df_ax_srt, how='outer', on='url').fillna(''); df_external
df_external = df_external.merge(df_exe_cmt, how='outer', on='url').fillna(''); df_external
df_external = df_external.merge(df_exe_srt, how='outer', on='url').fillna(''); df_external

Unnamed: 0,url,ax_cmt,ax_srt,exe_cmt,exe_srt
0,http://eco.gwangju.go.kr/js/jssor.js,arrActiveX[i],,,
1,http://eco.gwangju.go.kr/js/jssor.js,Microsoft.XMLDOM,,,
2,http://www.win-win.or.kr/web/inc/js/shadowbox.js,K,,,
3,http://www.ndti.go.kr/Scripts/AC_RunActiveCont...,strFoo,ShockwaveFlash.ShockwaveFlash.7,,
4,http://www.ndti.go.kr/Scripts/AC_RunActiveCont...,strFoo,ShockwaveFlash.ShockwaveFlash.6,,
5,http://www.ndti.go.kr/Scripts/AC_RunActiveCont...,strFoo,ShockwaveFlash.ShockwaveFlash.3,,
6,http://www.ndti.go.kr/Scripts/AC_RunActiveCont...,strFoo,ShockwaveFlash.ShockwaveFlash.3,,
7,http://www.ndti.go.kr/Scripts/AC_RunActiveCont...,strFoo,ShockwaveFlash.ShockwaveFlash,,
8,http://edu.kcga.go.kr/alditor/alditor.js,Msxml2.XMLHTTP,,,
9,http://edu.kcga.go.kr/alditor/alditor.js,Microsoft.XMLHTTP,,,


## Remove unnecessary js files

In [86]:
# js_files.head()
len(js_files)

TypeError: object of type 'Cursor' has no len()

In [42]:
def remove_js_file_contain_keyword(df, keyword):
    return df[~df['jsFile'].str.contains(keyword)]

In [43]:
js_files_1 = remove_js_file_contain_keyword(js_files, 'jquery')  # 94142

In [44]:
js_files_1 = remove_js_file_contain_keyword(js_files_1, '.min.') # 89416
print(len(js_files_1))

89416


In [45]:
js_files_1 = remove_js_file_contain_keyword(js_files_1, 'css/') # 88945
print(len(js_files_1))

88945


In [46]:
js_files_1 = remove_js_file_contain_keyword(js_files_1, 'google') # 88307
print(len(js_files_1))

88307


In [47]:
js_files_1 = remove_js_file_contain_keyword(js_files_1, 'naver') # 87705
print(len(js_files_1))

87705


In [48]:
js_files_2 = js_files_1.drop(columns=['_id']); js_files_2

Unnamed: 0,jsFile,netLoc
1,/resources/js/common.js,www.khealth.or.kr
2,/MagicLine4NP/magicline4np/js/magicline.js?v=2...,www.khealth.or.kr
4,/resources/js/common.ccis.js,www.khealth.or.kr
5,/resources/component/crosseditor/js/namo_scrip...,www.khealth.or.kr
7,/resources/js/plani.scroller_slide.js,www.khealth.or.kr
12,/resources/js/plani.popup_zone.js,www.khealth.or.kr
14,https://use.fontawesome.com/2d530f5474.js,haspa.khidi.or.kr
22,/js/khidi.js,haspa.khidi.or.kr
24,js/global5152.js?ver=1.0,matchup.kr
27,js/navigation5152.js?ver=1.0,matchup.kr


In [49]:
js_files_2

Unnamed: 0,jsFile,netLoc
1,/resources/js/common.js,www.khealth.or.kr
2,/MagicLine4NP/magicline4np/js/magicline.js?v=2...,www.khealth.or.kr
4,/resources/js/common.ccis.js,www.khealth.or.kr
5,/resources/component/crosseditor/js/namo_scrip...,www.khealth.or.kr
7,/resources/js/plani.scroller_slide.js,www.khealth.or.kr
12,/resources/js/plani.popup_zone.js,www.khealth.or.kr
14,https://use.fontawesome.com/2d530f5474.js,haspa.khidi.or.kr
22,/js/khidi.js,haspa.khidi.or.kr
24,js/global5152.js?ver=1.0,matchup.kr
27,js/navigation5152.js?ver=1.0,matchup.kr


In [51]:
# keywords = ['wizvera', 'anysign4pc', 'touchen', 'ipinside', 'softcamp', 'nprotect', 'astx', 'ksign', 'initech']
# result = defaultdict(list)

# for item in js:
#     for keyword in keywords:
#         for file in item['jsFile']:
#             if keyword in file.lower():
#                 result[keyword].append(item['url'])


# DB Insert

In [None]:
# without_js_script = []

# for item in url_script:
#     item.pop('jsScript')
#     without_js_script.append(item)

# with MyMongo() as db:
#     db.delete_and_insert('public_website', 'website_login_without_script', without_js_script)



# with MyMongo() as db:
#     db.delete_and_insert_df('public_website', 'ax_exe_in_page', df_in_page)

# with MyMongo() as db:
#     db.delete_and_insert_df('public_website', 'ax_exe_external', df_external)

# with MyMongo() as db:
#     db.delete_and_insert_df('public_website', 'js_files_proccessed', js_files_2)
