#### All Functions

In [22]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json
import pycountry 
import re
import urllib
from tqdm.notebook import tqdm

In [23]:
def url_to_df(urls, clean_empty = False , attrs = None, helper = None, timeout = 30):
    '''
    Scrape one HTML table from each url and stack the results into one dataframe.

    Input:
        urls : iterable of urls from which data is taken.
        clean_empty : remove columns whose values are all null.
        attrs : html attr. dict used to pick the table when a page has more than
                one, e.g. {'class' : 'class_name'}. None means no filter.
        helper : helper function applied to each per-page df for cleaning.
        timeout : seconds to wait for each HTTP response.
    Output:
        df : dataframe holding the rows of every page, re-indexed from 0
             (an empty dataframe when urls is empty).
    Raises:
        requests.HTTPError : a page answered with an error status.
        ValueError : a page contains no table matching attrs.
    '''
    from io import StringIO

    attrs = {} if attrs is None else attrs
    frames = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx here instead of trying to parse an error page.
        response.raise_for_status()
        soup = bs(response.text, "lxml")
        table = soup.find("table", attrs=attrs)
        if table is None:
            # str(None) would otherwise be handed to read_html as the text "None".
            raise ValueError("no <table> matching attrs=%r found at %s" % (attrs, url))
        # read_html expects a file-like object; literal html strings are deprecated.
        df = pd.read_html(StringIO(str(table)))[0]

        if clean_empty:
            df = df.loc[:, ~df.isnull().all(axis = 0)]

        if helper:
            df = helper(df)
        frames.append(df)

    if not frames:
        # pd.concat refuses an empty list, so answer with an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [24]:
def loc_dict(series):
    '''
    Look up the geolocation of every ip address in series.

    input:
        series: series of unique ip address.
    output:
        res: dict keyed by ip address; each value is a dict containing
             country_code, latitude, longitude and alpha_3.
    '''
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    from urllib.request import urlopen

    def ip_loc(x):
        '''
        input:
            x : ip address
        output:
            dictionary which contain country_code, latitude, longitude and
            alpha_3 ('Not found' when the country code cannot be resolved).
        '''
        with urlopen("https://geolocation-db.com/jsonp/" + x) as response:
            payload = response.read().decode()
        # The jsonp endpoint wraps the JSON as `name({...})`; keep what lies
        # between the first "(" and the last ")" so a "(" inside a value is safe.
        data = json.loads(payload[payload.index("(") + 1:payload.rindex(")")])
        res = {"country_code":data["country_code"], 'latitude':data['latitude'],'longitude':data['longitude']}
        try:
            res['alpha_3'] = pycountry.countries.get(alpha_2=res["country_code"]).alpha_3
        except (AttributeError, LookupError):
            # No match: get() returned None or raised a lookup error.
            res['alpha_3'] = 'Not found'
        return res

    # tqdm only wraps the iteration to display progress.
    return {ip: ip_loc(ip) for ip in tqdm(series)}

In [25]:
def log_parser_regex(strs):
    '''
    Split one access-log line into named fields with a single regular expression.

    input:
        strs : log string.
    output:
        return a dictionary with the keys ip, RFC931, user, date, gmt, action,
        status, size, referrer and browser.
    note:
        a line that does not match raises AttributeError, because re.search
        returns None in that case.
    '''
    # One fragment per field, in the order the fields appear in the line.
    ip_part = r'(?P<ip>\A\w+[.]\w+[.]+\w+[.]+\w+)'
    ident_part = r'(?P<RFC931>\S+)'
    user_part = r'(?P<user>\S+)'
    time_part = r'\[(?P<date>\d{2}/[a-zA-Z]{3}/\d{4}:\d{2}:\d{2}:\d{2}) (?P<gmt>[+-]\d{4})]'
    request_part = r'"(?P<action>.*) HTTP/\d*\.*\d*"'
    status_part = r'(?P<status>[0-9]*)'
    size_part = r'(?P<size>\S*)'
    agent_part = r'"(?P<referrer>.*)" "(?P<browser>.*)"'

    # Fields are separated by a single space in the log format.
    pattern = ' '.join((ip_part, ident_part, user_part, time_part,
                        request_part, status_part, size_part, agent_part))
    match = re.search(pattern, strs)
    return match.groupdict()

In [26]:
def log_df(df,col_name ,columns = None ):
    '''
    Parse a column of access-log lines into a dataframe with one column per field.

    input:
        df : dataframe
        col_name : column name of dataframe on which we need to apply function.
        columns : column name for new dataframe; only applied when its length
                  equals the number of parsed columns, otherwise ignored.
    output:
        return new dataframe with the columns IP, RFC931, USER, Date, GMT,
        action, status, size, referrer, browser (or the names from columns).
    '''
    def log_parser(strs):
        '''
        input:
            strs : log string.
        output:
            return a dictionary which contain all element of log string;
            a quoted/trailing field that is missing is reported as '-'.
        raises:
            IndexError when the line lacks the leading ip/ident/user tokens
            or the bracketed timestamp.
        '''
        tokens = strs.split()
        stamp = strs.split('[')[1].split()
        # Quoted sections land at the odd indexes: 1 request, 3 referrer, 5 browser.
        quoted = strs.strip().split('"')

        find = {
            'IP': tokens[0],
            'RFC931': tokens[1],
            'USER': tokens[2],
            'Date': stamp[0],
            'GMT': stamp[1].strip(']'),
        }

        request = quoted[1] if len(quoted) > 1 else ''
        request_tokens = request.split()
        if not request_tokens:
            find['action'] = '-'
        elif 'HTTP' not in request_tokens[-1]:
            find['action'] = request.strip()
        elif len(request_tokens) < 3:
            # Method or target is missing, so there is no usable action.
            find['action'] = '-'
        else:
            # Drop only the trailing protocol token; str.replace would also
            # remove the same text if it occurs inside the request target.
            find['action'] = request.rsplit(None, 1)[0].strip()

        # Status and size sit between the request and the referrer.
        trailer = quoted[2].strip().split() if len(quoted) > 2 else []
        find['status'] = trailer[0] if trailer else '-'
        find['size'] = trailer[1] if len(trailer) > 1 else '-'
        find['referrer'] = quoted[3] if len(quoted) > 3 else '-'
        find['browser'] = quoted[5] if len(quoted) > 5 else '-'

        return find

    parsed = pd.DataFrame([log_parser(line) for line in df[col_name]])
    if columns is not None and len(parsed.columns) == len(columns):
        parsed.columns = columns
    return parsed

In [27]:
def XSS_finder(strs):
    '''
    Heuristically flag a request string that carries characters typical of XSS.

    input:
        strs : request string (e.g. the action field of a log line).
    output:
        True when strs contains <, >, a backslash or a backtick anywhere, or
        contains /, ( or ) after the first '?'; each character is also
        recognised in its percent-encoded form, in either hex case.
        False otherwise.
    note:
        everything after the first '?' is inspected, so a trailing protocol
        token such as HTTP/1.1 following a query string also counts.
    '''
    def has_any(text, chars):
        # Percent-encoding hex digits are case-insensitive, so compare lowered.
        lowered = text.lower()
        return any(
            char in text or '%' + format(ord(char), 'x') in lowered
            for char in chars
        )

    if has_any(strs, ('<', '>', '\\', '`')):
        return True

    # '/', '(' and ')' are normal in a path, so only the query part is checked.
    query = ''.join(strs.split('?')[1:])
    return has_any(query, ('/', ')', '('))

In [28]:
def add_location_data(df,column,keys):
    '''
    Add geolocation columns to df, looked up from the ip addresses in df[column].

    input:
        df : dataframe; it is modified in place.
        column : name of the column holding ip addresses.
        keys : names of the fields to add as columns, taken from
               country_code, latitude, longitude, alpha_3.
    output:
        the same dataframe with one new column per key.
    '''
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    from urllib.request import urlopen

    def ip_loc(x):
        '''
        input:
            x : ip address
        output:
            dictionary which contain country_code, latitude, longitude and
            alpha_3 ('Not found' when the country code cannot be resolved).
        '''
        with urlopen("https://geolocation-db.com/jsonp/" + x) as response:
            payload = response.read().decode()
        # The jsonp endpoint wraps the JSON as `name({...})`; keep what lies
        # between the first "(" and the last ")" so a "(" inside a value is safe.
        data = json.loads(payload[payload.index("(") + 1:payload.rindex(")")])
        res = {"country_code":data["country_code"], 'latitude':data['latitude'],'longitude':data['longitude']}
        try:
            res['alpha_3'] = pycountry.countries.get(alpha_2=res["country_code"]).alpha_3
        except (AttributeError, LookupError):
            # No match: get() returned None or raised a lookup error.
            res['alpha_3'] = 'Not found'
        return res

    def loc_dict(series):
        '''
        input:
            series: series of unique ip address.
        output:
            res: dict keyed by ip, each value holding country code, lat, long.
        '''
        # tqdm only wraps the iteration to display progress.
        return {ip: ip_loc(ip) for ip in tqdm(series)}

    # Query each distinct address once, then fan the answers out to all rows.
    locations = loc_dict(df[column].unique())
    for key in keys:
        df[key] = df[column].apply(lambda x: locations[x][key])
    return df

In [29]:
def alpha3code(column):
    '''
    Translate 2-letter country codes into 3-letter country codes.

    input:
        column : pandas series (or any iterable) of alpha-2 country codes.
    output:
        return list of 3-letter country codes, in input order; an entry that
        cannot be resolved becomes the string 'None'.
    '''
    codes = []
    for country in column:
        try:
            codes.append(pycountry.countries.get(alpha_2=country).alpha_3)
        except (AttributeError, LookupError):
            # No match: get() returned None or raised a lookup error. Narrow
            # on purpose so Ctrl-C and unrelated bugs are not swallowed.
            codes.append('None')
    return codes