## Set Up

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import re

from scipy.stats import skew, chi2_contingency
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.impute import SimpleImputer

from wordsegment import load, segment
load()
import nltk
from nltk.corpus import words, brown
from nltk.stem import WordNetLemmatizer
# import spacy
from textblob import TextBlob

# nltk.download('words')
# nltk.download('brown')

# Set of English words taken from the NLTK 'words' corpus; a set gives fast
# `word in english_words` membership checks.
english_words = set(words.words())

# Display every column of a dataframe, but truncate displays to 10 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

## Tools

In [33]:
def load_data(csv_file_name):
    """
    Load the dataset from a .csv file and derive the TLD columns

    Params:
        csv_file_name (string): File name with .csv extension

    Returns:
        main_df (dataframe): The loaded dataframe with two added columns,
            'tld' (everything after the first '.' of 'host') and 'cctld'
            (everything after the first '.' of 'tld', or the string 'None'
            when 'tld' has no second part), and with the 'domain' column
            renamed to 'subdomain'
    """
    main_df = pd.read_csv(csv_file_name)

    # 'a.co.uk' -> tld 'co.uk' -> cctld 'uk'; 'b.com' -> tld 'com' -> cctld 'None'
    main_df['tld'] = main_df['host'].str.split('.', n=1).str[1]
    main_df['cctld'] = main_df['tld'].str.split('.', n=1).str[1].fillna('None')

    return main_df.rename(columns={'domain': 'subdomain'})

In [34]:
def is_ascii_domain(col):
    """
    Check that a domain string uses only ASCII letters, digits, '.' and '-'

    Params:
        col (string): The domain text to be checked (a single value, not a column name)

    Returns:
        int (int): 1 when the string is non-empty and every character is allowed, otherwise 0
    """
    # fullmatch rather than match + '$': '$' also matches just before a trailing
    # newline, so a value ending in a newline used to be accepted.
    return int(re.fullmatch(r'[a-zA-Z0-9.-]+', col) is not None)

In [35]:
def legitToNumber(isdga):
    """
    Map one isDGA label to a numeric flag

    Params:
        isdga (string): A single label value taken from the isDGA column

    Returns:
        int (int): 1 when the label is exactly 'legit', otherwise 0
    """
    return 1 if isdga == 'legit' else 0

In [36]:
def OH_Encoding(main_df, colname):
    """
    One-hot encode an object-dtype column and replace it with indicator columns

    Params:
        main_df (dataframe): The df having colname
        colname (string): The column to be encoded

    Returns:
        main_df (dataframe): A new dataframe in which colname is replaced by one
            indicator column per category, named by the encoder's
            get_feature_names_out (e.g. 'subclass_alexa' for colname 'subclass')

    Raises:
        TypeError: If colname is not of object dtype
    """
    if main_df[colname].dtype != 'object':
        # Raise (as the contract above states) instead of returning the message:
        # callers assign the result back to their dataframe variable, so a
        # returned string would only fail later with a confusing error.
        raise TypeError('Error, column is numeric type')

    OH_encoder = OneHotEncoder(sparse_output=False)
    encoded = OH_encoder.fit_transform(np.array(main_df[colname]).reshape(-1, 1))

    # Reuse main_df's index: join() aligns on index labels, so a default
    # RangeIndex here would misalign rows when main_df has been filtered or
    # re-indexed.
    encoded_features = pd.DataFrame(
        encoded,
        columns=OH_encoder.get_feature_names_out([colname]),
        index=main_df.index,
    )
    return main_df.drop(columns=[colname]).join(encoded_features)

In [37]:
def split_legit_dga(main_df):
    """
    Split the df into its legit rows and its non-legit (dga) rows

    Params:
        main_df (dataframe): The df to be split; must have an 'isDGA' column

    Returns:
        legit_df, dga_df (dataframe): Rows where isDGA == 'legit' and rows where
            it is anything else, each re-indexed from 0
    """
    is_legit = main_df['isDGA'] == 'legit'

    # reset_index(drop=True) discards the old index directly; the previous
    # reset_index().drop(columns='index') failed when the index had a name or
    # when the frame already had an 'index' column.
    legit_df = main_df[is_legit].reset_index(drop=True)
    dga_df = main_df[~is_legit].reset_index(drop=True)
    return legit_df, dga_df

In [38]:
def word_segment(df, col):
    """
    Add a 'word_segment' column holding the segmentation of each value in col

    Params:
        df (dataframe): The df having col
        col (string): The object type column whose text is passed to segment()

    Returns:
        df (dataframe): The same df, modified in place, with the new column
    """
    segmented = df[col].apply(segment)
    df['word_segment'] = segmented
    return df

In [39]:
def count_true_textblob(df, col):
    """
    Keep only the words that TextBlob's spell-corrector leaves unchanged

    Params:
        df (dataframe): The df having col
        col (string): The column whose values are lists of words

    Returns:
        df (dataframe): The same df, modified in place, with a new
            'validation_textblob' column holding, per row, the words for which
            TextBlob(word).correct() == word
    """
    # Verdict per distinct word, so a word that appears in many rows is only
    # run through the corrector once.
    verdicts = {}

    def is_correct(word):
        if word not in verdicts:
            verdicts[word] = TextBlob(word).correct() == word
        return verdicts[word]

    df['validation_textblob'] = df[col].apply(
        lambda tokens: [token for token in tokens if is_correct(token)]
    )

    return df

def count_true_nltk(df, col):
    """
    Keep only the words that appear in the english_words set

    Params:
        df (dataframe): The df having col
        col (string): The column whose values are lists of words

    Returns:
        df (dataframe): The same df, modified in place, with a new
            'validation_nltk' column holding the retained words per row
    """

    def keep_known(tokens):
        return [token for token in tokens if token in english_words]

    df['validation_nltk'] = df[col].apply(keep_known)
    return df

In [40]:
def lemmatizer_word(df, col):
    """
    Turn the words of each list in col into their base form, without duplicates

    Params:
        df (dataframe): The df having col
        col (string): The object type column having word list to be lemmatized

    Returns:
        df (dataframe): The same df, modified in place, with col replaced by
            the de-duplicated lemmatized lists
    """
    lemmatizer = WordNetLemmatizer()

    def lemm(lst_text):
        """
        Params:
            lst_text (list): the list of words in a col
        Returns:
            lst_lemm (list): the distinct lemmatized words, in first-seen order
        """
        # dict.fromkeys de-duplicates like the former set did, but keeps a
        # stable order; set iteration order for strings can change between runs.
        return list(dict.fromkeys(lemmatizer.lemmatize(word) for word in lst_text))

    df[col] = df[col].apply(lemm)
    return df

In [41]:
def finalize_text_segment(df, col1, col2):
    """
    Build 'text_segment_final' from whichever of two word-list columns is longer

    Params:
        df (dataframe): The df having col1 and col2
        col1 (string): First column of word lists
        col2 (string): Second column of word lists; also chosen when both
            lists have the same length

    Returns:
        df (dataframe): The same df, modified in place: 'text_segment_final'
            is added and col1 / col2 are dropped
    """

    def pick_longer(row):
        first, second = row[col1], row[col2]
        if len(first) <= len(second):
            return second
        return first

    longest = df.apply(pick_longer, axis=1)
    df['text_segment_final'] = longest
    df.drop(columns=[col1, col2], inplace=True)
    return df

In [42]:
def _add_text_features(df):
    """Run segmentation, both word validations, lemmatization and the final merge on one split."""
    df = word_segment(df, 'subdomain')
    df = count_true_textblob(df, 'word_segment')
    df = count_true_nltk(df, 'word_segment')
    df = lemmatizer_word(df, 'validation_textblob')
    df = lemmatizer_word(df, 'validation_nltk')
    return finalize_text_segment(df, 'validation_textblob', 'validation_nltk')


def process(csv_file_name='dga_data_small.csv'):
    """
    Load the dataset, build the numeric / one-hot features and the text features

    Params:
        csv_file_name (string): File name with .csv extension; defaults to the
            file that used to be hard-coded here

    Returns:
        legit_df, dga_df (dataframe): The legit and dga splits, each with the
            feature columns plus 'word_segment' and 'text_segment_final'
    """
    main_df = load_data(csv_file_name)

    main_df['ascii'] = main_df['subdomain'].apply(is_ascii_domain)
    main_df['subdomain_len'] = main_df['subdomain'].str.len()
    main_df['host_len'] = main_df['host'].str.len()

    # Vowel count
    main_df['subdomain_vowel_count'] = main_df['subdomain'].str.lower().str.count(r'[aeoiu]')

    # Consonant count
    # NOTE(review): computed as length minus vowels, so every non-vowel
    # character (digits included) is counted as a consonant - confirm intended.
    main_df['subdomain_consonant_count'] = main_df['subdomain_len'] - main_df['subdomain_vowel_count']

    # Has Numeric - boolean to int type
    main_df['has_num'] = main_df['subdomain'].str.contains(r'\d').astype(int)

    # To Number for isDGA col
    main_df['isDGA_N'] = main_df['isDGA'].apply(legitToNumber)
    main_df['digitCount'] = main_df['subdomain'].str.count(r'\d')

    # Check if subdomain starting with digit
    main_df['startW/Digit'] = main_df['subdomain'].str.match(r'^\d').astype(int)

    # OH Encoding of the categorical columns
    for categorical_col in ('subclass', 'tld', 'cctld'):
        main_df = OH_Encoding(main_df, categorical_col)

    legit_df, dga_df = split_legit_dga(main_df)

    # Same text pipeline for both splits (legit first, as before)
    legit_df = _add_text_features(legit_df)
    dga_df = _add_text_features(dga_df)

    return legit_df, dga_df


In [47]:
# Basic features: ASCII-only flag (1/0) plus character lengths of subdomain and host
main_df['ascii'] = main_df['subdomain'].apply(is_ascii_domain)
main_df['subdomain_len'] = main_df['subdomain'].str.len()
main_df['host_len'] = main_df['host'].str.len()

In [48]:
# Vowel count (subdomain is lower-cased first, so upper-case vowels count too)
main_df['subdomain_vowel_count'] = main_df['subdomain'].str.lower().str.count(r'[aeoiu]')

In [50]:
# Consonant count: length minus vowels, so every non-vowel character is included
main_df['subdomain_consonant_count'] = main_df['subdomain_len'] - main_df['subdomain_vowel_count']

# Has Numeric - boolean to int type
main_df['has_num'] = main_df['subdomain'].str.contains(r'\d').astype(int)

# To Number for isDGA col (1 for 'legit', 0 otherwise) and number of digits
main_df['isDGA_N'] = main_df['isDGA'].apply(legitToNumber)
main_df['digitCount'] = main_df['subdomain'].str.count(r'\d')

# Check if subdomain starting with digit
main_df['startW/Digit'] = main_df['subdomain'].str.match(r'^\d').astype(int)

In [51]:
    # One-hot encode the categorical columns; each call replaces the column
    # with its indicator columns and rebinds main_df to the result
    main_df = OH_Encoding(main_df, 'subclass')
    main_df = OH_Encoding(main_df, 'tld')
    main_df = OH_Encoding(main_df, 'cctld')

In [52]:
    # Split into rows labelled 'legit' and all remaining rows
    legit_df, dga_df = split_legit_dga(main_df)

In [54]:
    # Adds the 'word_segment' column to the legit split
    legit_df = word_segment(legit_df, 'subdomain')


In [56]:
    # Adds 'validation_textblob': segmented words that TextBlob's corrector leaves unchanged
    legit_df = count_true_textblob(legit_df, 'word_segment')

In [57]:
    # Adds 'validation_nltk': segmented words found in english_words
    legit_df = count_true_nltk(legit_df, 'word_segment')
    # Lemmatize both validated word lists
    legit_df = lemmatizer_word(legit_df, 'validation_textblob')
    legit_df = lemmatizer_word(legit_df, 'validation_nltk')
    # Keep the longer of the two lists as 'text_segment_final'; both inputs are dropped
    legit_df = finalize_text_segment(legit_df, 'validation_textblob', 'validation_nltk')

In [60]:
    # Adds the 'word_segment' column to the dga split
    dga_df = word_segment(dga_df, 'subdomain')

In [62]:
# Inspect the dga split after segmentation
dga_df.info()

Unnamed: 0,isDGA,subdomain,host,ascii,subdomain_len,host_len,subdomain_vowel_count,subdomain_consonant_count,has_num,isDGA_N,digitCount,startW/Digit,subclass_alexa,subclass_bamital,subclass_cryptolocker,subclass_gameoverdga,subclass_goz,subclass_legit,subclass_necurs,subclass_newgoz,subclass_nivdort,tld_ac,tld_am,tld_asia,tld_at,tld_az,tld_ba,tld_be,tld_bit,tld_biz,tld_blog.br,tld_blogspot.com,tld_bz,tld_ca,tld_cc,tld_ch,tld_cl,tld_cm,tld_cn,tld_co,tld_co.id,tld_co.il,tld_co.in,tld_co.jp,tld_co.kr,tld_co.uk,tld_com,tld_com.ar,tld_com.au,tld_com.br,tld_com.cn,tld_com.mx,tld_com.my,tld_com.tr,tld_com.tw,tld_cx,tld_cz,tld_de,tld_dk,tld_do,tld_edu,tld_edu.sa,tld_eu,tld_fi,tld_fm,tld_fr,tld_ga,tld_gob.ar,tld_gouv.fr,tld_gov,tld_gov.br,tld_gov.tw,tld_gr,tld_hk,tld_hr,tld_hu,tld_ie,tld_im,tld_in,tld_info,tld_io,tld_ir,tld_is,tld_it,tld_jp,tld_jus.br,tld_ki,tld_kz,tld_la,tld_lt,tld_lv,tld_me,tld_mn,tld_ms,tld_mu,tld_mx,tld_net,tld_net.cn,tld_nf,tld_nl,tld_no,tld_nu,tld_org,tld_org.br,tld_pe,tld_ph,tld_pl,tld_presse.fr,tld_pro,tld_pt,tld_ro,tld_ru,tld_sc,tld_se,tld_sh,tld_so,tld_su,tld_sx,tld_tk,tld_tn,tld_to,tld_tv,tld_tw,tld_ua,tld_ug,tld_us,tld_vn,tld_xxx,cctld_None,cctld_ar,cctld_au,cctld_br,cctld_cn,cctld_com,cctld_fr,cctld_id,cctld_il,cctld_in,cctld_jp,cctld_kr,cctld_mx,cctld_my,cctld_sa,cctld_tr,cctld_tw,cctld_uk,word_segment
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,1,13,19,1,12,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,[tyopcrkqgxcfm]
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,1,26,30,4,22,1,0,10,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[72j5rn1l9mzleo6203v1oge, nfl]"
2,dga,thenrest,thenrest.net,1,8,12,2,6,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[then, rest]"
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,1,26,30,5,21,1,0,7,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[15ihbm71utcnfa8dk1mmgoob, l9]"
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,1,24,28,6,18,1,0,9,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[x1d6ou7e7kofk60ayhq74x7e]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,dga,jlljsxwrfkys,jlljsxwrfkys.ru,1,12,15,0,12,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[jlljsxwrfkys]
996,dga,maudmjvij,maudmjvij.xxx,1,9,13,3,6,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[maud, mj, vij]"
997,dga,lllndsiljokku,lllndsiljokku.ru,1,13,16,3,10,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[lllndsiljokku]
998,dga,septemberfish,septemberfish.net,1,13,17,4,9,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[september, fish]"


In [67]:
# Verdict per distinct word, so repeated words are not re-run through the
# spell-corrector.
_is_correct_cache = {}


def is_correct(word):
    """Tell whether TextBlob's spell-corrector leaves word unchanged."""
    if word not in _is_correct_cache:
        _is_correct_cache[word] = TextBlob(word).correct() == word
    return _is_correct_cache[word]


dga_df['validation_textblob'] = dga_df['word_segment'].apply(
    lambda tokens: [token for token in tokens if is_correct(token)]
)

KeyboardInterrupt: 