# Predict Attack Type using SVM

In [3]:

import pandas as pd
import re

encoding = ['latin1', 'iso8859-1', 'utf-8'][1]
attack_regex = re.compile(r"attack")


### Mike's half-way-done tf-idf analysis

In [None]:

def get_max_column(df):
    max_length = 0
    max_column = None
    for column in df.columns:
        column_length = df[column].astype('str').str.len().mean()
        if column_length > max_length:
            max_length = column_length
            max_column = column
            
    return(max_column, max_length)

In [None]:

word_analysis_path = "../data/csv/Word Analysis"
csv_path =  word_analysis_path + '.csv'
word_analysis_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
word_analysis_df.columns

In [10]:

attack_types_path = "../data/csv/AttackTypes"
csv_path =  attack_types_path + '.csv'
attack_types_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
attack_types_df

Unnamed: 0,Attack Type Id,Attack Type
0,1,Assassination
1,2,Armed Assault
2,3,Bombing/Explosion
3,4,Hijacking
4,5,Hostage Taking (Barricade Incident)
5,6,Hostage Taking (Kidnapping)
6,7,Facility/Infrastructure Attack
7,8,Unarmed Assault
8,9,Unknown


In [10]:

attack_types_df['Attack Type'].tolist().index("Unknown")

8


### Andrew's data

In [3]:

ucdp_path = "../data/csv/UCDP"
csv_path =  ucdp_path + '.csv'
ucdp_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
ucdp_df.columns

Index(['id', 'relid', 'year', 'active_year', 'type_of_violence',
       'conflict_dset_id', 'conflict_new_id', 'conflict_name', 'dyad_dset_id',
       'dyad_new_id', 'dyad_name', 'side_a_dset_id', 'side_a_new_id', 'side_a',
       'side_b_dset_id', 'side_b_new_id', 'side_b', 'number_of_sources',
       'source_article', 'source_office', 'source_date', 'source_headline',
       'source_original', 'where_prec', 'where_coordinates',
       'where_description', 'adm_1', 'adm_2', 'latitude', 'longitude',
       'geom_wkt', 'priogrid_gid', 'country', 'region', 'event_clarity',
       'date_prec', 'date_start', 'date_end', 'deaths_a', 'deaths_b',
       'deaths_civilians', 'deaths_unknown', 'best_est', 'high_est', 'low_est',
       'isocc', 'gwno', 'gwab'],
      dtype='object')

In [None]:

get_max_column(ucdp_df)

In [4]:

scad_path = "../data/csv/SCAD"
csv_path =  scad_path + '.csv'
scad_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
scad_df.columns

Index(['eventid', 'id', 'ccode', 'countryname', 'startdate', 'enddate',
       'duration', 'stday', 'stmo', 'styr', 'eday', 'emo', 'eyr', 'etype',
       'escalation', 'actor1', 'actor2', 'actor3', 'target1', 'target2',
       'cgovtarget', 'rgovtarget', 'npart', 'ndeath', 'repress', 'elocal',
       'ilocal', 'sublocal', 'locnum', 'gislocnum', 'issue1', 'issue2',
       'issue3', 'issuenote', 'nsource', 'notes', 'coder', 'acd_questionable',
       'latitude', 'longitude', 'geo_comments', 'location_precision'],
      dtype='object')

In [None]:

get_max_column(scad_df)

In [5]:

rand_path = "../data/csv/RAND"
csv_path =  rand_path + '.csv'
rand_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
rand_df.columns

Index(['startdate', 'city', 'country', 'perpetrator', 'weapon', 'injuries',
       'fatalities', 'description'],
      dtype='object')

In [None]:

get_max_column(rand_df)

In [6]:

acled_path = "../data/csv/ACLED"
csv_path =  acled_path + '.csv'
acled_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
acled_df.columns

Index(['GWNO', 'EVENT_ID_CNTY', 'EVENT_ID_NO_CNTY', 'EVENT_DATE', 'YEAR',
       'TIME_PRECISION', 'EVENT_TYPE', 'ACTOR1', 'ALLY_ACTOR_1', 'INTER1',
       'ACTOR1_ID', 'ACTOR2', 'ALLY_ACTOR_2', 'INTER2', 'ACTOR2_ID',
       'INTERACTION', 'ACTOR_DYAD_ID', 'COUNTRY', 'ADMIN1', 'ADMIN2', 'ADMIN3',
       'LOCATION', 'LATITUDE', 'LONGITUDE', 'GEO_PRECISION', 'SOURCE', 'NOTES',
       'FATALITIES'],
      dtype='object')

In [None]:

get_max_column(acled_df)


### Andrew's training data

In [4]:

gtdb_path = "../data/csv/GTDB"
csv_path =  gtdb_path + '.csv'
gtdb_df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
attack_column_list = [column for column in gtdb_df.columns for m in [attack_regex.search(column)] if m]
gtdb_df[attack_column_list]['attacktype1_txt'].unique()

array(['Assassination', 'Hostage Taking (Kidnapping)', 'Bombing/Explosion',
       'Facility/Infrastructure Attack', 'Armed Assault', 'Hijacking',
       'Unknown', 'Unarmed Assault', 'Hostage Taking (Barricade Incident)'], dtype=object)

In [None]:

get_max_column(gtdb_df)

In [None]:

gtdb_df.columns

In [8]:

attack_column_list

['attacktype1',
 'attacktype1_txt',
 'attacktype2',
 'attacktype2_txt',
 'attacktype3',
 'attacktype3_txt']


### Get the training data into a 20newsgroups-style bunch

In [6]:

def is_nan(value):
    try:
        float(value)
        return False
    except ValueError:
        return True

def is_ns(value):
    return len(value) > 3

def concat_independendent_variables(df):
    X = pd.Series([])
    for row_index, row_series in df.iterrows():
        row_concat = row_series.astype('str').str.cat(sep=' ')
        row_concat = sq_regex.sub(r'', row_concat)
        row_concat = nonalpha_regex.sub(r' ', row_concat)
        X = X.append(pd.Series([row_concat]), ignore_index=True)
    
    return X

nonalpha_regex = re.compile(r"[^a-zA-Z]+")
sq_regex = re.compile(r"'")

In [7]:

gtdb_df.fillna(value="", inplace=True)

important_columns = [column for column in gtdb_df.columns if (column not in attack_column_list)]
X = concat_independendent_variables(gtdb_df[important_columns])
y = gtdb_df['attacktype1'].map(lambda x: int(x)-1)

In [9]:

X = pd.Series([])
y = pd.Series([])
for csv_file in ['acled', 'rand', 'scad', 'ucdp', 'GTDB']:
    if csv_file == "GTDB":
        gtdb_path = "../data/csv/GTDB"
        csv_path =  gtdb_path + '.csv'
    else:
        mike_path = "../data/csv/mike_"
        csv_path =  mike_path + csv_file + '.csv'
    df = pd.read_csv(csv_path, encoding=encoding, low_memory=False)
    df.fillna(value="", inplace=True)
    if csv_file == "GTDB":
        important_columns = [column for column in df.columns if (column not in attack_column_list)]
    else:
        important_columns = df.columns.tolist()[:-1]
    X = X.append(concat_independendent_variables(df[important_columns]), ignore_index=True)
    if csv_file == "GTDB":
        y = y.append(df['attacktype1'].map(lambda x: int(x)-1), ignore_index=True)
    else:
        y = y.append(df[df.columns.tolist()[-1]].map(lambda x: attack_types_df['Attack Type'].tolist().index(x)), 
                     ignore_index=True)

In [11]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=0)

In [12]:

from os import listdir
from os.path import isfile, join
import numpy as np

class Bunch(dict):
    """Container object for datasets: dictionary-like object that
       exposes its keys as attributes."""

    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        self.__dict__ = self

csv_path = "../data/csv/"
csv_files = [join(csv_path, f) for f in listdir(csv_path) if isfile(join(csv_path, f))]
gtdb_train = Bunch(filenames=np.asarray(csv_files),
                   target_names=attack_types_df['Attack Type'].tolist(),
                   DESCR=None,
                   target=np.asarray(y_train.tolist()),
                   data=X_train.tolist(),
                   description="The GTDB dataset concatoned into one column (minus the target columns)")
gtdb_test = Bunch(filenames=np.asarray(csv_files),
                   target_names=attack_types_df['Attack Type'].tolist(),
                   DESCR=None,
                   target=np.asarray(y_test.tolist()),
                   data=X_test.tolist(),
                   description="The GTDB dataset concatoned into one column (minus the target columns)")
gtdb_all = Bunch(filenames=np.asarray(csv_files),
                   target_names=attack_types_df['Attack Type'].tolist(),
                   DESCR=None,
                   target=np.asarray(y.tolist()),
                   data=X.tolist(),
                   description="The GTDB dataset concatoned into one column (minus the target columns)")

In [13]:

for key in gtdb_train.keys():
    print(key)

description
DESCR
target
target_names
filenames
data


In [14]:

gtdb_train.filenames

array(['../data/csv/ACLED.csv', '../data/csv/acled_df.csv',
       '../data/csv/AttackTypes.csv', '../data/csv/GTDB.csv',
       '../data/csv/gtdb_df.csv', '../data/csv/mike_acled.csv',
       '../data/csv/mike_rand.csv', '../data/csv/mike_scad.csv',
       '../data/csv/mike_ucdp.csv', '../data/csv/RAND.csv',
       '../data/csv/rand_df.csv', '../data/csv/SCAD.csv',
       '../data/csv/scad_df.csv', '../data/csv/UCDP.csv',
       '../data/csv/ucdp_df.csv', '../data/csv/Word Analysis.csv'], 
      dtype='<U29')

In [15]:

gtdb_train.target_names

['Assassination',
 'Armed Assault',
 'Bombing/Explosion',
 'Hijacking',
 'Hostage Taking (Barricade Incident)',
 'Hostage Taking (Kidnapping)',
 'Facility/Infrastructure Attack',
 'Unarmed Assault',
 'Unknown']

In [16]:

gtdb_train.DESCR

In [17]:

gtdb_train.target

array([8, 1, 6, ..., 1, 2, 0])

In [18]:

len(gtdb_all.data)

157465

In [19]:

gtdb_train.description

'The GTDB dataset concatoned into one column (minus the target columns)'

In [20]:

len(gtdb_train.filenames)

16

In [21]:

print("\n".join(gtdb_train.data[0].split("\n")[:3]))

 Peru South America Ayacucho Ayacucho district Police Police Building headquarters station school Police post Peru Shining Path SL Unknown Attacked Unknown PGIS 


In [22]:

print(gtdb_train.target_names[gtdb_train.target[0]])

Unknown


In [23]:

gtdb_train.target[:10]

array([8, 1, 6, 2, 2, 8, 2, 1, 0, 2])

In [24]:

for t in gtdb_train.target[:10]:
    print(gtdb_train.target_names[t])

Unknown
Armed Assault
Facility/Infrastructure Attack
Bombing/Explosion
Bombing/Explosion
Unknown
Bombing/Explosion
Armed Assault
Assassination
Bombing/Explosion



### Vectorize the words

In [25]:

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(gtdb_train.data)
X_train_counts.shape

(105501, 97921)

In [26]:

count_vect.vocabulary_.get(u'hegarty')


### TF-IDF the words

In [27]:

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(105501, 97921)

In [28]:

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(105501, 97921)


### Try a Naive Bayes model

It is a classification technique based on Bayes' Theorem with an assumption of independence among predictors. In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.

In [29]:

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, gtdb_train.target)

In [30]:

gtdb_test.target[:2]

array([6, 1])

In [31]:

docs_new = gtdb_test.data[:2]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category, actual in zip(docs_new, predicted, gtdb_test.target[:2]):
    print('%r => %s (%s)' % (doc, gtdb_train.target_names[category], gtdb_test.target_names[actual]))

' India South Asia Jharkhand Siladon Assailants set construction equipment on fire in Siladon area Jharkhand state India There were no reported casualties however construction equipment was damaged in the attack No group claimed responsibility for the incident however sources attributed the attack to the Peoples Liberation Front of India Business Construction Unknown Construction Equipment India Peoples Liberation Front of India The specific motive is unknown however sources posited that the attack was part of a bandh by the Peoples Liberation Front of India in demonstration against the death of two civilians Incendiary Arson Fire Minor likely million Three construction machines and a vehicle were damaged in this attack LWE outfit torch four vehicles in Jharkhands Khunti Hindustan Times September PLFI militants torch five machines and a car ahead of bandh in Jharkhand ZeeNews com September START Primary Collection ' => Bombing/Explosion (Facility/Infrastructure Attack)
' Sri Lanka Sout


### Build a Naive Bayes pipeline and get the accuracy

In [32]:

from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [33]:

text_clf = text_clf.fit(gtdb_train.data, gtdb_train.target)

In [34]:

import numpy as np

docs_test = gtdb_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == gtdb_test.target)

0.66205449926872451


### Build an SVM pipeline and get various metrics for it

In [35]:

from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])
_ = text_clf.fit(gtdb_train.data, gtdb_train.target)
predicted = text_clf.predict(gtdb_test.data)
np.mean(predicted == gtdb_test.target)

0.80954122084520053

In [36]:

score = text_clf.score(gtdb_train.data, gtdb_train.target)
score

0.80681699699528919

In [37]:

text_clf.decision_function(gtdb_train.data)[0]

array([-1.10413998, -0.71634984, -1.15611   , -1.01508382, -1.00840007,
       -1.12391413, -1.07060169, -1.02702272, -0.90468708])

In [38]:

from sklearn import metrics
print(metrics.classification_report(gtdb_test.target, predicted, target_names=gtdb_test.target_names))

                                     precision    recall  f1-score   support

                      Assassination       0.72      0.38      0.50      5798
                      Armed Assault       0.70      0.86      0.78     12345
                  Bombing/Explosion       0.87      0.99      0.93     25217
                          Hijacking       0.00      0.00      0.00       187
Hostage Taking (Barricade Incident)       0.00      0.00      0.00       273
        Hostage Taking (Kidnapping)       0.89      0.67      0.76      3035
     Facility/Infrastructure Attack       0.78      0.68      0.73      2905
                    Unarmed Assault       0.88      0.12      0.21       370
                            Unknown       0.70      0.03      0.05      1834

                        avg / total       0.80      0.81      0.78     51964



  'precision', 'predicted', average, warn_for)


In [39]:

metrics.confusion_matrix(gtdb_test.target, predicted)

array([[ 2232,  2368,  1080,     0,     0,    60,    47,     0,    11],
       [  411, 10663,   789,     0,     0,    64,   415,     1,     2],
       [   10,    89, 25085,     0,     0,     3,    29,     0,     1],
       [   10,    51,    83,     0,     0,    34,     9,     0,     0],
       [   23,   112,    87,     0,     0,    36,    11,     0,     4],
       [  242,   499,   254,     0,     0,  2019,    19,     0,     2],
       [    6,   375,   535,     0,     0,     8,  1976,     4,     1],
       [   25,    74,   192,     0,     0,    29,     7,    43,     0],
       [  149,   923,   676,     0,     0,    25,    11,     1,    49]])


### Tweek the SVM classifier using grid search

In [40]:

from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [41]:

from sklearn.model_selection import GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(penalty='l2', n_iter=5, random_state=42)),
])
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
              'clf__loss': ('log', 'modified_huber'),
}

In [42]:

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [43]:

import time

t0 = time.time()
gs_clf = gs_clf.fit(gtdb_train.data, gtdb_train.target)
t1 = time.time()
print(t1-t0, time.ctime(t1))

437.9772319793701 Tue Jun 27 15:30:13 2017


In [44]:

gtdb_train.target_names[gs_clf.predict([gtdb_test.data[1]])[0]]

'Armed Assault'

In [45]:

gtdb_train.target_names[gtdb_test.target[0]]

'Facility/Infrastructure Attack'

In [46]:

gs_clf.best_score_

0.85632363674277967

In [47]:

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
clf__loss: 'modified_huber'
tfidf__use_idf: False
vect__ngram_range: (1, 2)


In [48]:

predicted = gs_clf.predict(gtdb_test.data)
np.mean(predicted == gtdb_test.target)

0.85836348241089988

In [49]:

t0 = time.time()
probabilities = gs_clf.predict_proba(gtdb_train.data)
t1 = time.time()
print(t1-t0, time.ctime(t1))
probabilities[0]

23.289114475250244 Tue Jun 27 15:31:44 2017


array([ 0.        ,  0.24778289,  0.02127555,  0.        ,  0.        ,
        0.        ,  0.0912228 ,  0.        ,  0.63971876])

In [50]:

probabilities[:2]

array([[ 0.        ,  0.24778289,  0.02127555,  0.        ,  0.        ,
         0.        ,  0.0912228 ,  0.        ,  0.63971876],
       [ 0.16890167,  0.6911619 ,  0.00847529,  0.        ,  0.        ,
         0.03993636,  0.        ,  0.        ,  0.09152478]])

In [51]:

gtdb_train.target[:2]

array([8, 1])

In [56]:

gs_clf.predict(gtdb_train.data)[:2]

array([1, 1])

In [57]:

"{0:.1f}%".format(probabilities[1][gs_clf.predict(gtdb_train.data)[1]]*100)

'97.9%'

In [58]:

import pandas as pd

pd.DataFrame(gs_clf.cv_results_).head(4).T

Unnamed: 0,0,1,2,3
mean_fit_time,13.5608,48.6585,15.0822,45.7717
mean_score_time,5.517,17.1817,5.746,15.907
mean_test_score,0.538951,0.519046,0.578372,0.57955
mean_train_score,0.537147,0.514392,0.578505,0.579046
param_clf__alpha,0.01,0.01,0.01,0.01
param_clf__loss,log,log,log,log
param_tfidf__use_idf,True,True,False,False
param_vect__ngram_range,"(1, 1)","(1, 2)","(1, 1)","(1, 2)"
params,"{'vect__ngram_range': (1, 1), 'clf__loss': 'lo...","{'vect__ngram_range': (1, 2), 'tfidf__use_idf'...","{'vect__ngram_range': (1, 1), 'clf__loss': 'lo...","{'vect__ngram_range': (1, 2), 'clf__loss': 'lo..."
rank_test_score,15,16,14,13



### Make a pipeline with all the data

In [59]:

import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

t0 = time.time()

parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3, 1e-4),
              'clf__loss': ('log', 'modified_huber'),
}
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(penalty='l2', n_iter=5, random_state=42)),
])
gs_all_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_all_clf = gs_all_clf.fit(gtdb_all.data, gtdb_all.target)

t1 = time.time()
print(t1-t0, time.ctime(t1))

gs_all_clf.best_score_

2050.036012172699 Mon Jun 26 16:57:54 2017


0.74939394132206694


### Add a "predicted" column to all the datasets and save them as CSVs

In [60]:

t0 = time.time()

for df in [acled_df, rand_df, scad_df, ucdp_df, gtdb_df]:
    data = concat_independendent_variables(df).tolist()
    df['predicted_id'] = gs_all_clf.predict(data)
    df['predicted_type'] = df['predicted_id'].map(lambda x: gtdb_all.target_names[x])
    df['probabilities'] = pd.Series(list(gs_all_clf.predict_proba(data)))
    df['probability'] = df.apply(lambda row: "{0:.1f}%".format(row['probabilities'][row['predicted_id']]*100), axis=1)
    df.drop(['predicted_id','probabilities'], axis=1, inplace=True)

t1 = time.time()
print(t1-t0, time.ctime(t1))

1258.0249540805817 Mon Jun 26 17:20:43 2017


In [61]:

csv_folder = "../data/csv/"
gtdb_df.to_csv(csv_folder+"gtdb_df.csv", sep=',', encoding=encoding, index=False)
acled_df.to_csv(csv_folder+"acled_df.csv", sep=',', encoding=encoding, index=False)
rand_df.to_csv(csv_folder+"rand_df.csv", sep=',', encoding=encoding, index=False)
scad_df.to_csv(csv_folder+"scad_df.csv", sep=',', encoding=encoding, index=False)
ucdp_df.to_csv(csv_folder+"ucdp_df.csv", sep=',', encoding=encoding, index=False)


### Play around with TPOT

In [1]:

from tpot import TPOTRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

housing = load_boston()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('../py/tpot_boston_pipeline.py')





Generation 1 - Current best internal CV score: 11.119472668264883




Generation 2 - Current best internal CV score: 11.119472668264883




Generation 3 - Current best internal CV score: 11.119472668264883




Generation 4 - Current best internal CV score: 11.119472668264883




Generation 5 - Current best internal CV score: 11.119472668264883





Best pipeline: ElasticNetCV(GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.95, GradientBoostingRegressor__learning_rate=DEFAULT, GradientBoostingRegressor__loss=ls, GradientBoostingRegressor__max_depth=7, GradientBoostingRegressor__max_features=0.25, GradientBoostingRegressor__min_samples_leaf=4, GradientBoostingRegressor__min_samples_split=2, GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.55), ElasticNetCV__l1_ratio=0.25, ElasticNetCV__tol=0.01)
14.0213084208



### The output looks like this:

In [None]:

import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = train_test_split(features, tpot_data['class'], 
                                                                                        random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.95, loss="ls", max_depth=7, max_features=0.25, 
                                                          min_samples_leaf=4, min_samples_split=2, n_estimators=100, 
                                                          subsample=0.55)),
    ElasticNetCV(l1_ratio=0.25, tol=0.01)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

In [17]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.75, random_state=0)

In [18]:

from os import listdir
from os.path import isfile, join
import numpy as np

class Bunch(dict):
    """Container object for datasets: dictionary-like object that
       exposes its keys as attributes."""

    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        self.__dict__ = self

csv_path = "../data/csv/"
csv_files = [join(csv_path, f) for f in listdir(csv_path) if isfile(join(csv_path, f))]
gtdb_train = Bunch(filenames=np.asarray(csv_files),
                   target_names=attack_types_df['Attack Type'].tolist(),
                   DESCR=None,
                   target=np.asarray(y_train.tolist()),
                   data=X_train.tolist(),
                   description="The GTDB dataset concatoned into one column (minus the target columns)")
gtdb_test = Bunch(filenames=np.asarray(csv_files),
                   target_names=attack_types_df['Attack Type'].tolist(),
                   DESCR=None,
                   target=np.asarray(y_test.tolist()),
                   data=X_test.tolist(),
                   description="The GTDB dataset concatoned into one column (minus the target columns)")
gtdb_all = Bunch(filenames=np.asarray(csv_files),
                   target_names=attack_types_df['Attack Type'].tolist(),
                   DESCR=None,
                   target=np.asarray(y.tolist()),
                   data=X.tolist(),
                   description="The GTDB dataset concatoned into one column (minus the target columns)")

In [19]:

from tpot import TPOTClassifier
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(gtdb_train.data)

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [20]:

# (105501, 97921) is too big
# (53302, 65931) is too big
# (39193, 55340)
X_train_tf.shape

(39193, 55340)

In [21]:

X_train_tf.toarray()

MemoryError: 

In [16]:

tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train_tf.toarray(), y_train)
np.array([np.nan, 0], dtype=np.float64)

count_vect = CountVectorizer()
X_test_counts = count_vect.fit_transform(gtdb_test.data)

tf_transformer = TfidfTransformer(use_idf=False).fit(X_test_counts)
X_test_tf = tf_transformer.transform(X_test_counts)
print(tpot.score(X_test_tf, y_test))

tpot.export('../py/tpot_attack_pipeline.py')



MemoryError: 