## Attack SVC

In [1]:
#imports

#general imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#modeling imports
from eda_functions_1 import split_data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

#nlp imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
#reading in the cleaned personal-attack comments data (path is relative to this notebook)
df = pd.read_csv('../clean data/attack_clean.csv')

In [3]:
#previewing the first rows to sanity-check the load
df.head()

Unnamed: 0,rev_id,comment,target
0,37675,This is not creative Those are the dictionar...,0
1,44816,the term standard model is itself less NPOV...,0
2,49851,True or false the situation as of March 2002...,0
3,89320,Next maybe you could work on being less conde...,0
4,93890,This page will need disambiguation,0


In [4]:
#baseline: share of each class in the full dataset
#(the majority-class share is the accuracy a trivial "always predict majority" model would get)
df['target'].value_counts(normalize=True)

0    0.865338
1    0.134662
Name: target, dtype: float64

In [5]:
#splitting data with custom function (christian wrote in script)
#split_data comes from the project module eda_functions_1 and returns train/test features and labels
#NOTE(review): presumably 0.5 is the desired class balance of the training set - confirm in eda_functions_1
X_train, X_test, y_train, y_test = split_data(df,0.5)

In [6]:
#checking to see that training data is balanced 50/50
#(normalize=True reports class proportions instead of raw counts)
y_train.value_counts(normalize=True)

1    0.5
0    0.5
Name: target, dtype: float64

In [7]:
#checking to see that test data is not balanced 50/50 and has retained original splits
#(proportions should be close to the baseline class shares of the full dataset)
y_test.value_counts(normalize=True)

0    0.862438
1    0.137562
Name: target, dtype: float64

In [8]:
#building the SVC pipeline: raw text -> token counts -> support vector classifier
#step names ('cvec', 'svc') are the prefixes used by the grid-search parameter keys
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),  #bag-of-words features, defaults tuned later via grid search
    ('svc', SVC()),               #classifier left at its default settings
])

In [8]:
#first grid search: vocabulary size x n-gram range, always removing NLTK english stop words
#3 x 3 x 1 = 9 candidate settings, each scored with 5-fold cross-validation
#NOTE(review): the NLTK list contains contractions such as "you're"; CountVectorizer's default
#tokenization may not produce those tokens, so some stop words may never match - confirm
params = dict(
    cvec__max_features=[2000, 3000, 4000],
    cvec__ngram_range=[(1, 1), (1, 2), (2, 2)],
    cvec__stop_words=[stopwords.words('english')],
)
gs_1 = GridSearchCV(estimator=pipe, param_grid=params, cv=5)
gs_1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('svc', SVC())]),
             param_grid={'cvec__max_features': [2000, 3000, 4000],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's", 'her',
                                               'hers', 'herself', 'it', "it's",
                                               'its', 'itself', ...]]})

In [9]:
#printing best score from first test
#(best_score_ is the mean cross-validated score of the best parameter combination, on training folds only)
gs_1.best_score_

0.7990749306197964

In [10]:
#printing best params of first test
#(the full stop-word list is echoed back because it was passed as a grid value)
gs_1.best_params_

{'cvec__max_features': 4000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in

In [11]:
#second grid search: larger vocabularies plus a minimum document frequency cut-off
#ngram_range is not searched here, so CountVectorizer keeps its default
#3 x 1 x 2 = 6 candidate settings, each scored with 3-fold cross-validation
params_2 = dict(
    cvec__max_features=[4000, 5000, 6000],
    cvec__stop_words=[stopwords.words('english')],
    cvec__min_df=[2, 3],
)
gs_2 = GridSearchCV(estimator=pipe, param_grid=params_2, cv=3)
gs_2.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('svc', SVC())]),
             param_grid={'cvec__max_features': [4000, 5000, 6000],
                         'cvec__min_df': [2, 3],
                         'cvec__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's", 'her',
                                               'hers', 'herself', 'it', "it's",
                                               'its', 'itself', ...]]})

In [12]:
#best score for second test
#(mean cross-validated score of the best combination; uses 3 folds, unlike the 5 folds of the first test)
gs_2.best_score_

0.8055504881915785

In [13]:
#best params for second test
#(the full stop-word list is echoed back because it was passed as a grid value)
gs_2.best_params_

{'cvec__max_features': 6000,
 'cvec__min_df': 2,
 'cvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out'

In [9]:
#third grid search: even larger vocabularies, stricter min_df, pure n-grams of length 1-4,
#and word-level vs character-level (within word boundaries) analysis
#3 x 1 x 2 x 4 x 2 = 48 candidate settings, each scored with 3-fold cross-validation
#note: per the scikit-learn docs, stop_words only applies when analyzer == 'word',
#so the stop-word list has no effect on the 'char_wb' candidates
params_3 = dict(
    cvec__max_features=[7000, 9000, 11000],
    cvec__stop_words=[stopwords.words('english')],
    cvec__min_df=[4, 5],
    cvec__ngram_range=[(1, 1), (2, 2), (3, 3), (4, 4)],
    cvec__analyzer=['word', 'char_wb'],
)
gs_3 = GridSearchCV(estimator=pipe, param_grid=params_3, cv=3)
gs_3.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('svc', SVC())]),
             param_grid={'cvec__analyzer': ['word', 'char_wb'],
                         'cvec__max_features': [7000, 9000, 11000],
                         'cvec__min_df': [4, 5],
                         'cvec__ngram_range': [(1, 1), (2, 2), (3, 3), (4, 4)],
                         'cvec__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's", 'her',
                                               'hers', 'hersel

In [10]:
#best score for third test
#(mean cross-validated score of the best combination over 3 folds)
gs_3.best_score_

0.7974606835728237

In [11]:
#best params for third test
#(the full stop-word list is echoed back because it was passed as a grid value)
gs_3.best_params_

{'cvec__analyzer': 'word',
 'cvec__max_features': 7000,
 'cvec__min_df': 5,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'be