## Toxicity SVC

In [1]:
#imports

#general imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#modeling imports
from eda_functions_1 import split_data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

#nlp imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
#reading in data for toxicity
df = pd.read_csv('../clean data/toxicity_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,rev_id,comment,toxicity
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [4]:
#renaming y column for custom function
df.rename(columns={'toxicity':'target'},inplace=True)

In [5]:
#baseline score
df.target.value_counts(normalize=True)

0    0.884373
1    0.115627
Name: target, dtype: float64

In [6]:
#splitting data with custom function (christian wrote in script)
X_train, X_test, y_train, y_test = split_data(df,0.5)

In [7]:
#checking to see that training data is balanced 50/50
y_train.value_counts(normalize=True)

1    0.5
0    0.5
Name: target, dtype: float64

In [8]:
#checking to see that test data is not balanced 50/50 and has retained original splits
y_test.value_counts(normalize=True)

0    0.885633
1    0.114367
Name: target, dtype: float64

In [9]:
#creating pipeline for SVC
pipe = Pipeline([
    ('cvec',CountVectorizer()),
    ('svc',SVC())
])

In [10]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('cvec', CountVectorizer()), ('svc', SVC())])

In [11]:
pipe.score(X_train,y_train)

0.8290389972144847

In [12]:
pipe.score(X_test,y_test)

0.7913380872483221

In [13]:
#creating first param dict and grid search
params = {
    'cvec__max_features':[2000,3000,4000],
    'cvec__ngram_range':[(1,1),(1,2),(2,2)],
    'cvec__stop_words':[stopwords.words('english')]
}
gs_1 = GridSearchCV(pipe,param_grid=params,cv=5)
gs_1.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('svc', SVC())]),
             param_grid={'cvec__max_features': [2000, 3000, 4000],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's", 'her',
                                               'hers', 'herself', 'it', "it's",
                                               'its', 'itself', ...]]})

In [14]:
#printing best score from first test
gs_1.best_score_

0.8336424362100778

In [15]:
#printing best params of first test
gs_1.best_params_

{'cvec__max_features': 4000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in

In [16]:
#creating second param dict for next iteration of test for grid search
params_2 = {
    'cvec__max_features':[4000,5000,6000],
    'cvec__stop_words':[stopwords.words('english')],
    'cvec__min_df':[2,3]
}
gs_2 = GridSearchCV(pipe,param_grid=params_2,cv=3)
gs_2.fit(X_train,y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('svc', SVC())]),
             param_grid={'cvec__max_features': [4000, 5000, 6000],
                         'cvec__min_df': [2, 3],
                         'cvec__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's", 'her',
                                               'hers', 'herself', 'it', "it's",
                                               'its', 'itself', ...]]})

In [17]:
#best score for second test
gs_2.best_score_

0.8293999691500847

In [18]:
#best params for second test
gs_2.best_params_

{'cvec__max_features': 6000,
 'cvec__min_df': 3,
 'cvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out'