In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import LinearSVC, NuSVC, SVC

In [2]:
from google.colab import files

In [3]:
file = files.upload()
df = pd.read_csv("train.csv")

Saving train.csv to train.csv


In [4]:
# Downcast the two integer columns in one pass, then replace every missing
# value with an empty string.
df = df.astype({"id": np.uint16, "target": np.uint8}).fillna("")

In [5]:
# Number of rows for every (keyword, target) pair, as a Series with a two-level index.
keywords_target = df.groupby(["keyword", "target"])["id"].count()

In [6]:
def keyword_mean(x, counts=None):
  """Return the share of target-0 rows among the rows that carry keyword ``x``.

  Despite the name, the value is ``count(target == 0) / count(all)`` for the
  keyword, not the mean of the target.

  Parameters
  ----------
  x : keyword to look up.
  counts : optional Series of row counts indexed by (keyword, target).
      Defaults to the module-level ``keywords_target`` table.

  Returns
  -------
  float
      0.5 when the keyword is absent from the table, otherwise the share of
      target-0 rows (1.0 if only target 0 was seen, 0.0 if only target 1).

  NOTE(review): when the table is built from the same rows that are being
  scored, this feature encodes each row's own label - confirm that is intended.
  """
  if counts is None:
    counts = keywords_target
  try:
    targets = counts[x]
  except KeyError:
    # Keyword never seen: fall back to a neutral value.
    return 0.5
  # Explicit label lookup; a missing target-0 entry means zero such rows.
  class0 = targets.loc[0] if 0 in targets.index else 0
  return float(class0) / float(targets.sum())

In [7]:
df["keyword_mean"] = df["keyword"].transform(keyword_mean)

In [8]:
# 1 when the keyword (case-insensitive) is one of the whitespace-separated
# tokens of the tweet text, 0 otherwise.
df["keyword_en_tweet"] = [
    int(palabra.lower() in tweet.lower().split())
    for palabra, tweet in zip(df["keyword"], df["text"])
]

In [38]:
# Number of rows for every (location, target) pair, as a Series with a two-level index.
locations_target = df.groupby(["location", "target"])["id"].count()

In [39]:
def location_mean(x, counts=None):
  """Return the share of target-0 rows among the rows that carry location ``x``.

  Despite the name, the value is ``count(target == 0) / count(all)`` for the
  location, not the mean of the target.

  Parameters
  ----------
  x : location to look up.
  counts : optional Series of row counts indexed by (location, target).
      Defaults to the module-level ``locations_target`` table.

  Returns
  -------
  float
      0.5 when the location is absent from the table, otherwise the share of
      target-0 rows (1.0 if only target 0 was seen, 0.0 if only target 1).

  NOTE(review): when the table is built from the same rows that are being
  scored, this feature encodes each row's own label - confirm that is intended.
  """
  if counts is None:
    counts = locations_target
  try:
    targets = counts[x]
  except KeyError:
    # Location never seen: fall back to a neutral value.
    return 0.5
  # Explicit label lookup; a missing target-0 entry means zero such rows.
  class0 = targets.loc[0] if 0 in targets.index else 0
  return float(class0) / float(targets.sum())

In [40]:
df["location_mean"] = df["location"].transform(location_mean)

In [9]:
# Tweet length in characters, divided by the longest tweet in the frame.
long_max = df["text"].str.len().max()
df["long"] = df["text"].str.len() / long_max

In [10]:
# Number of whitespace-separated terms per tweet, divided by the largest count.
nro_term_max = df["text"].str.split().str.len().max()
df["nro_term"] = df["text"].str.split().str.len() / nro_term_max

In [41]:
def hay_url(x):
    """Return 1 if ``x`` contains an ``http://`` or ``https://`` URL, else 0."""
    # Raw string so that `\S` reaches the regex engine untouched and Python
    # does not warn about an invalid escape sequence.
    return 1 if re.search(r'https?://\S*', x) is not None else 0
    
df["hay_url"] = df["text"].transform(hay_url)

In [42]:
def hay_nros(x):
    """Return 1 if any whitespace-separated token of ``x`` is a number, else 0.

    Commas are stripped from each token before parsing, so "1,200" counts as
    a number. A token must contain at least one digit: ``float`` on its own
    also accepts the words "nan", "inf" and "infinity", which are not numbers
    in a tweet.
    """
    for token in x.split():
        token = token.replace(',', '')
        if not any(ch.isdigit() for ch in token):
            continue
        try:
            float(token)
        except ValueError:
            continue
        return 1
    return 0

df["hay_nros"] = df["text"].transform(hay_nros)

In [43]:
def hay_mencion(x):
    """Return 1 when some whitespace-separated token of ``x`` starts with '@', else 0."""
    return int(any(palabra.startswith('@') for palabra in x.split()))

df["hay_mencion"] = df["text"].transform(hay_mencion)

In [44]:
def hay_hashtag(x):
    """Return 1 when some whitespace-separated token of ``x`` starts with '#', else 0."""
    etiquetas = [palabra for palabra in x.split() if palabra[:1] == '#']
    return 1 if etiquetas else 0

df["hay_hashtag"] = df["text"].transform(hay_hashtag)

In [45]:
# Clean the text of a message.
def clean_text(text):
    """Normalise a tweet before tokenisation.

    Steps, in order: lower-case; drop '#' characters; remove every token that
    contains a digit; turn line breaks into spaces; remove '@user' mentions;
    replace URLs with a space; strip ASCII punctuation; collapse runs of
    spaces into one.

    Collapsing runs last, with no upper bound, means that spaces left behind
    by removed punctuation are merged too. Leading or trailing single spaces
    are kept.
    """
    # Lower-case everything.
    text = text.lower()
    # Drop the '#' marker but keep the hashtag word.
    text = text.replace('#', '')
    # Remove every token that contains a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Turn line breaks into spaces.
    text = text.replace('\n', ' ')
    # Remove user mentions ('@user').
    text = re.sub(r'@\S*', '', text)
    # Replace URLs with a single space.
    text = re.sub(r'https?://\S*', ' ', text)
    # Strip punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Collapse any run of two or more spaces into one.
    text = re.sub(r' {2,}', ' ', text)
    return text

In [46]:
# Bag-of-words counts over the cleaned tweet text.
c_vect = CountVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

# One column per vocabulary term, named bow_0, bow_1, ... The frame reuses
# df's index so the concat below lines up row by row.
bow_cols = pd.DataFrame(c_vect.fit_transform(df["text"]).toarray(), index=df.index).add_prefix("bow_")

df = pd.concat(objs=[df, bow_cols], axis=1)

In [47]:
df_f = df.drop(labels=["id", "text", "keyword", "location", "target"], axis=1)

# KNN

In [11]:
knn = KNeighborsClassifier(n_jobs=-1)

In [12]:
# Three-feature matrix used for the KNN experiments.
# NOTE(review): this rebinds df_f, which an earlier cell set to the full feature
# matrix (df minus id/text/keyword/location/target); every later cell that reads
# df_f sees whichever of the two assignments ran last.
df_f = df[["keyword_mean","long", "nro_term"]]

In [13]:
score = cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")
score, score.mean()

(array([0.74720946, 0.68351937, 0.6651346 , 0.73981603, 0.75755585]),
 0.718647061310454)

In [14]:
# Grid over neighbourhood size, vote weighting and the Minkowski exponent p,
# scored by accuracy.
# NOTE(review): p=0.5 is below 1, where the Minkowski distance is not a true
# metric - TODO confirm the installed scikit-learn accepts it and actually
# scores those candidates.
gridcv = GridSearchCV(knn, {"n_neighbors":[5,13,21], "weights":["uniform", "distance"], "p":[0.5,1,2]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [5, 13, 21], 'p': [0.5, 1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [15]:
gridcv.best_params_

{'n_neighbors': 21, 'p': 2, 'weights': 'uniform'}

In [16]:
gridcv.best_score_

0.7400573596444531

In [17]:
gridcv = GridSearchCV(knn, {"n_neighbors":[21, 27, 31, 37]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [21, 27, 31, 37]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [18]:
gridcv.best_params_

{'n_neighbors': 37}

In [19]:
gridcv.best_score_

0.7437353483985805

In [20]:
gridcv = GridSearchCV(knn, {"n_neighbors":[37, 39, 41, 43]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [37, 39, 41, 43]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [21]:
gridcv.best_params_

{'n_neighbors': 41}

In [22]:
gridcv.best_score_

0.7459653860613022

In [23]:
knn = KNeighborsClassifier(n_neighbors=41, n_jobs=-1)

**TODO:** probar KNN con las tres mejores features que se encuentren.

# Para hacer un submit:

In [35]:
def hacer_submit(modelo, train, target, test, test_id, nombre_submit):
  """Fit ``modelo`` on the training data and write its test predictions to a CSV.

  Parameters
  ----------
  modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``.
  train, target : training features and labels passed to ``fit``.
  test : features passed to ``predict``.
  test_id : pandas object holding the identifiers of the test rows; it becomes
      the first column(s) of the file.
  nombre_submit : path of the CSV file to write (no index column).

  The predictions are written in a column named ``target``, the same name as
  the label column of the training data.
  """
  modelo.fit(train, target)
  res = modelo.predict(test)
  # Reuse test_id's index so both columns line up row by row even when
  # test_id does not carry a default 0..n-1 index.
  predicciones = pd.Series(res, index=test_id.index, name="target")
  pd.concat(objs=[test_id, predicciones], axis=1).to_csv(nombre_submit, index=False)


In [25]:
file = files.upload()
test = pd.read_csv("test.csv")

Saving test.csv to test.csv


In [26]:
# Downcast the id column, then replace every missing value with an empty string.
test = test.astype({"id": np.uint16}).fillna("")

In [27]:
test["keyword_mean"] = test["keyword"].transform(keyword_mean)

In [53]:
# 1 when the keyword (case-insensitive) is one of the whitespace-separated
# tokens of the tweet text, 0 otherwise.
test["keyword_en_tweet"] = [
    int(palabra.lower() in tweet.lower().split())
    for palabra, tweet in zip(test["keyword"], test["text"])
]

In [None]:
#test["keyword_count_norm"] = test["keyword"].transform(lambda x: keywords[x]/keywords.max())

In [54]:
test["location_mean"] = test["location"].transform(location_mean)

In [28]:
# Tweet length in characters. Scale by the longest *training* tweet (df) so
# the feature has the same scale in train and test; values may exceed 1 if a
# test tweet is longer than every training tweet.
test["long"] = test["text"].str.len()
long_max = df["text"].str.len().max()
test["long"] = test["long"]/long_max

In [29]:
# Number of whitespace-separated terms per tweet. Scale by the largest
# *training* count (df) so the feature has the same scale in train and test.
test["nro_term"] = test["text"].str.split().str.len()
nro_term_max = df["text"].str.split().str.len().max()
test["nro_term"] = test["nro_term"]/nro_term_max

In [55]:
test["hay_url"] = test["text"].transform(hay_url)

In [56]:
test["hay_nros"] = test["text"].transform(hay_nros)

In [57]:
test["hay_mencion"] = test["text"].transform(hay_mencion)

In [58]:
test["hay_hashtag"] = test["text"].transform(hay_hashtag)

In [59]:
# Encode the test tweets with the already-fitted vectorizer. Columns are named
# bow_0, bow_1, ... and the frame reuses test's index so the concat below
# lines up row by row.
bow_cols = pd.DataFrame(c_vect.transform(test["text"]).toarray(), index=test.index).add_prefix("bow_")

test = pd.concat(objs=[test, bow_cols], axis=1)

In [60]:
#test_f = test[["keyword_mean", "long", "nro_term"]]
test_f = test.drop(labels=["id", "text", "keyword", "location"], axis=1)

In [61]:
# Build the estimator and the training matrix in this cell, so it depends
# neither on `dt` (defined further down the notebook) nor on whichever df_f
# was assigned last. The test columns are selected in the training order.
train_f = df.drop(labels=["id", "text", "keyword", "location", "target"], axis=1)
hacer_submit(DecisionTreeClassifier(), train_f, df["target"], test_f[train_f.columns], test["id"], "dt.csv")

In [62]:
files.download("dt.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Decision Tree

In [48]:
dt = DecisionTreeClassifier()

In [64]:
score = cross_val_score(dt, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")
score, score.mean()

(array([0.69468155, 0.56336179, 0.60341431, 0.70302234, 0.67411301]),
 0.6477185995204499)

In [65]:
gridcv = GridSearchCV(dt, {"criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'ccp_alpha': [0, 0.5, 1],
                         '

In [51]:
gridcv.best_params_

{'ccp_alpha': 0, 'criterion': 'gini', 'min_samples_split': 2}

In [52]:
gridcv.best_score_

0.8295068261255579

In [None]:
dt = DecisionTreeClassifier()

# ExtraTree

In [None]:
et = ExtraTreeClassifier()

In [None]:
cross_val_score(et, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.68286277, 0.65331582, 0.58502955, 0.68659658, 0.70630749])

In [None]:
gridcv = GridSearchCV(et, {"criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                           criterion='entropy', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=7,
                                           min_weight_fraction_leaf=0.0,
                                           random_state=None,
                                           splitter='random'),
             iid='deprecated', n_jobs=-1,
             param_grid={'ccp_alpha': [0, 0.5, 1],
                         'criterion': ['gini', 'entropy'],
                         'min_samples_split': [2, 5, 7, 9]},


In [None]:
gridcv.best_params_

{'ccp_alpha': 0, 'criterion': 'entropy', 'min_samples_split': 9}

In [None]:
gridcv.best_score_

0.6917122733935978

In [None]:
et = ExtraTreeClassifier(criterion="entropy")

In [None]:
gridcv = GridSearchCV(et, {"min_samples_split":[9,11,13,15]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                           criterion='entropy', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           random_state=None,
                                           splitter='random'),
             iid='deprecated', n_jobs=-1,
             param_grid={'min_samples_split': [9, 11, 13, 15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy'

In [None]:
gridcv.best_params_

{'min_samples_split': 13}

In [None]:
gridcv.best_score_

0.6588812108337943

In [None]:
et = ExtraTreeClassifier(criterion="entropy", min_samples_split=13)

In [None]:
gridcv = GridSearchCV(et, {"min_samples_leaf":[1,2,5,7,9,11]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                           criterion='entropy', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=13,
                                           min_weight_fraction_leaf=0.0,
                                           random_state=None,
                                           splitter='random'),
             iid='deprecated', n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 2, 5, 7, 9, 11]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accur

In [None]:
gridcv.best_params_

{'min_samples_leaf': 1}

In [None]:
gridcv.best_score_

0.6781893575771589

In [None]:
et = ExtraTreeClassifier(criterion="entropy", min_samples_split=13)

# Random Forest

In [None]:
rf = RandomForestClassifier(oob_score=True, n_jobs=-1)

In [None]:
cross_val_score(rf, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.63821405, 0.7209455 , 0.73342088, 0.77463863, 0.64454665])

In [None]:
gridcv = GridSearchCV(rf, {"n_estimators":[80,100,120], "criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [None]:
gridcv.best_params_

{'ccp_alpha': 0,
 'criterion': 'gini',
 'min_samples_split': 2,
 'n_estimators': 120}

In [None]:
gridcv.best_score_

0.7295491901228901

In [None]:
gridcv = GridSearchCV(rf, {"n_estimators":[120,140,160]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [None]:
gridcv.best_params_

{'n_estimators': 160}

In [None]:
gridcv.best_score_

0.7103681353715219

In [None]:
gridcv = GridSearchCV(rf, {"n_estimators":[160,180,200]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [None]:
gridcv.best_params_

{'n_estimators': 200}

In [None]:
gridcv.best_score_

0.7132582918249565

In [None]:
rf = RandomForestClassifier(n_estimators=160, oob_score=True, n_jobs=-1)

# XGBoost

In [None]:
xg = xgb.XGBClassifier(n_jobs=-1)

In [None]:
cross_val_score(xg, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.69862114, 0.71372292, 0.71372292, 0.81406045, 0.69842313])

In [None]:
gridcv = GridSearchCV(xg, {"learning_rate":[0.1,0.5,1], "gamma":[0.1,0.5,1], "reg_alpha":[0.1,0.5,1], "reg_lambda":[0.1,0.5,1]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [0.1, 0.5, 1], 'learning_rate': [0.1, 0.5, 1],
                         'reg_alpha': [0.1, 0.5, 1],
                         'reg_lambda': [0

In [None]:
gridcv.best_params_

{'gamma': 1, 'learning_rate': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1}

In [None]:
gridcv.best_score_

0.7337538384283733

In [None]:
gridcv = GridSearchCV(xg, {"learning_rate":[0.05,0.1,0.15], "gamma":[1,2,3], "reg_alpha":[0.05,0.1,0.15], "reg_lambda":[1,2,3]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [1, 2, 3], 'learning_rate': [0.05, 0.1, 0.15],
                         'reg_alpha': [0.05, 0.1, 0.15],
                         'reg_lambda'

In [None]:
gridcv.best_params_

{'gamma': 1, 'learning_rate': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1}

In [None]:
gridcv.best_score_

0.7337538384283733

In [None]:
xg = xgb.XGBClassifier(gamma=1, reg_alpha=0.1, n_jobs=-1)

In [None]:
gridcv = GridSearchCV(xg, {"n_estimators":[80,100,120], "max_depth":[3,7,11,15]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0.1,
                                     reg_lambda=1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': [3, 7, 11, 15],
                         'n_estimators': [80, 100, 120]},
             pre_dispatc

In [None]:
gridcv.best_params_

{'max_depth': 3, 'n_estimators': 120}

In [None]:
gridcv = GridSearchCV(xg, {"n_estimators":[120,140,160]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0.1,
                                     reg_lambda=1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_estimators': [120, 140, 160]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
 

In [None]:
gridcv.best_params_

{'n_estimators': 120}

In [None]:
gridcv.best_score_

0.7349357163009932

In [None]:
xg = xgb.XGBClassifier(n_estimators=120, gamma=1, reg_alpha=0.1, n_jobs=-1)

# LinearSVC

In [None]:
lsvc = LinearSVC()

In [None]:
cross_val_score(lsvc, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.85292186, 0.82600131, 0.82206172, 0.81537451, 0.75821288])

In [None]:
gridcv = GridSearchCV(lsvc, {"penalty":["l1","l2"], "loss":["hinge","squared_hinge"], "C":[0.5,1,1.5], "max_iter":[3500,4000,4500]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.5, 1, 1.5], 'loss': ['hinge', 'squared_hinge'],
                         'max_iter': [3500, 4000, 4500],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.5, 'loss': 'hinge', 'max_iter': 3500, 'penalty': 'l2'}

In [None]:
gridcv.best_score_

0.8409363047377789

In [None]:
lsvc = LinearSVC(loss="hinge", max_iter=3500)

In [None]:
gridcv = GridSearchCV(lsvc, {"C":[0.25,0.5,0.75]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='hinge', max_iter=3500, multi_class='ovr',
                                 penalty='l2', random_state=None, tol=0.0001,
                                 verbose=0),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.25, 0.5, 0.75]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.25}

In [None]:
gridcv.best_score_

0.8510503424063612

In [None]:
gridcv = GridSearchCV(lsvc, {"C":[0.15,0.25,0.35]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='hinge', max_iter=3500, multi_class='ovr',
                                 penalty='l2', random_state=None, tol=0.0001,
                                 verbose=0),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.15, 0.25, 0.35]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.15}

In [None]:
gridcv.best_score_

0.8576171071170652

In [None]:
gridcv = GridSearchCV(lsvc, {"C":[0.1,0.15,0.2]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='hinge', max_iter=3500, multi_class='ovr',
                                 penalty='l2', random_state=None, tol=0.0001,
                                 verbose=0),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.1, 0.15, 0.2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.1}

In [None]:
gridcv.best_score_

0.8627396995521149

In [None]:
gridcv = GridSearchCV(lsvc, {"C":[0.05,0.1,0.12]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='hinge', max_iter=3500, multi_class='ovr',
                                 penalty='l2', random_state=None, tol=0.0001,
                                 verbose=0),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.05, 0.1, 0.12]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.05}

In [None]:
gridcv.best_score_

0.8681246726712526

In [None]:
gridcv = GridSearchCV(lsvc, {"C":[0.02,0.05,0.07]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='hinge', max_iter=3500, multi_class='ovr',
                                 penalty='l2', random_state=None, tol=0.0001,
                                 verbose=0),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.02, 0.05, 0.07]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.02}

In [None]:
gridcv.best_score_

0.8706197481801169

In [None]:
gridcv = GridSearchCV(lsvc, {"C":[0.001,0.005,0.02]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='hinge', max_iter=3500, multi_class='ovr',
                                 penalty='l2', random_state=None, tol=0.0001,
                                 verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.005, 0.02]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring='accuracy',
             verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.02}

In [None]:
gridcv.best_score_

0.8706197481801169

In [None]:
lsvc = LinearSVC(loss="hinge", C=0.02, max_iter=3500)

# NuSVC

In [None]:
nusvc = NuSVC()

In [None]:
cross_val_score(nusvc, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.88181221, 0.85423506, 0.86277085, 0.88173456, 0.89421813])

In [None]:
pd.Series([0.88181221, 0.85423506, 0.86277085, 0.88173456, 0.89421813]).mean()

0.8749541619999999

In [None]:
gridcv = GridSearchCV(nusvc, {"nu":[0.25,0.5,0.75], "gamma":["scale","auto"]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=NuSVC(break_ties=False, cache_size=200,
                             class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='scale', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': ['scale', 'auto'], 'nu': [0.25, 0.5, 0.75]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'gamma': 'scale', 'nu': 0.5}

In [None]:
gridcv.best_score_

0.8749541631902593

In [None]:
gridcv = GridSearchCV(nusvc, {"nu":[0.4,0.5,0.6]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=NuSVC(break_ties=False, cache_size=200,
                             class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='scale', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1, param_grid={'nu': [0.4, 0.5, 0.6]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'nu': 0.4}

In [None]:
gridcv.best_score_

0.8753489852916687

In [None]:
gridcv = GridSearchCV(nusvc, {"nu":[0.35,0.4,0.45]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=NuSVC(break_ties=False, cache_size=200,
                             class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='scale', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1, param_grid={'nu': [0.35, 0.4, 0.45]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'nu': 0.45}

In [None]:
gridcv.best_score_

0.8777127410369084

In [None]:
nusvc = NuSVC(nu=0.45)

# SVC

In [None]:
svc = SVC()

In [None]:
cross_val_score(svc, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.88443861, 0.84832567, 0.83847669, 0.87976347, 0.89618922])

In [None]:
pd.Series([0.88443861, 0.84832567, 0.83847669, 0.87976347, 0.89618922]).mean()

0.8694387319999999

In [None]:
gridcv = GridSearchCV(svc, {"C":[0.5,1,1.5], "gamma":["scale","auto"]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.5, 1, 1.5], 'gamma': ['scale', 'auto']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'C': 0.5, 'gamma': 'scale'}

In [None]:
gridcv.best_score_

0.8748237062371711