En el siguiente notebook se probarán distintos modelos con algunos features básicos y se intentarán buscar los mejores hiperparámetros para los modelos que den mejores resultados de entrada.

In [30]:
import io
import re
import string

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

In [2]:
from google.colab import files

In [3]:
# Upload the training CSV through the Colab widget. files.upload() returns a
# dict mapping each uploaded file name to its raw bytes; reading from those
# bytes guarantees the frame reflects what was just uploaded, even when Colab
# stores the file on disk under a renamed path (e.g. "train (1).csv").
file = files.upload()
if "train.csv" in file:
    df = pd.read_csv(io.BytesIO(file["train.csv"]))
else:
    # Nothing (or a differently named file) was uploaded: fall back to a copy
    # already present in the working directory.
    df = pd.read_csv("train.csv")

Saving train.csv to train.csv


In [4]:
# Shrink the integer columns to compact unsigned dtypes, then replace every
# missing value with an empty string.
df = df.astype({"id": np.uint16, "target": np.uint8}).fillna("")

# Features

In [5]:
# Number of rows for every (keyword, target) pair, as a Series with a
# two-level index; consumed by keyword_mean.
keywords_target = df.groupby(["keyword", "target"])["id"].count()

In [6]:
def keyword_mean(x):
    """Return the share of rows with target 0 among the rows carrying keyword ``x``.

    Counts are looked up in the module-level ``keywords_target`` Series
    (indexed by keyword, then target).

    Returns
    -------
    0.5 when the keyword is not present in ``keywords_target``;
    count(target 0) / (count(target 0) + count(target 1)) when both targets
    were observed; 1.0 when only target 0 was observed; 0 when only target 1
    was observed.

    Note that despite its name this is the proportion of target 0, i.e. one
    minus the mean of the target.
    """
    try:
        counts = keywords_target[x]
    except KeyError:
        # Keyword never seen: stay neutral.
        return 0.5

    if len(counts) != 2:
        # Only one of the two targets was observed for this keyword.
        try:
            only_negative = counts[0]
        except KeyError:
            # The single observed target is 1.
            return 0
        return only_negative / only_negative

    negatives = counts[0]
    return negatives / (negatives + counts[1])

In [7]:
df["keyword_mean"] = df["keyword"].transform(keyword_mean)

In [8]:
# Row-wise flag: 1 when the lower-cased keyword equals one of the
# whitespace-separated tokens of the lower-cased text, 0 otherwise.
# Because tokens come from a plain split(), a keyword followed by punctuation
# in the text ("fire,") does not match, a keyword that itself contains
# whitespace can never match, and an empty keyword always yields 0.
df["keyword_en_tweet"] = df.agg(lambda x: 1 if x["keyword"].lower() in x["text"].lower().split() else 0, axis=1)

In [9]:
# Tweet length in characters, rescaled by the longest tweet so the feature
# has a maximum of 1. long_max is kept at module level.
df["long"] = df["text"].map(len)
long_max = df["long"].max()
df["long"] = df["long"].div(long_max)

In [10]:
# Number of whitespace-separated terms per tweet, rescaled by the largest
# count so the feature has a maximum of 1. nro_term_max is kept at module level.
df["nro_term"] = [len(tweet.split()) for tweet in df["text"]]
nro_term_max = df["nro_term"].max()
df["nro_term"] = df["nro_term"].div(nro_term_max)

In [11]:
def hay_url(x):
    """Return 1 if ``x`` contains an ``http://`` or ``https://`` URL, else 0.

    Parameters
    ----------
    x : str
        Text to inspect.
    """
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences "\/" and "\S", which newer Python versions warn about.
    # The pattern itself is unchanged in meaning.
    return 1 if re.search(r'https?://\S*', x) is not None else 0
    
df["hay_url"] = df["text"].transform(hay_url)

In [12]:
def hay_nros(x):
    """Return 1 if any whitespace-separated token of ``x`` is a number, else 0.

    Commas are stripped from each token first, so "1,200" counts as a number.
    A token must contain at least one digit and be parseable by ``float``;
    the digit requirement keeps the words "nan", "inf" and "infinity"
    (which ``float`` accepts) from being reported as numbers.

    Parameters
    ----------
    x : str
        Text to inspect.
    """
    for token in x.split():
        candidate = token.replace(',', '')
        if not any(ch.isdigit() for ch in candidate):
            continue
        try:
            float(candidate)
        except ValueError:
            continue
        return 1
    return 0

df["hay_nros"] = df["text"].transform(hay_nros)

In [13]:
def hay_mencion(x):
    """Return 1 if any whitespace-separated token of ``x`` starts with '@', else 0."""
    tokens = x.split()
    return int(any(token.startswith('@') for token in tokens))

df["hay_mencion"] = df["text"].transform(hay_mencion)

In [14]:
def hay_hashtag(x):
    """Return 1 if any whitespace-separated token of ``x`` starts with '#', else 0."""
    found = False
    for word in x.split():
        if word.startswith('#'):
            found = True
            break
    return 1 if found else 0

df["hay_hashtag"] = df["text"].transform(hay_hashtag)

In [15]:
# Función para limpiar el texto de los mensajes.
def clean_text(text):
    """Normalise a tweet before it is tokenised.

    Steps, in order: lower-case; replace URLs with a space; drop '@user'
    mentions; drop '#' characters; drop every token that contains a digit;
    turn newlines into spaces; drop punctuation; collapse runs of spaces.

    URLs and mentions are removed before digit-bearing tokens so that a URL
    glued to a number ("2015http://t.co/x") is removed as a whole instead of
    leaving fragments behind, and spaces are collapsed last so the removals
    above cannot leave double spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Lower-case everything.
    text = text.lower()
    # Replace URLs with a space.
    text = re.sub(r'https?://\S*', ' ', text)
    # Remove user references such as '@user'.
    text = re.sub(r'@\S*', '', text)
    # Remove the '#' characters (the hashtag word itself is kept).
    text = text.replace('#', '')
    # Remove tokens that contain a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Turn line breaks into spaces.
    text = text.replace('\n', ' ')
    # Remove punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Collapse any run of spaces into a single one.
    text = re.sub(r' {2,}', ' ', text)
    return text

In [16]:
# Bag-of-words counts over the cleaned tweet text. Terms present in more than
# half of the documents or in fewer than 5 documents are discarded.
c_vect = CountVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

# Dense count matrix, one column per vocabulary term. The frame reuses df's
# index so the column-wise concat below lines rows up correctly.
bow_cols = pd.DataFrame(c_vect.fit_transform(df["text"]).toarray(), index=df.index)

# Name the columns "bow_0", "bow_1", ... in one pass (the previous loop grew a
# Series with Series.append, which is quadratic and was removed in pandas 2.0).
bow_cols.columns = ["bow_" + str(col) for col in bow_cols.columns]

df = pd.concat(objs=[df, bow_cols], axis=1)

In [17]:
# Feature matrix for the models: every column of df except the identifier,
# the raw text columns and the label (engineered features plus bow_* counts).
df_f = df.drop(labels=["id", "text", "keyword", "location", "target"], axis=1)

# KNN

In [None]:
knn = KNeighborsClassifier(n_jobs=-1)

In [None]:
# Restrict the feature matrix to three columns for the KNN experiments.
# NOTE(review): this rebinds df_f, which was built earlier with the full
# feature set and is reused under the same name by the later model sections
# and by the submission cell. On a top-to-bottom run those cells would see
# only these three columns; consider a dedicated name for this subset.
df_f = df[["keyword_mean","long", "nro_term"]]

In [None]:
score = cross_val_score(knn, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6241787122207622, MAX:0.7847769028871391, MEAN:0.7053710927395572


In [None]:
# Grid search over neighbours, weighting scheme and Minkowski power p,
# scored by accuracy with 10-fold cross-validation.
# NOTE(review): p=0.5 is below 1, where the Minkowski expression is not a true
# metric; some scikit-learn versions reject p < 1 at fit time, in which case
# those candidates would only produce NaN scores (error_score defaults to
# nan) — confirm against the installed version or drop the value.
gridcv = GridSearchCV(knn, {"n_neighbors":[5,13,21], "weights":["uniform", "distance"], "p":[0.5,1,2]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [5, 13, 21], 'p': [0.5, 1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'n_neighbors': 21, 'p': 2, 'weights': 'uniform'}

In [None]:
gridcv.best_score_

0.7365058753332574

In [None]:
gridcv = GridSearchCV(knn, {"n_neighbors":[21, 27, 31, 37]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [21, 27, 31, 37]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'n_neighbors': 37}

In [None]:
gridcv.best_score_

0.7439939504933761

In [None]:
gridcv = GridSearchCV(knn, {"n_neighbors":[37, 39, 41, 43]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [37, 39, 41, 43]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'n_neighbors': 41}

In [None]:
gridcv.best_score_

0.7459653860613022

In [None]:
knn = KNeighborsClassifier(n_neighbors=41, n_jobs=-1)

Probar KNN con los mejores tres features que se encuentren.

# Radius Neighbors

In [31]:
rn = RadiusNeighborsClassifier(n_jobs=-1)

In [32]:
score = cross_val_score(rn, df_f[["keyword_mean", "long", "nro_term"]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.5695538057742782, MAX:0.573490813648294, MEAN:0.5707340458920953


# Para hacer un submit:

In [None]:
def hacer_submit(modelo, train, target, test, test_id, nombre_submit):
    """Fit ``modelo`` on the training data and write its test predictions to a CSV.

    Parameters
    ----------
    modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train, target : training features and labels, passed to ``fit``.
    test : features passed to ``predict``.
    test_id : identifiers of the test rows, in the same order as ``test``.
    nombre_submit : path of the CSV file to write.

    The file has two columns and no index: the identifiers (named after
    ``test_id.name``, or "id" when it has no name) and "target" with the
    predictions.
    """
    modelo.fit(train, target)
    res = modelo.predict(test)

    id_col = getattr(test_id, "name", None) or "id"
    # Pair identifiers and predictions by position. The previous
    # concat(axis=1) aligned on index labels (producing NaN rows when test_id
    # did not carry a default 0..n-1 index) and left the prediction column
    # with the header "0".
    submission = pd.DataFrame({id_col: np.asarray(test_id), "target": np.asarray(res)})
    submission.to_csv(nombre_submit, index=False)


In [None]:
# Upload the test CSV through the Colab widget. files.upload() returns a dict
# mapping each uploaded file name to its raw bytes. Reading from those bytes
# matters here: when a "test.csv" already exists Colab saves the new upload
# under a renamed path (e.g. "test (2).csv"), so reading "test.csv" from disk
# would silently load the older copy.
file = files.upload()
if "test.csv" in file:
    test = pd.read_csv(io.BytesIO(file["test.csv"]))
else:
    # Nothing (or a differently named file) was uploaded: fall back to a copy
    # already present in the working directory.
    test = pd.read_csv("test.csv")

Saving test.csv to test (2).csv


In [None]:
test['id'] = test['id'].astype(np.uint16)
test = test.fillna("")

In [None]:
test["keyword_mean"] = test["keyword"].transform(keyword_mean)

In [None]:
test["keyword_en_tweet"] = test.agg(lambda x: 1 if x["keyword"].lower() in x["text"].lower().split() else 0, axis=1)

In [None]:
# Tweet length in characters, scaled with long_max as computed on the
# training set so that train and test features share the same scale (the
# model is fitted on the training scale). The training constant is no longer
# overwritten here; values can exceed 1 if a test tweet is longer than every
# training tweet.
test["long"] = test["text"].transform(lambda x: len(x)) / long_max

In [None]:
# Number of whitespace-separated terms, scaled with nro_term_max as computed
# on the training set so that train and test features share the same scale.
# The training constant is no longer overwritten here; values can exceed 1 if
# a test tweet has more terms than every training tweet.
test["nro_term"] = test["text"].transform(lambda x: len(x.split())) / nro_term_max

In [None]:
test["hay_url"] = test["text"].transform(hay_url)

In [None]:
test["hay_nros"] = test["text"].transform(hay_nros)

In [None]:
test["hay_mencion"] = test["text"].transform(hay_mencion)

In [None]:
test["hay_hashtag"] = test["text"].transform(hay_hashtag)

In [None]:
# Bag-of-words counts for the test tweets using the vocabulary already fitted
# in c_vect (transform only, no refit). The frame reuses test's index so the
# column-wise concat below lines rows up correctly.
bow_cols = pd.DataFrame(c_vect.transform(test["text"]).toarray(), index=test.index)

# Name the columns "bow_0", "bow_1", ... in one pass (the previous loop grew a
# Series with Series.append, which is quadratic and was removed in pandas 2.0).
bow_cols.columns = ["bow_" + str(col) for col in bow_cols.columns]

test = pd.concat(objs=[test, bow_cols], axis=1)

In [None]:
#test_f = test[["keyword_mean", "long", "nro_term"]]
test_f = test.drop(labels=["id", "text", "keyword", "location"], axis=1)

In [None]:
# Fit the NuSVC model on the training features and write its predictions for
# the test set to "nusvc.csv".
# NOTE(review): nusvc is only created further down, in the NuSVC section, so
# this cell raises NameError on a fresh top-to-bottom run. test_f holds the
# full feature set, so df_f must also be the full-feature frame at this point.
hacer_submit(nusvc, df_f, df["target"], test_f, test["id"], "nusvc.csv")

In [None]:
files.download("nusvc.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Decision Tree

In [None]:
dt = DecisionTreeClassifier()

In [None]:
score = cross_val_score(dt, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.4283837056504599, MAX:0.6863517060367454, MEAN:0.5528748952373068


# Extra Tree

In [None]:
et = ExtraTreeClassifier()

In [None]:
score = cross_val_score(et, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.5151116951379764, MAX:0.6846254927726675, MEAN:0.615793557999731


# Random Forest

In [None]:
rf = RandomForestClassifier(oob_score=True, n_jobs=-1)

In [None]:
score = cross_val_score(rf, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.615485564304462, MAX:0.7950065703022339, MEAN:0.6999922398005112


In [None]:
rf.fit(df_f, y=df["target"])
rf.feature_importances_

array([1.57501544e-01, 6.79389853e-03, 4.75845003e-02, ...,
       1.31841517e-04, 7.80842542e-04, 4.23876020e-04])

# Probar

In [None]:
gridcv = GridSearchCV(rf, {"n_estimators":[80,100,120], "criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, \
                      scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

In [None]:
gridcv.best_params_

{'ccp_alpha': 0,
 'criterion': 'gini',
 'min_samples_split': 2,
 'n_estimators': 120}

In [None]:
gridcv.best_score_

0.7295491901228901

In [None]:
gridcv = GridSearchCV(rf, {"n_estimators":[120,140,160]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [None]:
gridcv.best_params_

{'n_estimators': 160}

In [None]:
gridcv.best_score_

0.7103681353715219

In [None]:
gridcv = GridSearchCV(rf, {"n_estimators":[160,180,200]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [None]:
gridcv.best_params_

{'n_estimators': 200}

In [None]:
gridcv.best_score_

0.7132582918249565

In [None]:
rf = RandomForestClassifier(n_estimators=160, oob_score=True, n_jobs=-1)

# GradientBoosting

In [None]:
gb = GradientBoostingClassifier()

In [None]:
score = cross_val_score(gb, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.5374507227332457, MAX:0.7792378449408672, MEAN:0.6658352906280933


# XGBoost

In [None]:
xg = xgb.XGBClassifier(n_jobs=-1)

In [None]:
score = cross_val_score(xg, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6176084099868594, MAX:0.7884362680683311, MEAN:0.7128596852463087


# Probar

In [None]:
gridcv = GridSearchCV(xg, {"learning_rate":[0.1,0.5,1], "gamma":[0.1,0.5,1], "reg_alpha":[0.1,0.5,1], "reg_lambda":[0.1,0.5,1]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [0.1, 0.5, 1], 'learning_rate': [0.1, 0.5, 1],
                         'reg_alpha': [0.1, 0.5, 1],
                         'reg_lambda': [0

In [None]:
gridcv.best_params_

{'gamma': 0.5, 'learning_rate': 0.1, 'reg_alpha': 1, 'reg_lambda': 1}

In [None]:
gridcv.best_score_

0.7349365791115294

In [None]:
gridcv = GridSearchCV(xg, {"learning_rate":[0.05,0.1,0.15], "gamma":[1,2,3], "reg_alpha":[0.05,0.1,0.15], "reg_lambda":[1,2,3]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [1, 2, 3], 'learning_rate': [0.05, 0.1, 0.15],
                         'reg_alpha': [0.05, 0.1, 0.15],
                         'reg_lambda'

In [None]:
gridcv.best_params_

{'gamma': 1, 'learning_rate': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1}

In [None]:
gridcv.best_score_

0.7337538384283733

In [None]:
xg = xgb.XGBClassifier(gamma=1, reg_alpha=0.1, n_jobs=-1)

In [None]:
gridcv = GridSearchCV(xg, {"n_estimators":[80,100,120], "max_depth":[3,7,11,15]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0.1,
                                     reg_lambda=1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': [3, 7, 11, 15],
                         'n_estimators': [80, 100, 120]},
             pre_dispatc

In [None]:
gridcv.best_params_

{'max_depth': 3, 'n_estimators': 120}

In [None]:
gridcv = GridSearchCV(xg, {"n_estimators":[120,140,160]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0.1,
                                     reg_lambda=1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_estimators': [120, 140, 160]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
 

In [None]:
gridcv.best_params_

{'n_estimators': 120}

In [None]:
gridcv.best_score_

0.7349357163009932

In [None]:
xg = xgb.XGBClassifier(n_estimators=120, gamma=1, reg_alpha=0.1, n_jobs=-1)

# LinearSVC

In [None]:
lsvc = LinearSVC()

In [None]:
score = cross_val_score(lsvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.4730617608409987, MAX:0.6968503937007874, MEAN:0.5978002421182241


# NuSVC

In [18]:
nusvc = NuSVC()

In [None]:
score = cross_val_score(nusvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6614173228346457, MAX:0.8055190538764783, MEAN:0.7371682514718512


In [None]:
gridcv = GridSearchCV(nusvc, {"nu":[0.4,0.5,0.6], "gamma":["scale","auto"]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=NuSVC(break_ties=False, cache_size=200,
                             class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='scale', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': ['scale', 'auto'], 'nu': [0.4, 0.5, 0.6]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'gamma': 'scale', 'nu': 0.6}

In [None]:
gridcv.best_score_

0.7556897782652333

In [None]:
gridcv = GridSearchCV(nusvc, {"nu":[0.55,0.6,0.65]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=NuSVC(break_ties=False, cache_size=200,
                             class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='scale', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1, param_grid={'nu': [0.55, 0.6, 0.65]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'nu': 0.65}

In [None]:
gridcv.best_score_

0.7588412815020986

In [19]:
gridcv = GridSearchCV(nusvc, {"nu":[0.65,0.7,0.75]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=NuSVC(break_ties=False, cache_size=200,
                             class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='scale', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1, param_grid={'nu': [0.65, 0.7, 0.75]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [20]:
gridcv.best_params_

{'nu': 0.7}

In [21]:
gridcv.best_score_

0.7597606064682125

In [None]:
nusvc = NuSVC(nu=0.7)

# SVC

In [22]:
svc = SVC()

In [None]:
score = cross_val_score(svc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6482939632545932, MAX:0.8042049934296978, MEAN:0.7330955263312191


In [23]:
gridcv = GridSearchCV(svc, {"C":[0.1,0.5,1], "gamma":["scale","auto"]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 0.5, 1], 'gamma': ['scale', 'auto']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [25]:
gridcv.best_params_

{'C': 0.1, 'gamma': 'scale'}

In [26]:
gridcv.best_score_

0.7576582822022411

In [27]:
gridcv = GridSearchCV(svc, {"C":[0.05,0.1,0.15]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.05, 0.1, 0.15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [28]:
gridcv.best_params_

{'C': 0.15}

In [29]:
gridcv.best_score_

0.7598916676151355

In [35]:
gridcv = GridSearchCV(svc, {"C":[0.15,0.2,0.25]}, scoring="accuracy", n_jobs=-1, cv=10)
gridcv.fit(df_f, y=df["target"])



GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1, param_grid={'C': [0.15, 0.2, 0.25]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [36]:
gridcv.best_params_

{'C': 0.15}

In [37]:
gridcv.best_score_

0.7598916676151355

In [39]:
svc = SVC(C=0.15)

# MLPClassifier

In [None]:
mlp = MLPClassifier()

In [None]:
score = cross_val_score(mlp, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.44152431011826543, MAX:0.6876640419947506, MEAN:0.5771744941212177


# Logistic Regression

In [None]:
lr = LogisticRegression(n_jobs=-1)

In [None]:
score = cross_val_score(lr, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.5997375328083989, MAX:0.7516425755584757, MEAN:0.6846268723636878


# Passive Aggressive

In [None]:
pa = PassiveAggressiveClassifier(n_jobs=-1)

In [None]:
score = cross_val_score(pa, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.45992115637319314, MAX:0.6968503937007874, MEAN:0.5845311632366584


# Perceptron

In [None]:
p = Perceptron(n_jobs=-1)

In [None]:
score = cross_val_score(p, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.4704336399474376, MAX:0.6916010498687664, MEAN:0.5815121007377362


# Ridge

In [None]:
r = RidgeClassifier()

In [None]:
score = cross_val_score(r, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.48226018396846254, MAX:0.7165354330708661, MEAN:0.6094838260197764


# SGD

In [None]:
sgd = SGDClassifier(n_jobs=-1)

In [None]:
score = cross_val_score(sgd, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.4914586070959264, MAX:0.7165354330708661, MEAN:0.6210468681559351
