## Clasificación avanzada

#### SVM

El siguiente ejemplo fue tomado de la documentación oficial de scikit-learn:

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups

# Only these four newsgroups are loaded.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Both splits are fetched with identical options (shuffled with a fixed seed);
# only the subset name differs. Order matters: train first, then test.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=subset_name,
                       categories=categories,
                       shuffle=True, random_state=42)
    for subset_name in ('train', 'test')
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# IDF re-weighting is switched off here (use_idf=False).
tf_vectorizer = TfidfVectorizer(use_idf=False)
X_train_tf = tf_vectorizer.fit_transform(twenty_train.data)
# Display the shape of the resulting training matrix.
X_train_tf.shape


(2257, 35788)

In [3]:
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
# Two-step pipeline: TF-IDF vectorization followed by a linear-kernel SVM,
# so the classifier can be fitted on raw text documents.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', SVC(kernel='linear')),
    ]
)

In [4]:
import numpy as np
# Fit the pipeline on the training documents and label the test documents.
predicted = (
    text_clf
    .fit(twenty_train.data, twenty_train.target)
    .predict(twenty_test.data)
)
# Accuracy: fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.9207723035952063

In [5]:
# Labels for both splits.
y_train, y_test = twenty_train.target, twenty_test.target

# The vectorizer is fitted on the training documents only; the test documents
# are transformed with the already-fitted vectorizer.
tf_idf = TfidfVectorizer()
X_train = tf_idf.fit_transform(twenty_train.data)
X_test = tf_idf.transform(twenty_test.data)

#### Voting

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Soft voting combines the members' predicted class probabilities, so each
# member is configured to provide them:
#   - SVC needs probability=True
#   - SGDClassifier needs a logistic loss
# random_state is fixed on both stochastic estimators so the accuracies printed
# below are reproducible across runs.
# NOTE(review): newer scikit-learn releases rename loss='log' to 'log_loss';
# update the string when upgrading — confirm against the installed version.
svc_clf = SVC(kernel='linear', probability=True, random_state=42)
sgd_clf = SGDClassifier(loss='log', random_state=42)
voting_clf = VotingClassifier(
    estimators=[('svc', svc_clf), ('sgd', sgd_clf)],
    voting='soft',
)

# Fit each individual model and the ensemble, reporting test accuracy for each.
for clf in (svc_clf, sgd_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

SVC 0.9207723035952063
SGDClassifier 0.9174434087882823
VotingClassifier 0.9280958721704394


#### Bagging

In [8]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default hyperparameters.
# random_state is fixed so the reported accuracy is reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [9]:
# Test accuracy of the most recent predictions stored in y_pred.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7097203728362184

In [10]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 200 decision trees, trained using all available cores.
# random_state is fixed so the resampling (and therefore the reported
# accuracy) is reproducible across runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [11]:
# Test accuracy of the most recent predictions stored in y_pred.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7989347536617842

#### RandomForest

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 200 trees, trained using all available cores.
# random_state is fixed so the reported accuracy is reproducible across runs.
rnd_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
# Displayed as the cell output: accuracy on the test split.
accuracy_score(y_test, y_pred)

0.8082556591211718

#### Boosting

#### AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 1500 depth-1 decision trees (decision stumps).
# random_state is fixed so the reported accuracy is reproducible across runs.
# NOTE(review): algorithm="SAMME.R" is deprecated in newer scikit-learn
# releases — confirm against the installed version before upgrading.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=1500,
    algorithm="SAMME.R",
    random_state=42,
)
# Last expression: the fitted estimator is displayed as the cell output.
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [14]:
# Evaluate the fitted AdaBoost model on the test split.
y_pred = ada_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8049267643142477

#### XGBoost

In [15]:
# pip install xgboost

from xgboost import XGBClassifier

In [16]:
# XGBoost classifier with the library's default hyperparameters, fitted on
# the TF-IDF training matrix.
xgb_clf = XGBClassifier().fit(X_train, y_train)
# Last expression: display the fitted estimator as the cell output.
xgb_clf

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [17]:
# Evaluate the fitted XGBoost model on the test split.
y_pred = xgb_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8621837549933422

#### LightGBM

In [18]:
# conda install -c conda-forge lightgbm
import lightgbm as lgb

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# LightGBM classifier with default hyperparameters.
gbm = lgb.LGBMClassifier()
# The test split is passed as the evaluation set during fitting.
evaluation_sets = [(X_test, y_test)]
gbm.fit(X_train, y_train, eval_set=evaluation_sets)

In [None]:
# Evaluate the fitted LightGBM model on the test split.
y_pred = gbm.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

#### Stacking

In [None]:
# pip install vecstack

In [None]:
from vecstack import StackingTransformer


# First-level models handed to the stacking transformer, as (name, estimator) pairs.
estimators = [
    ('xgb', xgb_clf),
    ('ada', ada_clf),
]

# Build the StackingTransformer (regression=False, verbose=2) and fit it
# on the training data.
stack = StackingTransformer(estimators, regression=False, verbose=2).fit(X_train, y_train)

# Stacked features for the train and test matrices, produced by the fitted transformer.
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)


In [None]:
# LightGBM classifier (default hyperparameters) trained on the stacked features.
gbm = lgb.LGBMClassifier()
# The stacked test features are passed as the evaluation set during fitting.
stacked_evaluation_sets = [(S_test, y_test)]
gbm.fit(S_train, y_train, eval_set=stacked_evaluation_sets)

In [None]:
# Evaluate the LightGBM model trained on stacked features, using the stacked test features.
y_pred = gbm.predict(S_test)
accuracy_score(y_true=y_test, y_pred=y_pred)