# Supervised learning

In [None]:
from sklearn import datasets
# load_wine() returns scikit-learn's bundled wine classification dataset.
# NOTE(review): the link that used to be here pointed at the UCI "Wine Quality"
# dataset, which is a different dataset; the bundled one corresponds to UCI "Wine":
# https://archive.ics.uci.edu/ml/datasets/wine
wine=datasets.load_wine()
type(wine)

In [None]:
# Print, in order: the available keys, the feature matrix and the class labels.
for wine_part in (wine.keys(), wine.data, wine.target):
    print(wine_part)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Show the container type of the training features, then display them.
print(type(x_train))
x_train

In [None]:
# Show the container type of the training labels, then display them.
print(type(y_train))
y_train

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier configured with k = 6 (not fitted yet).
knn = KNeighborsClassifier(n_neighbors=6)

In [None]:
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = knn.fit(x_train, y_train).predict(x_test)
y_pred

In [None]:
# True labels of the held-out samples, for visual comparison with y_pred above.
y_test

In [None]:
# Score of the fitted classifier on the held-out split (accuracy for classifiers).
knn.score(x_test, y_test)

![Image](https://miro.medium.com/max/1328/1*6Vy8J9kL_iXZeAh5KrZgCw.png)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted ones.
confusion_matrix(y_true=y_test, y_pred=y_pred)

$$precision=\dfrac{TP}{TP+FP}$$
$$recall=\dfrac{TP}{TP+FN}$$
$$F_1=\dfrac{2\cdot precision\cdot recall}{precision+recall}$$

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the k-NN predictions.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

In [None]:
import pandas as pd

# Load the weather summary CSV (expected next to the notebook) and preview it.
clima=pd.read_csv("Summary of Weather.csv")
clima.head()

In [None]:
# Summary statistics of the numeric columns, to spot empty or unusable ones.
clima.describe()

In [None]:
# Columns discarded before modelling.
unused_columns = ['WindGustSpd', 'DR', 'SPD', 'FT', 'FB', 'FTI', 'ITH',
                  'PGT', 'SD3', 'RHX', 'RHN', 'RVG', 'WTE', 'PoorWeather',
                  'MaxTemp', 'MinTemp', 'MeanTemp', 'SND', 'TSHDSBRSGF',
                  'Precip', 'Snowfall', 'SNF']

# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
clima = clima.drop(columns=unused_columns, errors='ignore')
clima = clima.dropna()

# Remove rows whose PRCP is the literal 'T' (presumably "trace" - verify against
# the dataset docs). .copy() makes the result an independent frame so the column
# assignment below does not trigger SettingWithCopyWarning.
clima = clima[clima['PRCP'] != 'T'].copy()

# pd.to_datetime replaces astype('datetime64'): casting to a unit-less
# datetime64 dtype raises TypeError on pandas >= 2.0.
clima['Date'] = pd.to_datetime(clima['Date'])
clima.head()

In [None]:
# Selecting with a list of columns yields a 2-D (n_samples, 1) array directly,
# which is the feature-matrix shape scikit-learn expects.
x = clima[['MAX']].values
y = clima['MIN'].values
x

In [None]:
import matplotlib.pyplot as plt
# Scatter of the single feature (MAX) against the target (MIN).
plt.scatter(x,y)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split of the weather data with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Ordinary least squares fit of MIN as a function of MAX.
reg=LinearRegression()
reg.fit(x_train, y_train)

# Evenly spaced feature values spanning the training range, shaped (n, 1) so
# they can be passed to predict(). ndarray.min()/max() reduce the whole array
# in one call; the builtin min()/max() would iterate over the rows in Python
# and compare one-element arrays pairwise.
prediction_space = np.linspace(x_train.min(),
                               x_train.max()).reshape(-1, 1)

In [None]:
# Training points in blue with the fitted regression line drawn over them.
fitted_line = reg.predict(prediction_space)
plt.scatter(x_train, y_train, color='blue')
plt.plot(prediction_space, fitted_line, color='black', linewidth=3)
plt.show()

In [None]:
# Predictions of the fitted linear model on the held-out features.
y_pred=reg.predict(x_test)

In [None]:
# Distribution of the absolute prediction errors on the test split.
absolute_errors = np.abs(y_pred - y_test)
plt.hist(absolute_errors, bins=20)
plt.show()

In [None]:
# Score of the regressor on the held-out split (R^2 for scikit-learn regressors).
reg.score(x_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model over the full data set;
# one score per fold.
cv_results = cross_val_score(estimator=reg, X=x, y=y, cv=5)
print(cv_results)

In [None]:
# Average score across the folds.
cv_results.mean()

In [None]:
# Use every remaining column except the target and the date as a feature;
# keep the column names for labelling the coefficient plot later.
feature_frame = clima.drop(columns=['MIN', 'Date'])
names = feature_frame.columns.tolist()
x = feature_frame.values
y = clima['MIN'].values

$$RL+\alpha\sum ^n_{i=1}a_i^2$$

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# Feature scaling is now an explicit pipeline step, which also means it is
# re-fitted on the training part of every CV fold.
# NOTE(review): alpha is not on the same numeric scale as under the legacy
# normalize=True option, so curves will not match plots produced with old
# scikit-learn versions - confirm whether exact parity is required.
ridge = Ridge()
ridge_model = make_pipeline(StandardScaler(), ridge)
ridge_scores = []
ridge_scores_std = []

for alpha in np.logspace(-4, 0, 50):
    # The pipeline holds a reference to `ridge`; cross_val_score clones the
    # pipeline per fold, picking up the alpha set here.
    ridge.alpha = alpha

    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge_model, x, y, cv=10)

    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))

    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

In [None]:
# Mean CV score (default colour) and its standard deviation (red) for each alpha.
alphas = np.logspace(-4, 0, 50)
plt.plot(alphas, ridge_scores)
plt.plot(alphas, ridge_scores_std, color='r')
plt.show()

$$RL+\alpha\sum ^n_{i=1}|a_i|$$

In [None]:
from sklearn.linear_model import Lasso

# Fit a Lasso model and keep its coefficients: features whose coefficient is
# shrunk to zero contribute nothing to the prediction.
lasso = Lasso(alpha=0.1)
lasso.fit(x, y)
lasso_coef = lasso.coef_

tick_positions = range(len(names))
plt.plot(tick_positions, lasso_coef)
plt.xticks(tick_positions, names, rotation=60)
plt.ylabel('Coefficients')
plt.show()

In [None]:
import pandas as pd

# Load the diabetes CSV (expected next to the notebook) and preview it.
diabetes=pd.read_csv('diabetes.csv')
diabetes.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Every column except 'Outcome' is a feature; 'Outcome' is the label.
X = diabetes.drop(columns=['Outcome'])
y = diabetes['Outcome']

# 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.4, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

## Receiver Operating Characteristic (ROC) Curve

In [None]:
from sklearn.metrics import roc_curve

# Predicted probability of the positive class (second column of predict_proba).
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# False positive rate, true positive rate and the thresholds that yield them.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Dashed diagonal as a reference line, then the classifier's ROC curve.
diagonal = [0, 1]
plt.plot(diagonal, diagonal, 'k--')
plt.plot(fpr, tpr)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Area under the ROC curve: once on the held-out split, once cross-validated.
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

y_pred_prob = logreg.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred_prob)
print("AUC: {}".format(test_auc))

# scoring='roc_auc' makes each fold report AUC instead of the default score.
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc')
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))


Optimización de hiperparámetros

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 15 candidate values of C, log-spaced between 1e-5 and 1e8.
c_space = np.logspace(-5, 8, 15)
param_grid = dict(C=c_space)

# Exhaustive 5-fold search over the grid, fitted on the full data set.
logreg = LogisticRegression()
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
logreg_cv.fit(X, y)

best_params = logreg_cv.best_params_
best_score = logreg_cv.best_score_
print("Tuned Logistic Regression Parameters: {}".format(best_params))
print("Best score is {}".format(best_score))


In [None]:
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
# Randomized search evaluates only a sample of the parameter space: faster than
# an exhaustive grid, at the cost of possibly missing the best combination.
from sklearn.model_selection import RandomizedSearchCV

# Lists are sampled uniformly; randint(1, 9) draws integers from 1 to 8.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Decision trees are not covered in this notebook; reading up on them is recommended.
# random_state is fixed on both the estimator and the search: without it the
# sampled candidates (and therefore the printed "best" parameters) change on
# every run.
tree = DecisionTreeClassifier(random_state=42)

tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42)

tree_cv.fit(X,y)

print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))


Ahora sí, cómo entrenar

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search jointly over the regularisation strength C and the penalty type.
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# The default 'lbfgs' solver does not support the 'l1' penalty, so every 'l1'
# candidate would fail to fit and be scored as NaN. 'liblinear' supports both
# penalties in the grid.
logreg = LogisticRegression(solver='liblinear')

# Tune on the training split only, so the test split stays untouched for a
# final, unbiased evaluation.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.4, random_state=42)

logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

logreg_cv.fit(X_train,y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

Lasso es l1, Ridge es l2.
l1_ratio=1 equivale a l2=0 (penalización l1 pura).

In [None]:
# Modules used by this cell
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.4, random_state=42)

# 30 candidate l1_ratio values evenly spaced over [0, 1]
l1_space = np.linspace(0, 1, 30)
param_grid = dict(l1_ratio=l1_space)

# 5-fold grid search over l1_ratio, fitted on the training split
elastic_net = ElasticNet()
gm_cv = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=5)
gm_cv.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out split
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))


Preprocessing
Pandas: get_dummies()
scikit-learn: OneHotEncoder()


In [None]:
# Import pandas
import pandas as pd

# Load the mushrooms CSV (expected next to the notebook) and preview it.
df = pd.read_csv('mushrooms.csv')

df.head()

In [None]:
# Two-valued categorical columns and the 0/1 code assigned to each letter.
binary_codes = {
    'class': {'p': 1, 'e': 0},
    'gill-attachment': {'a': 1, 'f': 0},
    'bruises': {'t': 1, 'f': 0},
}

for column_name, letter_to_code in binary_codes.items():
    df[column_name] = df[column_name].replace(letter_to_code)

df.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Keep only the numeric (and boolean) columns, i.e. the ones already encoded.
# select_dtypes is the public replacement for the private
# DataFrame._get_numeric_data(), which may change without notice.
dfn=df.select_dtypes(include=['number', 'bool'])
logreg = LogisticRegression()

# 'class' is the label; every other numeric column is a feature.
X=dfn.drop(['class'], axis=1)
y=dfn['class']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.4, random_state=42)

logreg.fit(X_train, y_train)
logreg.score(X_test,y_test)

In [None]:
# One-hot encode every categorical column: df_dum
df_dum = pd.get_dummies(data=df)

# Show the resulting column names
print(df_dum.columns)

# Same encoding, but dropping the first level of each category: df_dum2
df_dum2 = pd.get_dummies(data=df, drop_first=True)

# Show the reduced set of column names
print(df_dum2.columns)


In [None]:
# Preview of the full one-hot encoding.
df_dum.head()

In [None]:
# Preview of the encoding built with drop_first=True.
df_dum2.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Numeric and boolean columns of the fully one-hot-encoded frame.
# select_dtypes is the public replacement for the private
# DataFrame._get_numeric_data(); 'bool' is listed explicitly because
# get_dummies may emit boolean indicator columns.
dfn=df_dum.select_dtypes(include=['number', 'bool'])
logreg = LogisticRegression()

# 'class' is the label; every other column is a feature.
X=dfn.drop(['class'], axis=1)
y=dfn['class']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.4, random_state=42)

logreg.fit(X_train, y_train)
logreg.score(X_test,y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Numeric and boolean columns of the drop_first=True encoding.
# select_dtypes is the public replacement for the private
# DataFrame._get_numeric_data(); 'bool' is listed explicitly because
# get_dummies may emit boolean indicator columns.
dfn=df_dum2.select_dtypes(include=['number', 'bool'])
logreg = LogisticRegression()

# 'class' is the label; every other column is a feature.
X=dfn.drop(['class'], axis=1)
y=dfn['class']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.4, random_state=42)

logreg.fit(X_train, y_train)
logreg.score(X_test,y_test)

Cómo lidiar con datos vacíos

In [None]:
# Reload the diabetes data from disk so this section starts from the raw file.
df=pd.read_csv('diabetes.csv')
df.head()

In [None]:
# Zeros in these columns are treated as missing measurements.
zero_as_missing = ['SkinThickness', 'Insulin']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

# Missing-value count per column
print(df.isnull().sum())

original_shape = df.shape
print("Shape of Original DataFrame: {}".format(original_shape))

# Simplest strategy: discard every row with at least one missing value.
df = df.dropna()

reduced_shape = df.shape
print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(reduced_shape))

In [None]:
# Features are all columns but 'Outcome'; 'Outcome' is the label.
X = df.drop(columns=['Outcome'])
y = df['Outcome']
df.shape

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42)

# Support vector classifier with default settings; fit() returns the estimator.
svc = SVC().fit(X_train, y_train)
y_pred = svc.predict(X_test)

svc_report = classification_report(y_test, y_pred)
print(svc_report)

In [None]:
# Start again from the raw file, this time keeping the rows with missing values.
df = pd.read_csv('diabetes.csv')

zero_as_missing = ['SkinThickness', 'Insulin']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Missing-value count per column
print(df.isnull().sum())


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Step 1 fills each NaN with the most frequent value of its column;
# step 2 is a support vector classifier with default settings.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
steps = [('imputation', imputer), ('SVM', SVC())]
pipeline = Pipeline(steps)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42)

# Fitting the pipeline fits the imputer on the training data only.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

pipeline_report = classification_report(y_test, y_pred)
print(pipeline_report)


In [None]:
# Summary statistics per feature, to compare their ranges before scaling.
X.describe()

In [None]:
from sklearn.preprocessing import scale

# Rows without missing values; .copy() gives an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
X_complete = X.dropna()
X_scaled = X_complete.copy()

# Standardise only the 'Insulin' column.
X_scaled['Insulin'] = scale(X_scaled['Insulin'])

# Per-feature statistics are requested explicitly: np.mean(DataFrame) collapses
# to a single scalar on pandas >= 2. ddof=0 matches NumPy's population std.
print("Mean of Unscaled Features: {}".format(X_complete.mean()))
print("Standard Deviation of Unscaled Features: {}".format(X_complete.std(ddof=0)))

print("Mean of Scaled Features: {}".format(X_scaled.mean()))
print("Standard Deviation of Scaled Features: {}".format(X_scaled.std(ddof=0)))


In [None]:
from sklearn.preprocessing import scale
from sklearn.impute import SimpleImputer

# Fill each NaN with the most frequent value of its column.
imp=SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp.fit(X)
X_imp=imp.transform(X)

# Scale the IMPUTED data. The previous version scaled X itself, leaving X_imp
# unused and the NaNs in place, so the mean/std printed below came out as NaN.
X_scaled = scale(X_imp)

# axis=0 gives one value per feature; X_imp and X_scaled are plain NumPy arrays.
print("Mean of Unscaled Features: {}".format(np.mean(X_imp, axis=0)))
print("Standard Deviation of Unscaled Features: {}".format(np.std(X_imp, axis=0)))

print("Mean of Scaled Features: {}".format(np.mean(X_scaled, axis=0)))
print("Standard Deviation of Scaled Features: {}".format(np.std(X_scaled, axis=0)))

In [None]:
# Reload the untouched file (zeros are kept as-is for this section).
df = pd.read_csv('diabetes.csv')
X = df.drop(columns=['Outcome'])
y = df['Outcome']


In [None]:
# Modules used by this cell
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Pipeline: standardise the features, then classify with k-NN
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42)

# Same classifier with and without the scaling step, fitted on the same data
knn_scaled = pipeline.fit(X_train, y_train)
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

# Compare held-out accuracy of both variants
scaled_accuracy = knn_scaled.score(X_test, y_test)
unscaled_accuracy = knn_unscaled.score(X_test, y_test)
print('Accuracy with Scaling: {}'.format(scaled_accuracy))
print('Accuracy without Scaling: {}'.format(unscaled_accuracy))


In [None]:
# Pipeline: standardise the features, then fit a support vector classifier.
steps = [('scaler', StandardScaler()), ('SVM', SVC())]
pipeline = Pipeline(steps)

# '<step name>__<parameter>' routes each grid entry to the SVM step.
parameters = {
    'SVM__C': [1, 10, 100],
    'SVM__gamma': [0.1, 0.01],
}

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=21)

# 3-fold grid search over the whole pipeline, fitted on the training split.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3)
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
test_accuracy = cv.score(X_test, y_test)

print("Accuracy: {}".format(test_accuracy))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))


In [None]:
df = pd.read_csv('diabetes.csv')

# Zeros in these columns are treated as missing measurements.
zero_as_missing = ['SkinThickness', 'Insulin', 'BloodPressure', 'BMI', 'Glucose']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

X = df.drop(columns=['Outcome'])
y = df['Outcome']

In [None]:
# Setup the pipeline steps: mean-impute NaNs, standardise, then classify with k-NN
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),('scaler', StandardScaler()),('knn', KNeighborsClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Specify the hyperparameter space.
# Starts at 1: n_neighbors=0 is not a valid value, so that candidate would fail
# to fit in every fold and only add warnings / NaN scores to the search.
parameters = {'knn__n_neighbors':range(1,30)}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.4,random_state=42)

# Create the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(pipeline,parameters, cv=3)

# Fit to the training set
gm_cv.fit(X_train,y_train)

print("Tuned KNN n_neighbors: {}".format(gm_cv.best_params_))

In [None]:
from sklearn.metrics import confusion_matrix

# Final pipeline: mean-impute NaNs, standardise, then k-NN with k = 25.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
knn_25 = KNeighborsClassifier(n_neighbors=25)
steps = [('imputation', mean_imputer), ('scaler', StandardScaler()), ('knn', knn_25)]
pipeline = Pipeline(steps)

# Train on the training split and evaluate on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))