In [37]:
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


In [None]:
# Read the three CSV files of the lab into separate frames.
categorical, numerical, targets = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./files_for_lab/categorical.csv",
        "./files_for_lab/numerical.csv",
        "./files_for_lab/target.csv",
    )
)

# Upscaling

### x,y split

In [None]:
# Feature matrix: the categorical and numerical frames placed side by side.
X = pd.concat((categorical, numerical), axis="columns")

# TARGET_B is the label used by the classifiers below; TARGET_D is kept next to it.
y, yD = targets['TARGET_B'], targets['TARGET_D']

### train - test split

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Slice TARGET_D with the row labels of each split so it stays aligned with y.
yD_train, yD_test = yD[y_train.index], yD[y_test.index]

In [None]:
# Row counts of the train and test partitions.
n_train, n_test = len(X_train), len(X_test)
print(n_train, n_test)

### Normalizing & onehot

In [None]:
# Tree models do not need scaled inputs, but the same features are fed to several
# model families further down, and some of those do require scaling.

# Separate the numeric columns from the string (object) columns.
X_train_num = X_train.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)
numeric_columns = X_train_num.columns

# Fit the min-max scaler on the training numerics only, then scale them.
# The rebuilt frame gets a fresh default index (the original row labels are not kept).
transformer = MinMaxScaler()
transformer.fit(X_train_num)
X_train_num = pd.DataFrame(transformer.transform(X_train_num), columns=numeric_columns)

# Fit the one-hot encoder on the training categoricals; the first level of each
# column is dropped, and a category not seen during fit raises on transform.
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_train_cat)
encoded = encoder.transform(X_train_cat).toarray()
onehot_encoded = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(X_train_cat.columns),
)

# Transformed X_train: scaled numerics followed by the one-hot columns.
X_train = pd.concat([X_train_num, onehot_encoded], axis=1)

In [None]:
# Transform X_test for later use, re-using the scaler and encoder already fitted
# on the training data (nothing is re-fitted here).

# Separate the numeric columns from the string (object) columns.
X_test_num = X_test.select_dtypes(np.number)
X_test_cat = X_test.select_dtypes(object)
test_numeric_columns = X_test_num.columns

# Scale the test numerics with the existing scaler.
X_test_num = pd.DataFrame(transformer.transform(X_test_num), columns=test_numeric_columns)

# One-hot encode the test categoricals with the existing encoder.
encoded = encoder.transform(X_test_cat).toarray()
onehot_encoded = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(X_test_cat.columns),
)

# Transformed X_test: scaled numerics followed by the one-hot columns.
X_test = pd.concat([X_test_num, onehot_encoded], axis=1)

### Oversampling

In [None]:
# Count how many rows fall in each TARGET_B class (computed on the full target,
# not only on the training split).
y.value_counts()

In [None]:
# X_train was rebuilt with a fresh 0..n-1 index, so y_train is given the same
# positional index before the two are joined column-wise.
y_train = y_train.reset_index(drop=True)
trainset = pd.concat((X_train, y_train), axis="columns")

In [None]:
# Split the training rows by class.
category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]

# Sample class 1 with replacement until it has as many rows as class 0.
# The fixed random_state makes the draw, and every score computed from it, repeatable.
category_1_oversample = resample(
    category_1,
    replace=True,
    n_samples=len(category_0),
    random_state=42,
)

# Balanced training set, then split back into features and label.
train_upsampled = pd.concat([category_0, category_1_oversample], axis=0)
X_train_upsampled = train_upsampled.drop('TARGET_B', axis=1)
y_train_upsampled = train_upsampled['TARGET_B']

### 1. Apply the Random Forests algorithm.

In [None]:
%%time
# Instead of fitting only the random forest, score several classifiers with the
# same 5-fold cross-validation so they can be compared side by side.
#
# NOTE: X_train_upsampled was built by sampling class 1 with replacement *before*
# cross-validation, so copies of one row can end up in both the fitting and the
# validation folds. The accuracies below are therefore likely optimistic.

# random_state is fixed on every stochastic model so repeated runs give the same scores.
model1 = DecisionTreeClassifier(max_depth=3, random_state=42)
# The target only holds classes 0 and 1, so the deprecated multi_class='ovr'
# argument is redundant and has been removed.
model2 = LogisticRegression(solver='saga', n_jobs=-1, random_state=42)
model3 = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=-1)
model4 = RandomForestClassifier(max_depth=5,
                                min_samples_split=20,
                                min_samples_leaf=20,
                                max_samples=0.8,
                                n_jobs=-1,
                                random_state=42)

# Data must be scaled here (it is: the features were min-max scaled upstream).
model_pipeline = [model1, model2, model3, model4]
model_names = ['Decision Tree Classifier', 'Logistic Regression', 'KNN', 'Random Forest']
scores = {}

for model, model_name in zip(model_pipeline, model_names):
    fold_scores = cross_val_score(model, X_train_upsampled, y_train_upsampled,
                                  cv=5, scoring='accuracy')
    mean_score = np.mean(fold_scores)
    scores[model_name] = mean_score
print(scores)

# The mean accuracies can be used to choose the best performing model.

In [None]:
# We can see that we get basically the same results with each model, except for KNN,
# whose score looks suspiciously high. We must take into account that these scores are
# computed on the (upsampled) training data only. Maybe the KNN is simply overfitted.

## 2. Use Feature Selections that you have learned in class to decide if you want to use all of the features (PCA, etc)

### 2.1 Numericals
I won't drop any categorical features because we only have 7.

Instead of using PCA, I'll check both the variance of the numerical features and their chi2 score against our target, and drop those that have low variance and low "correlation".

In [None]:
# Work on a copy of the scaled numerical training features. This rebinds the name
# `numerical`, which earlier held the raw numerical.csv frame.
numerical = X_train_num.copy()

#### Checking variance

In [None]:
# Minimum variance a column must reach to pass the variance check.
var_threshold = 0.02

# Fitting only measures the variance of each column; no column is removed here.
# The pass/fail mask is read afterwards through sel.get_support().
sel = VarianceThreshold(threshold=var_threshold).fit(numerical.select_dtypes(np.number))

In [None]:
# One row per numerical feature: its name and whether its variance passed the threshold.
# The frame is built from a dict so Var_check stays a real boolean column; stacking
# the mask and the names with np.c_ forced both into a single object-dtype array.
var_list = list(sel.get_support())
num_var = pd.DataFrame({'Name': numerical.columns, 'Var_check': var_list})
num_var

#### Checking  chi2

In [None]:
# Fit SelectKBest only to read the chi2 score of every numerical feature;
# nothing is transformed here, so the k=10 cut is not applied.
model = SelectKBest(chi2, k=10).fit(numerical, y_train)
df = pd.DataFrame(data=model.scores_, columns=['score'])
df['Column'] = numerical.columns

# Sort best-first; reset_index() keeps each feature's original position in 'index'.
aux = df.sort_values(by='score', ascending=False).reset_index()

# Rank 1 = highest chi2 score. Assigned in one vectorised step instead of writing
# the column cell by cell through .loc in a Python loop.
aux['chi2_rank'] = np.arange(1, len(aux) + 1)

# Put the features back in their original order.
num_chi2 = aux.sort_values(by='index').set_index('index', drop=True)
num_chi2

In [None]:
# Join the variance check and the chi2 ranking on the feature name.
# (chi2_rank should ideally be recomputed every time a column is dropped, but that
# would take too long for this lab, so it is computed once.)
num_select_features = num_var.merge(
    num_chi2,
    how='inner',
    left_on='Name',
    right_on='Column',
)[['Column', 'Var_check', 'chi2_rank']]
num_select_features

In [None]:
# Pick the features that fail BOTH checks: they did not pass the variance threshold
# and their chi2 rank is outside the top 50. Low-variance features that rank inside
# the top 50 are kept.
low_variance = num_select_features['Var_check'].eq(False)
outside_top_50 = num_select_features['chi2_rank'].gt(50)
dropping = num_select_features.loc[low_variance & outside_top_50, 'Column'].tolist()
len(dropping)

In [None]:
# Look at the upsampled training features before the selected columns are dropped.
X_train_upsampled

In [None]:
# Remove all selected columns in one call instead of copying the frame once per column.
# errors='ignore' makes the cell safe to re-run: on a second run the columns are
# already gone, which made the original per-column loop raise a KeyError.
X_train_upsampled = X_train_upsampled.drop(columns=dropping, errors='ignore')

In [None]:
%%time
# Score the same set of classifiers again with 5-fold cross-validation, now that
# X_train_upsampled has lost the columns selected for dropping.
#
# NOTE: X_train_upsampled was built by sampling class 1 with replacement *before*
# cross-validation, so copies of one row can end up in both the fitting and the
# validation folds. The accuracies below are therefore likely optimistic.

# random_state is fixed on every stochastic model so repeated runs give the same scores.
model1 = DecisionTreeClassifier(max_depth=3, random_state=42)
# The target only holds classes 0 and 1, so the deprecated multi_class='ovr'
# argument is redundant and has been removed.
model2 = LogisticRegression(solver='saga', n_jobs=-1, random_state=42)
model3 = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=-1)
model4 = RandomForestClassifier(max_depth=5,
                                min_samples_split=20,
                                min_samples_leaf=20,
                                max_samples=0.8,
                                n_jobs=-1,
                                random_state=42)

# Data must be scaled here (it is: the features were min-max scaled upstream).
model_pipeline = [model1, model2, model3, model4]
model_names = ['Decision Tree Classifier', 'Logistic Regression', 'KNN', 'Random Forest']
scores = {}

for model, model_name in zip(model_pipeline, model_names):
    fold_scores = cross_val_score(model, X_train_upsampled, y_train_upsampled,
                                  cv=5, scoring='accuracy')
    mean_score = np.mean(fold_scores)
    scores[model_name] = mean_score
print(scores)

# The mean accuracies can be used to choose the best performing model.

## 3. Discuss the output and its impact on the business scenario. Is the cost of a false positive equal to the cost of a false negative? How would you change your algorithm or data in order to maximize the return of the business?

In [None]:
# A false positive is not a big deal, because contacting by mail is cheap, but a false
# negative should be avoided because that's a real donation that you'll be missing (and
# the mean amount donated by donors is way bigger than the cost of one mailing)

# Lab | Final regression model in "Health Care for All" Case

In [3]:
# Read the three CSV files of the lab again, so the regression part starts from raw data.
categorical, numerical, targets = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./files_for_lab/categorical.csv",
        "./files_for_lab/numerical.csv",
        "./files_for_lab/target.csv",
    )
)

In [4]:
# Put the features and TARGET_D in one frame so rows can be filtered together.
data = pd.concat((categorical, numerical, targets['TARGET_D']), axis="columns")

# Filter out non-donors: keep only rows whose TARGET_D is not 0, renumbered from 0.
is_donor = data['TARGET_D'].ne(0)
data = data.loc[is_donor].reset_index(drop=True)

# Features are every column except TARGET_D; TARGET_D is the regression target.
X = data.drop(columns=['TARGET_D'])
y = data['TARGET_D']

In [10]:
# Hold out 10% of the donor rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [11]:
# Separate the numeric columns from the string (object) columns.
X_train_num = X_train.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)
numeric_columns = X_train_num.columns

# Fit the min-max scaler on the training numerics only, then scale them.
# The rebuilt frame gets a fresh default index (the original row labels are not kept).
transformer = MinMaxScaler()
transformer.fit(X_train_num)
X_train_num = pd.DataFrame(transformer.transform(X_train_num), columns=numeric_columns)

# Fit the one-hot encoder on the training categoricals; the first level of each
# column is dropped, and a category not seen during fit raises on transform.
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_train_cat)
encoded = encoder.transform(X_train_cat).toarray()
onehot_encoded = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(X_train_cat.columns),
)

# Transformed X_train, plus y_train re-indexed 0..n-1 so both line up by position.
X_train = pd.concat([X_train_num, onehot_encoded], axis=1)
y_train = y_train.reset_index(drop=True)

In [43]:
# Score several regressors with the same 5-fold cross-validation (R²) so they
# can be compared side by side.

# random_state is fixed on the tree-based models so repeated runs give the same scores.
model1 = DecisionTreeRegressor(max_depth=10, random_state=42)
model2 = LinearRegression()
model3 = KNeighborsRegressor(n_neighbors=5)
model4 = RandomForestRegressor(max_depth=10,
                               min_samples_split=20,
                               min_samples_leaf=20,
                               max_samples=0.8,
                               n_jobs=-1,
                               random_state=42)

# Data must be scaled here (the features were min-max scaled in the previous step).
model_pipeline = [model1, model2, model3, model4]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN', 'Random Forest']

scores = {}
for model, model_name in zip(model_pipeline, model_names):
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    mean_score = np.mean(fold_scores)
    scores[model_name] = mean_score
print(scores)

{'Decision Tree Regressor': 0.03373327772351644, 'Linear Regression': 0.25838096309728054, 'KNN': 0.12278734103063105, 'Random Forest': 0.501674755112925}


In [42]:
# Fit a plain linear regression on the whole training set and report its R².
# NOTE: this R² is measured on the same rows the model was fitted on, so it is not
# comparable with the cross-validated scores printed above (Linear Regression ~0.26,
# Random Forest ~0.50) and does not show that linear regression generalises better.
# LinearRegression is already imported at the top of the notebook, so no extra
# mid-notebook import is needed.
lm = LinearRegression()
lm.fit(X_train, y_train)
lm.score(X_train, y_train)

# The result is still poor; I'm not sure if I did something wrong...

0.5472829696857815