# Assessment Task 1

## Set up environment

In [1]:
# Load required packages
import pandas as pd
import numpy as np
from joblib import dump
from joblib import load
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt 
from pandas_profiling import ProfileReport

## Get data

In [2]:
# Import and inspect training data
df_train = pd.read_csv('../../data/raw/train.csv')
#display(df_train)

# Import and inspect test data
df_test = pd.read_csv('../../data/raw/test.csv')
#display(df_test)

## Data exploration

## Data manipulation

In [3]:
# Drop ID column - training data
df_train = df_train.drop(['Id_old','Id'], axis=1)

# Identify target
features = df_train.iloc[:,:-1].to_numpy()
target = df_train.iloc[:,-1].to_numpy()

# Standardize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Save scaler into models folder
dump(scaler, '../../models/David/scaler.joblib')

# Split into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=8)

In [4]:
# Resample training data
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [5]:
# Drop ID column - test data
df_test = df_test.drop(['Id_old','Id'], axis=1)

# Standardize features - test data
X_test = scaler.fit_transform(df_test)

In [6]:
# Save split datasets into data folder
#np.save('../../data/processed/X_train', X_train)
#np.save('../../data/processed/X_train_res', X_train_res)
#np.save('../../data/processed/X_val', X_val)
#np.save('../../data/processed/y_train', y_train)
#np.save('../../data/processed/y_train_res', y_train_res)
#np.save('../../data/processed/y_val', y_val)
#np.save('../../data/processed/X_test', X_test)

## Set up experiment space

In [14]:
name = []

In [15]:
def plot_cm_roc(reg):
    
    # Print confusion matrix to evaluate classification accuracy
    plot_confusion_matrix(reg, X_train, y_train, 
                          cmap=plt.cm.Blues,
                          colorbar=False,
                          normalize='true')
    plt.title("Confusion matrix - Training data")
    plt.show() 

    plot_confusion_matrix(reg, X_val, y_val,
                          cmap=plt.cm.Blues,
                          colorbar=False,
                          normalize='true')
    plt.title("Confusion matrix - Validation data")
    plt.show() 
    
    # Calculate and plot ROC_AUC
    y_score = reg.predict_proba(X_val)[:,1]
    roc_auc_val = roc_auc_score(y_val, y_score)
    print("ROC_AUC:", roc_auc_val)
    reg_disp = plot_roc_curve(reg, X_val, y_val)
    
    #return reg_disp


In [48]:
def fit_predict_save(reg, name):
    
    # Fit classifier
    reg.fit(X_train, y_train)
    print("Classifier fit complete.")
    
    # Model predictions on training and validation data
    y_train_pred = reg.predict(X_train)
    y_val_pred = reg.predict(X_val)
    print("Model predictions complete.")
    
    # Save fitted model into model folder
    save_path = "'../../models/David/'"+name+"'.joblib'"
    dump(reg,  save_path)

## Experimentation

### Linear SVC

In [49]:
from sklearn.svm import SVC

reg = SVC(kernel = 'linear', probability=True)

#linear_svc.fit(X_train, y_train)

In [50]:
fit_predict_save(reg, 'linear_svc')

Classifier fit complete.
Model predictions complete.


FileNotFoundError: [Errno 2] No such file or directory: "'../../models/David/'linear_svc'.joblib'"

In [None]:
# Save fitted model into model folder
dump(linear_svc,  '../../models/David/linear_svc.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = linear_svc.predict(X_train)
y_val_pred = linear_svc.predict(X_val)

In [None]:
plot_cm_roc(linear_svc)

### K Neighbours

In [None]:
# Create and fit classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=10, weights='distance')
neigh.fit(X_train_res, y_train_res)

In [None]:
# Save fitted model into model folder
dump(neigh,  '../../models/David/neigh_res.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = neigh.predict(X_train)
y_val_pred = neigh.predict(X_val)

In [None]:
plot_cm_roc(neigh)

### Logistic Regression - default

In [None]:
# Create and fit classifier
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
# Save fitted model into model folder
dump(log_reg,  '../../models/David/log_reg.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = log_reg.predict(X_train)
y_val_pred = log_reg.predict(X_val)

In [None]:
plot_cm_roc(log_reg)

### Logistic Regression - ElasticNet

In [None]:
# Create and fit classifier
from sklearn.linear_model import LogisticRegression 

log_reg_elastic = LogisticRegression(
    penalty='elasticnet', 
    solver='saga', 
    l1_ratio=0.5)

log_reg_elastic.fit(X_train, y_train)

In [None]:
# Save fitted model into model folder
dump(log_reg_elastic,  '../../models/David/log_reg_elastic.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = log_reg_elastic.predict(X_train)
y_val_pred = log_reg_elastic.predict(X_val)

In [None]:
plot_cm_roc(log_reg_elastic)

### Logistic Regression CV

In [None]:
# Create and fit classifier
from sklearn.linear_model import LogisticRegressionCV 

C_list = np.linspace(0.001, 0.5, 20)

log_reg_cv = LogisticRegressionCV(
    Cs=C_list, 
    cv=20,
    penalty='l1',
    scoring='roc_auc', 
    solver='liblinear',
    tol=1e-4, 
    max_iter=1000, 
    class_weight='balanced', 
    n_jobs=10, #7
    verbose=2, 
    refit=True, 
    multi_class='ovr', 
    random_state=42)

log_reg_cv.fit(X_train, y_train)

In [None]:
# Save fitted model into model folder
dump(log_reg_cv,  '../../models/David/log_reg_cv.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = log_reg_cv.predict(X_train)
y_val_pred = log_reg_cv.predict(X_val)

In [None]:
plot_cm_roc(log_reg_cv)

### Random Forest - default

In [None]:
# Create and fit classifier
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

In [None]:
# Save fitted model into model folder
dump(random_forest,  '../../models/David/random_forest.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = random_forest.predict(X_train)
y_val_pred = random_forest.predict(X_val)

In [None]:
plot_cm_roc(random_forest)

### Random Forest - second attempt

In [None]:
# Create and fit classifier
from sklearn.ensemble import RandomForestClassifier

random_forest2 = RandomForestClassifier(
    n_estimators=5, 
    criterion='entropy', 
    max_depth=15, 
    min_samples_split=3, 
    random_state=42, 
    verbose=1, 
    class_weight='balanced' 
)

random_forest2.fit(X_train, y_train)

In [None]:
# Save fitted model into model folder
dump(random_forest2,  '../../models/David/random_forest2.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = random_forest2.predict(X_train)
y_val_pred = random_forest2.predict(X_val)

In [None]:
plot_cm_roc(random_forest2)

### SVC - default

In [None]:
# Create and fit classifier
from sklearn.svm import SVC

svm = SVC(kernel='sigmoid', probability=True)

svm.fit(X_train, y_train)

In [None]:
# Save fitted model into model folder
dump(svm,  '../../models/David/svm.joblib')

In [None]:
# Model predictions on training and validation data
y_train_pred = svm.predict(X_train)
y_val_pred = svm.predict(X_val)

In [None]:
plot_cm_roc(svm)

## Calculate and export test data predictions

In [None]:
# Predict target probabilities (use specific model name)
test_probs = random_forest2.predict_proba(X_test)[:,1]

In [None]:
# Create dataframe object
test_probs_df = pd.DataFrame(test_probs, columns = ["TARGET_5Yrs"])

# Name 'ID' column
test_probs_df.index.name = "Id"

In [None]:
# Save output to csv
test_probs_df.to_csv("final2.csv")