In [None]:
## CS1.1 HISTOGRAM ##
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from sklearn import datasets
def display_histogram(data, feature_name, title_name='default'):
    """Draw and show a histogram of ``data``.

    Parameters
    ----------
    data : array-like
        Values to bin; handed directly to ``plt.hist``.
    feature_name : str
        Name of the plotted feature, used as the x-axis label.
    title_name : str, optional
        Text placed above the figure.
    """
    plt.hist(data)
    # A histogram bins the feature values along x and counts along y,
    # so the feature name labels x and the count labels y.
    plt.xlabel(feature_name)
    plt.ylabel('No. of patients')
    plt.title(title_name)
    plt.grid(True)
    plt.show()
    
# Load the breast cancer dataset and plot the distribution of the selected feature column(s).
bunchobject = datasets.load_breast_cancer()
feature_range = [0]
data_subset = bunchobject.data[:,feature_range]
feature_name = bunchobject.feature_names[feature_range]
# Indexing with a list yields an array of names; join them so the axis label
# is plain text rather than the repr of a numpy array.
display_histogram(data_subset, ', '.join(feature_name))

In [None]:
## CS1.2 SCATTER PLOT ##
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from sklearn import datasets
def display_scatter(x,y, xlabel='x', ylabel='y',title_name ='default'):
    """Draw and show a scatter plot of ``y`` against ``x``.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points; handed directly to ``plt.scatter``.
    xlabel, ylabel : str, optional
        Axis labels.
    title_name : str, optional
        Text placed above the figure.
    """
    plt.scatter(x, y)
    # Apply the caller-supplied annotations so the figure can be read on its own.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title_name)
    plt.grid(True)
    plt.show()
# Pick two feature columns of `bunchobject` and scatter-plot one against the other.
x_index, y_index = 0, 3
x, y = (bunchobject.data[:, idx] for idx in (x_index, y_index))
x_label, y_label = (bunchobject.feature_names[idx] for idx in (x_index, y_index))
display_scatter(x, y, x_label, y_label)

In [None]:
## CS1.3 BAR ##
import matplotlib.pyplot as plt
from sklearn import datasets
def display_bar_chart(positions, counts, names, title_name='default' ):
    """Draw and show a bar chart.

    Parameters
    ----------
    positions : array-like
        x coordinates of the bars.
    counts : array-like
        Bar heights, one per position.
    names : sequence of str or None
        Tick labels, one per position; pass ``None`` to keep numeric ticks.
    title_name : str, optional
        Text placed above the figure.
    """
    plt.bar(positions, counts)
    if names is not None:
        # Label each bar with its name instead of the bare numeric position.
        plt.xticks(positions, names)
    plt.title(title_name)
    plt.show()
# Tally how many samples carry each distinct target value, then chart the tallies.
unique, counts = np.unique(bunchobject.target, return_counts=True)
display_bar_chart(positions=unique, counts=counts, names=bunchobject.target_names)

In [None]:
## CS2 5 NUMBER SUMMARY ##
import numpy as np
def five_number_summary(X):
    """Compute the five-number summary of every column of a 2-D dataset.

    Parameters
    ----------
    X : sequence of rows (list of lists or 2-D array)
        The number of columns is taken from the first row.

    Returns
    -------
    list of dict
        One dict per column with keys 'minimum', 'first quartile',
        'median', 'third quartile' and 'maximum'. An input with no rows
        yields an empty list.
    """
    # With no rows there is no first row to read the column count from.
    if len(X) == 0:
        return []

    keys = ('minimum', 'first quartile', 'median', 'third quartile', 'maximum')
    summaries = []
    for i in range(len(X[0])):
        column = [row[i] for row in X]
        # One percentile call yields all three quartiles.
        q1, median, q3 = np.percentile(column, [25, 50, 75])
        values = (np.min(column), q1, median, q3, np.max(column))
        summaries.append(dict(zip(keys, values)))
    return summaries

In [None]:
## CS3 NORMALIZE MIN-MAX ##
#import sklearn.preprocessing as pp
#normalize_minmax = lambda data: pp.minmax_scale(data)
import numpy as np
def normalize_minmax(data):
    """Rescale each column of ``data`` linearly so it spans [0, 1].

    Parameters
    ----------
    data : array-like
        Numeric data; scaling is applied along axis 0 (per column for a
        2-D input).

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``data``. A column whose values
        are all equal maps to 0 rather than NaN.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        # Nothing to scale; min/max are undefined on an empty axis.
        return data.copy()

    minimum = data.min(axis=0)
    ptp = data.max(axis=0) - minimum
    # A constant column has zero range; divide by 1 there so it becomes 0
    # instead of 0/0 = NaN.
    scale = np.where(ptp == 0, 1.0, ptp)
    return (data - minimum) / scale

In [None]:
## CS4 KNN ##
from sklearn.model_selection import train_test_split 
from sklearn import neighbors, datasets
from sklearn.metrics import confusion_matrix
import numpy as np
import sklearn.preprocessing as pp

def normalize_minmax(data):
    """Rescale ``data`` with ``sklearn.preprocessing.minmax_scale`` using its default arguments."""
    return pp.minmax_scale(data)

def get_metrics(actual_targets, predicted_targets):
    """Summarise classification results derived from the confusion matrix.

    Row/column index 1 of the confusion matrix is treated as the positive
    class and index 0 as the negative class.

    Returns a dict with keys 'confusion matrix', 'total records',
    'accuracy', 'sensitivity' and 'false positive rate'; every value is
    rounded to 3 decimals.
    """
    matrix = confusion_matrix(actual_targets, predicted_targets)
    total = np.sum(matrix)
    negatives_row, positives_row = matrix[0], matrix[1]
    summary = {
        'confusion matrix': matrix,
        'total records': total,
        # Correct predictions sit on the diagonal.
        'accuracy': np.trace(matrix) / total,
        'sensitivity': positives_row[1] / np.sum(positives_row),
        'false positive rate': negatives_row[1] / np.sum(negatives_row),
    }
    return {label: np.round(value, 3) for label, value in summary.items()}

def knn_classifier(bunchobject, feature_list, size, seed , k ):
    """Train a k-nearest-neighbours classifier and score it on a held-out split.

    Parameters
    ----------
    bunchobject : object with ``data`` and ``target`` attributes
        Dataset; ``data`` is indexed as ``data[:, feature_list]``.
    feature_list : list of int
        Column indices of the features to use.
    size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int
        Forwarded to ``train_test_split`` as ``random_state``.
    k : int
        Number of neighbours.

    Returns
    -------
    dict
        Result of ``get_metrics`` for the test split.
    """
    data = normalize_minmax(bunchobject.data[:, feature_list])
    # Class labels are categories, not magnitudes: leave them unscaled.
    # Scaling would turn labels of a 3+ class problem into fractions, which
    # the classifier rejects as continuous targets.
    target = bunchobject.target
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=size, random_state=seed)
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(data_train, target_train)
    target_predicted = clf.predict(data_test)
    return get_metrics(target_test, target_predicted)

In [1]:
## CS5 LINEAR REGRESSION ##
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split 
import numpy as np
# Load scikit-learn's bundled breast cancer dataset into `bunchobject`.
bunchobject= datasets.load_breast_cancer()

def linear_regression(bunchobject, x_index, y_index, size, seed):
    """Fit a linear regression of one data column on another.

    Columns ``x_index`` and ``y_index`` of ``bunchobject.data`` are split
    with ``train_test_split(test_size=size, random_state=seed)``; the model
    is fitted on the training part and evaluated on the test part.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_pred, results)`` where the arrays
        carry an added trailing axis and ``results`` is a dict with keys
        'coefficients', 'intercept', 'mean squared error' and 'r2 score'.
    """
    features = bunchobject.data[:, x_index]
    response = bunchobject.data[:, y_index]
    splits = train_test_split(features, response, test_size=size, random_state=seed)
    # Add the trailing axis once, up front, instead of at every use.
    x_train, x_test, y_train, y_test = (part[:, None] for part in splits)

    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    results = {
        'coefficients': model.coef_,
        'intercept': model.intercept_,
        'mean squared error': mean_squared_error(y_test, y_pred),
        'r2 score': r2_score(y_test, y_pred),
    }
    return x_train, y_train, x_test, y_pred, results

In [33]:
## CS6 MULTIPLE LINEAR REGRESSION ##
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Load scikit-learn's bundled breast cancer dataset into `bunchobject`.
bunchobject= datasets.load_breast_cancer()

def multiple_linear_regression(bunchobject, x_index, y_index, order, size, seed):
    """Fit a linear regression on polynomial features of one data column.

    Column ``x_index`` of ``bunchobject.data`` is expanded with
    ``PolynomialFeatures(order, include_bias=False)`` and regressed against
    column ``y_index``. The split uses
    ``train_test_split(test_size=size, random_state=seed)``.

    Returns
    -------
    tuple
        ``(x_train[:, 0], y_train, x_test[:, 0], y_pred, results)`` — the
        x arrays are the first column of the expanded feature matrix, and
        ``results`` is a dict with keys 'coefficients', 'intercept',
        'mean squared error' and 'r2 score'.
    """
    features = bunchobject.data[:, np.newaxis, x_index]
    response = bunchobject.data[:, np.newaxis, y_index]
    expanded = PolynomialFeatures(order, include_bias=False).fit_transform(features)

    x_train, x_test, y_train, y_test = train_test_split(
        expanded, response, test_size=size, random_state=seed)

    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    results = {
        'coefficients': model.coef_,
        'intercept': model.intercept_,
        'mean squared error': mean_squared_error(y_test, y_pred),
        'r2 score': r2_score(y_test, y_pred),
    }
    return x_train[:, 0], y_train, x_test[:, 0], y_pred, results

In [None]:
## CS7 KNN FULL ##
from sklearn.model_selection import train_test_split 
from sklearn import neighbors, datasets
from sklearn.metrics import confusion_matrix
import numpy as np
import sklearn.preprocessing as pp
# Load scikit-learn's bundled breast cancer dataset into `bunchobject`.
bunchobject= datasets.load_breast_cancer()

def normalize_minmax(data):
    """Rescale ``data`` with ``sklearn.preprocessing.minmax_scale`` using its default arguments."""
    return pp.minmax_scale(data)

def get_metrics(actual_targets, predicted_targets):
    """Summarise classification results derived from the confusion matrix.

    Row/column index 1 of the confusion matrix is treated as the positive
    class and index 0 as the negative class.

    Returns a dict with keys 'confusion matrix', 'total records',
    'accuracy', 'sensitivity' and 'false positive rate'; every value is
    rounded to 3 decimals.
    """
    matrix = confusion_matrix(actual_targets, predicted_targets)
    total = np.sum(matrix)
    negatives_row, positives_row = matrix[0], matrix[1]
    summary = {
        'confusion matrix': matrix,
        'total records': total,
        # Correct predictions sit on the diagonal.
        'accuracy': np.trace(matrix) / total,
        'sensitivity': positives_row[1] / np.sum(positives_row),
        'false positive rate': negatives_row[1] / np.sum(negatives_row),
    }
    return {label: np.round(value, 3) for label, value in summary.items()}

def acc(a,b):
    """Return the accuracy of predictions ``b`` against labels ``a``.

    Computed as the diagonal sum of the confusion matrix divided by the
    sum of all its entries.
    """
    matrix = confusion_matrix(a, b)
    return np.trace(matrix) / np.sum(matrix)

def knn_classifier_full(bunchobject, feature_list, size, seed, max_k=19):
    """Select k for a k-NN classifier on a validation split, then score it on a test split.

    The data is split into a training part and a held-out part
    (``test_size=size``); the held-out part is halved into validation and
    test sets. Every k from 1 to ``max_k`` is fitted on the training part
    and scored on the validation set; the smallest k with the highest
    validation accuracy is then scored on the test set.

    Parameters
    ----------
    bunchobject : object with ``data`` and ``target`` attributes
        Dataset; ``data`` is indexed as ``data[:, feature_list]``.
    feature_list : list of int
        Column indices of the features to use.
    size : float or int
        Forwarded to the first ``train_test_split`` as ``test_size``.
    seed : int
        ``random_state`` for both splits.
    max_k : int, optional
        Largest neighbour count to try (default 19, the original range).

    Returns
    -------
    dict
        ``{'best k': int, 'validation set': metrics, 'test set': metrics}``
        where each ``metrics`` is the result of ``get_metrics``.
    """
    data = normalize_minmax(bunchobject.data[:, feature_list])
    # Class labels are categories, not magnitudes: leave them unscaled.
    # Scaling would turn labels of a 3+ class problem into fractions, which
    # the classifier rejects as continuous targets.
    target = bunchobject.target

    data_train, data_part2, target_train, target_part2 = train_test_split(
        data, target, test_size=size, random_state=seed)
    data_valid, data_test, target_valid, target_test = train_test_split(
        data_part2, target_part2, test_size=0.5, random_state=seed)

    # A query for k neighbours needs at least k training samples, so never
    # try a k larger than the training set.
    candidate_ks = range(1, min(max_k, len(data_train)) + 1)
    valid_predictions = []
    valid_accuracies = []
    for k in candidate_ks:
        clf = neighbors.KNeighborsClassifier(n_neighbors=k)
        clf.fit(data_train, target_train)
        predicted = clf.predict(data_valid)
        valid_predictions.append(predicted)
        valid_accuracies.append(acc(target_valid, predicted))

    # argmax returns the first maximum, so ties resolve to the smallest k.
    best = int(np.argmax(valid_accuracies))
    best_k = candidate_ks[best]

    clf = neighbors.KNeighborsClassifier(n_neighbors=best_k)
    clf.fit(data_train, target_train)
    test_predicted = clf.predict(data_test)

    return {
        'best k': best_k,
        'validation set': get_metrics(target_valid, valid_predictions[best]),
        'test set': get_metrics(target_test, test_predicted),
    }