# Algorithmic Bias - Core Code
Some code to get started on the Algorithmic Bias assignment. 

In [None]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load scikit-learn's bundled breast cancer dataset; later cells read its
# `data`, `target`, `feature_names` and `target_names` fields.
bcDB = datasets.load_breast_cancer()

In [None]:
# Assemble one DataFrame holding the features plus a 'target' column,
# ordered by target value and re-indexed from 0.
bcDF = (
    pd.DataFrame(bcDB.data, columns=list(bcDB['feature_names']))
    .assign(target=pd.Series(bcDB.target))
    .sort_values(by=['target'])
    .reset_index(drop=True)
)
# Preview the first five rows.
bcDF.head(5)

In [None]:
# Print how many rows carry each target label, next to the class name
# found at the same position in the dataset's target_names.
class_counts = bcDF['target'].value_counts()
for label, class_name in enumerate(bcDB.target_names):
    n_rows = class_counts[label]
    print(n_rows, class_name)

In [None]:
# Detach the label column (pop removes it from bcDF in place); everything
# that remains in bcDF becomes the feature matrix.
target_column = bcDF.pop('target')
y = target_column.values
X = bcDF.values
# Display both shapes as the cell output.
(X.shape, y.shape)

## $k$-NN
### hold-out testing
Malignant is the minority class at ~40%.  
The $k$-NN classifier picks up this under-representation and accentuates it,  
predicting just 36% malignant with the hold-out testing method.
The mean accuracy of this method is about 91.61%.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 3-nearest-neighbour classifier evaluated on a single hold-out split.
kNN = KNeighborsClassifier(n_neighbors=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Fit once and reuse the fitted estimator for both prediction and scoring;
# fitting a second time on the same data only repeats the work.
model = kNN.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(X_train.shape, X_test.shape)

# The labels are 0/1, and the messages below report the share of label-0
# samples (1 minus the share of label 1) as the malignant share.
print("Malignant in test set : %0.2f" % (1 - (y_test.sum() / len(y_test))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))

# score() returns a fraction; it is scaled to a percentage for display.
result = model.score(X_test, y_test)
print("The mean accuracy of this model is: %0.2f%%" % (result * 100))


### Cross-validation testing
Use 5-fold cross-validation.
The mean F1 score of this method (the cell below uses `scoring='f1'`) is about 93.41%.

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the k-NN classifier, scored with F1.
# Note: y_pred holds one score per fold here, not predictions.
y_pred = cross_val_score(kNN, X, y, scoring='f1', cv=5)
mean_fold_score = y_pred.mean()
print(mean_fold_score)

## Decision Tree
### hold-out testing
The decision tree predicts 39% malignant (sometimes 41%) with the hold-out testing method. The mean accuracy of this model is 92.31%.

In [None]:
from graphviz import Source
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Entropy-criterion decision tree, trained and evaluated on the current
# hold-out split.
dtree = DecisionTreeClassifier(criterion='entropy')
tree_graph = dtree.fit(X_train, y_train)
y_pred = tree_graph.predict(X_test)

# Share of label-0 samples, which the messages below report as malignant.
actual_malignant = 1 - y_test.mean()
predicted_malignant = 1 - y_pred.mean()
print("Malignant in test set : %0.2f" % actual_malignant)
print("Predicted malignant : %0.2f" % predicted_malignant)

# score() returns a fraction; scale it to a percentage for display.
result = tree_graph.score(X_test, y_test)
accuracy_pct = result * 100
print("The mean accuracy of this model is: %0.2f%%" % accuracy_pct)

In [None]:
# Repeat the hold-out evaluation with a smaller (10%) test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2, test_size=0.10
)

# Refit on the new training split before scoring. The tree fitted earlier
# was trained on a different split, so rows in this test set may have been
# part of its training data, which would inflate the reported accuracy.
tree_graph = dtree.fit(X_train, y_train)

result = tree_graph.score(X_test, y_test)
print("The mean accuracy of this model is: %0.2f%%" % (result * 100))

### Cross-validation testing
Use 5-fold cross-validation.
The mean F1 score of this method (the cell below uses `scoring='f1'`) is about 94.57%.

In [None]:
# 5-fold cross-validation of the decision tree, scored with F1.
# Note: y_pred holds one score per fold here, not predictions.
y_pred = cross_val_score(dtree, X, y, scoring='f1', cv=5)
# Display the mean fold score as the cell output.
y_pred.mean()

The fitted decision tree is shown below.

In [None]:
# Rendering options passed to export_graphviz for the fitted tree.
export_options = dict(
    out_file=None,  # no output file is written; the result is kept in a variable
    feature_names=bcDB.feature_names,
    class_names=bcDB.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_source = export_graphviz(tree_graph, **export_options)

# Wrap the export in a graphviz Source and display it as the cell output.
graph = Source(dot_source)
graph

## Logistic Regression
### hold-out testing
Logistic regression predicts 37% malignant with the hold-out testing method. The mean accuracy of this model is 94.41%.

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained and evaluated on the current hold-out split.
LR_clf = LogisticRegression()
LR_clf = LR_clf.fit(X_train, y_train)
y_pred = LR_clf.predict(X_test)

# Share of label-0 samples, which the messages below report as malignant.
actual_malignant = 1 - y_test.mean()
predicted_malignant = 1 - y_pred.mean()
print("Malignant in test set : %0.2f" % actual_malignant)
print("Predicted malignant : %0.2f" % predicted_malignant)

# score() returns a fraction; scale it to a percentage for display.
result = LR_clf.score(X_test, y_test)
accuracy_pct = result * 100
print("The mean accuracy of this model is: %0.2f%%" % accuracy_pct)

### Cross-validation testing
Use 5-fold cross-validation.
The mean F1 score of this method (the cell below uses `scoring='f1'`) is 96.13%.

In [None]:
# 5-fold cross-validation of a fresh logistic regression, scored with F1.
# Note: y_pred holds one score per fold here, not predictions.
lr_estimator = LogisticRegression()
y_pred = cross_val_score(lr_estimator, X, y, scoring='f1', cv=5)
# Display the mean fold score as the cell output.
y_pred.mean()

## Naive Bayes
### hold-out testing
The Gaussian Naive Bayes model is used for both hold-out testing and cross-validation testing.
It predicts 39% malignant with the hold-out testing method. The mean accuracy of this model is 88.11%.

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# Gaussian naive Bayes trained and evaluated on the current hold-out split.
gnb = GaussianNB()
gnb = gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Share of label-0 samples, which the messages below report as malignant.
actual_malignant = 1 - y_test.mean()
predicted_malignant = 1 - y_pred.mean()
print("Malignant in test set : %0.2f" % actual_malignant)
print("Predicted malignant : %0.2f" % predicted_malignant)

# score() returns a fraction; scale it to a percentage for display.
result = gnb.score(X_test, y_test)
accuracy_pct = result * 100
print("The mean accuracy of this model is: %0.2f%%" % accuracy_pct)

### Cross-validation testing
Use 5-fold cross-validation.
The mean F1 score of this method (the cell below uses `scoring='f1'`) is about 96.13%.

In [None]:
# 5-fold cross-validation of a fresh Gaussian naive Bayes model, scored
# with F1. Note: y_pred holds one score per fold here, not predictions.
nb_estimator = GaussianNB()
y_pred = cross_val_score(nb_estimator, X, y, scoring='f1', cv=5)
# Display the mean fold score as the cell output.
y_pred.mean()

### 2. The strategy to rectify the bias.
The imbalanced data leads to the bias. A balanced accuracy measurement can reduce the bias.
For example, the F1 measure can reduce the bias and give a more balanced measure of accuracy.

In [None]:
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# The four classifiers compared in this notebook.
kNN = KNeighborsClassifier(n_neighbors=3)
dtree = DecisionTreeClassifier(criterion='entropy')
LR_clf = LogisticRegression()
mnb = GaussianNB()  # despite the name, this is a Gaussian naive Bayes model
models = [kNN, dtree, LR_clf, mnb]

# A new hold-out split (random_state=0) shared by every model in the loop.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit each model on the training part and report its F1 score on the test part.
report_line = "F1 Score on Test set {:22} {:.4f}"
for clf in models:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    model_name = type(clf).__name__
    print(report_line.format(model_name, f1_score(y_test, y_pred)))

The results show that each method's score increases when measured by F1 under hold-out testing.

### 3. The second data set to test
The second dataset is HotelRevHelpfulnessV2.csv.
Hold-out testing is used to calculate the scores of each model.

In [None]:
# Read the hotel review data; the 'reviewHelpfulness' column is the label
# and every remaining column is used as a feature.
hotel = pd.read_csv('HotelRevHelpfulnessV2.csv')
helpfulness = hotel.pop('reviewHelpfulness')
y = helpfulness.values
X = hotel.values

# Hold out one third of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

In [None]:
# Refit every model on the current training split and report the value of
# its score() method on the test split.
report_line = "Model Score on Test set {:22} {:.4f}"
for clf in models:
    clf.fit(X_train, y_train)
    model_name = type(clf).__name__
    print(report_line.format(model_name, clf.score(X_test, y_test)))

Use the F1 measure to reduce the bias and get a more balanced accuracy.

In [None]:
# Refit every model on the current training split and report its F1 score
# on the test split.
report_line = "F1 Score on Test set {:22} {:.4f}"
for clf in models:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    model_name = type(clf).__name__
    print(report_line.format(model_name, f1_score(y_test, y_pred)))