# Learning from Big Data: Module 1 - Natural Language Processing

#### Session 1 - Getting familiar with Supervised Learning

In [158]:
# Loading packages
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

## Breast Cancer Dataset
For this example, we will use the breast cancer dataset.

### 1. Loading the Data

In [159]:
# Fetch the breast cancer dataset that ships with sklearn
data = load_breast_cancer()

# Show the names of the features, then the names of the response classes
feature_names = data.feature_names
target_names = data.target_names
print(f"Feature names:\n {feature_names}\n")
print(f"Target names:\n {target_names}")

Feature names:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Target names:
 ['malignant' 'benign']


#### Extracting the features from the data

In [160]:
# The features are stored in data.data; wrap them in a DataFrame whose
# columns carry the feature names
X = pd.DataFrame(data.data, columns=data.feature_names)

# Printing the dimensions:
n_rows, n_cols = X.shape
print(f"There are a total of {n_rows} observations (rows)")
print(f"There are a total of {n_cols} features (columns) \n")

# To preview the first 5 rows of the features, run X.head() in its own cell.

# Printing information:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
X.info()

There are a total of 569 observations (rows)
There are a total of 30 features (columns) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    flo

#### Extracting the target variable from the data

In [161]:
# Class labels of the target variable: ['malignant', 'benign'] -> [0, 1]
print(f"Classes: {data.target_names}\n")

# data.target holds the response variable; keep it as a one-column DataFrame
target_values = data.target
y = pd.DataFrame(target_values, columns=["target"])

# Report how many observations there are in total
n_observations = len(y)
print(f"There are a total of {n_observations} observations (rows)\n")

# Peek at the first 5 observations
first_rows = y.head(5)
print(f"{first_rows}\n")

Classes: ['malignant' 'benign']

There are a total of 569 observations (rows)

   target
0       0
1       0
2       0
3       0
4       0



### 2. Prepping the data: Training and Test sets

In [162]:
# Put the features and the response side by side in a single frame
df = pd.concat([X, y], axis=1)

# Split the rows by class: target 1 (benign) is the majority,
# target 0 (malignant) is the minority
benign_mask = df["target"] == 1
malignant_mask = df["target"] == 0
df_majority = df[benign_mask]
df_minority = df[malignant_mask]

# The two classes have a large difference in the number of cases
n_benign = len(df_majority)
n_malignant = len(df_minority)
print(f"Number of cases for benign: {n_benign}")
print(f"Number of cases for malignent: {n_malignant}")

Number of cases for benign: 357
Number of cases for malignent: 212


**Note** that the two classes differ considerably in the number of cases (357 versus 212).
The author of this example dataset and code suggests "downsampling" or "upsampling" so that
both classes have the same number of cases.
However, this can be a problematic approach, and other approaches exist in the literature.
Furthermore, over-representation and under-representation of a group in a dataset is an
active area of research within the "fairness in machine learning (FML)" field, where the impact of
underrepresented minorities in training is a source of concern. For more and better solutions, I recommend
exploring the FML literature and its methods. This could also be a nice research question for your final assignment in Module 1.
For now, I am keeping the original author's proposed method below (i.e., downsampling), but please be aware of the statistical and fairness issues mentioned above.

In [163]:
# Fix the global NumPy seed for reproducibility
np.random.seed(42)

# Randomly keep only as many majority-class rows as there are minority-class rows
n_minority = len(df_minority)
df_majority_downsampled = df_majority.sample(n=n_minority, random_state=42)

# Stack the downsampled majority rows on top of the minority rows
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Class counts after downsampling
balanced_counts = df_downsampled['target'].value_counts()
print(f"{balanced_counts}\n")

# Hold out 30% of the downsampled data as the test set
train_data, test_data = train_test_split(df_downsampled, test_size=0.3, random_state=42)

# Class counts within the training data
train_counts = train_data['target'].value_counts()
print(f"{train_counts}")

target
1    212
0    212
Name: count, dtype: int64

target
0    154
1    142
Name: count, dtype: int64


### 3. Building and Fitting a Logistic Model

In [164]:
# Fix the global NumPy seed for reproducibility
np.random.seed(42)

# Features that enter the model
feature_cols = ["mean radius", "mean texture", "mean perimeter"]

# Add a column of ones to both splits to serve as the intercept term
for split in (train_data, test_data):
    split['intercept'] = 1

# Set up the logistic regression: response first, then the features plus intercept
design_cols = feature_cols + ['intercept']
logitmod = sm.Logit(train_data["target"], train_data[design_cols])

# Estimate the model
result = logitmod.fit()

# Summary of the fitted model
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.199170
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  296
Model:                          Logit   Df Residuals:                      292
Method:                           MLE   Df Model:                            3
Date:                Tue, 04 Jul 2023   Pseudo R-squ.:                  0.7123
Time:                        20:31:29   Log-Likelihood:                -58.954
converged:                       True   LL-Null:                       -204.93
Covariance Type:            nonrobust   LLR p-value:                 5.500e-63
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
mean radius        8.3179      1.562      5.327      0.000       5.257      11.378
mean texture     

### 4. Making the prediction on the test data

In [165]:
# Predict on the test data, using the same feature and intercept columns as in training
test_design = test_data[feature_cols + ['intercept']]
pred = result.predict(test_design)

#### The prediction is continuous; we now discretize it into a binary variable (benign or malignant).
**What are the pros and cons of doing this:**
+ From a machine learning development point of view? 
+ From a patient/clinician point of view?

In [166]:
# Turn the continuous predictions into 0/1 labels using a 0.5 cut-off
above_cutoff = pred > 0.5
y_pred_num = above_cutoff.astype(int)

# Share of correctly classified test cases (approximately 91%)
test_accuracy = accuracy_score(test_data["target"], y_pred_num)
print(test_accuracy)

0.90625
