# The midterm report and 5-minute presentation should include the following.

- Preliminary visualizations of data.
- Detailed description of data processing done so far.
- Detailed description of data modeling methods used so far.
- Preliminary results. (e.g. we fit a linear model to the data and we achieve promising results, or we did some clustering and we notice a clear pattern in the data)

We expect to see preliminary code in your project repo at this point.

Your report should be submitted as README.md in your project GitHub repo.

The 5-minute presentation should be a recording uploaded to YouTube. Please add the video link to the beginning of your report.

# Visualizations of Data

In [26]:
import pandas as pd

# Load the TSV Data File
# df = pd.read_csv('./data/aml_ohsu_2022_clinical_data.tsv', sep='\t') # load the data (tsv file so need to specify sep)
df = pd.read_csv('./data/breast_cancer.csv') # load the data 


## Data Analysis

In [27]:
# 1. Check basic info of the data
print("Data Shape:", df.shape) # check the shape of the data
df.info() # check the basic info of the data

print()
# 2. Check the first few rows of the data
print("First 5 rows:")
df.head() # check the first few rows of the data


Data Shape: (569, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  peri

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [28]:
# 3. Check the statistic of the data
df.describe() # only numeric columns
df.describe(include='all') # include all columns

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,569.0,569,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,0.0
unique,,2,,,,,,,,,...,,,,,,,,,,
top,,B,,,,,,,,,...,,,,,,,,,,
freq,,357,,,,,,,,,...,,,,,,,,,,
mean,30371830.0,,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,
std,125020600.0,,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,
min,8670.0,,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,
25%,869218.0,,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,
50%,906024.0,,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,
75%,8813129.0,,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,


## Missing Values

In [29]:
# 4. Check the missing values
missing_values = pd.DataFrame({
    'Num of Missing Values' : df.isnull().sum(), # check the missing values
    'Percentage of Missing Values' : (df.isnull().sum()/len(df) * 100).round(2) # check the percentage of missing values
})
display(missing_values)

# Filter to show columns with missing value percentage greater than a certain threshold (e.g., 20%)
high_missing_values = missing_values[missing_values['Percentage of Missing Values'] > 20]
display(high_missing_values)

print("Note: 'TP53 Pathway' Column has over 90% missing data")

# 5. Check the unique values
df.duplicated().sum() # check the number of duplicated rows
df.duplicated().sum()/len(df) * 100 # check the percentage of duplicated rows

# 6. Check the unique values in each column
df.columns # check the columns of the data

Unnamed: 0,Num of Missing Values,Percentage of Missing Values
id,0,0.0
diagnosis,0,0.0
radius_mean,0,0.0
texture_mean,0,0.0
perimeter_mean,0,0.0
area_mean,0,0.0
smoothness_mean,0,0.0
compactness_mean,0,0.0
concavity_mean,0,0.0
concave points_mean,0,0.0


Unnamed: 0,Num of Missing Values,Percentage of Missing Values
Unnamed: 32,569,100.0


Note: 'TP53 Pathway' Column has over 90% missing data


Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram
df['column_name'].hist()

# Boxplot
sns.boxplot(data=df, x='column_name')

# Correlation heatmap
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')

# Scatterplot
sns.scatterplot(data=df, x='column1', y='column2')

# Distribution plot
sns.distplot(df['column_name'])


KeyError: 'column_name'