In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [34]:
filepath_1 = '/Users/eliasrapkin-siles/Downloads/Machine Learning 1/Project/22Q2 Files/CRISPR_gene_effect.csv'
CRISPR_gene_effect = pd.read_csv(filepath_1)

filepath_2 = '/Users/eliasrapkin-siles/Downloads/Machine Learning 1/Project/22Q2 Files/CRISPR_gene_dependency.csv'
CRISPR_gene_dependency = pd.read_csv(filepath_2)

filepath_3 = '/Users/eliasrapkin-siles/Downloads/Machine Learning 1/Project/22Q2 Files/CCLE_expression.csv'
CCLE_expression = pd.read_csv(filepath_3)

filepath_4 = '/Users/eliasrapkin-siles/Downloads/Machine Learning 1/Project/22Q2 Files/sample_info.csv'
sample_info = pd.read_csv(filepath_4)

In [35]:
sample_info = sample_info[['DepMap_ID', 'sex', 'sample_collection_site', 'primary_disease', 'Subtype', 'age', 'lineage', 'lineage_subtype',
                           'lineage_sub_subtype','Cellosaurus_NCIt_disease']]

# Define values to exclude
values_to_exclude = ['Adrenal Cancer', 'Teratoma', 'Embryonal Cancer', 'Unknown']

In [36]:
CRISPR_gene_effect_master_df = pd.merge(sample_info, CRISPR_gene_effect, on="DepMap_ID", how="outer")
CRISPR_gene_dependency_master_df = pd.merge(sample_info, CRISPR_gene_dependency, on="DepMap_ID", how="outer")
CCLE_expression_master_df = pd.merge(sample_info, CCLE_expression, on="DepMap_ID", how="outer")

In [37]:
CRISPR_gene_effect_master_df = CRISPR_gene_effect_master_df.dropna()
CRISPR_gene_effect_master_df = CRISPR_gene_effect_master_df[~CRISPR_gene_effect_master_df['primary_disease'].isin(values_to_exclude)]

# Select features and target
X = CRISPR_gene_effect_master_df.drop(['DepMap_ID', 'primary_disease'], axis=1).select_dtypes(include=np.number)
y = CRISPR_gene_effect_master_df['primary_disease']

# Encode the target variable
y = pd.factorize(y)[0]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.25
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.71      0.20         7
           1       1.00      0.10      0.18        10
           2       0.36      0.31      0.33        13
           3       0.00      0.00      0.00         5
           4       0.71      0.38      0.50        13
           5       0.62      0.33      0.43        15
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         3
          11       0.08      0.25      0.12         4
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1

    accuracy                           0.2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

# Use the best model to predict
best_knn = grid_search.best_estimator_
y_pred_optimized = best_knn.predict(X_test)

# Evaluate the optimized model
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
print(f"Optimized Accuracy: {accuracy_optimized:.2f}")
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred_optimized))




Best parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Best cross-validation accuracy: 0.30
Optimized Accuracy: 0.30
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.86      0.27         7
           1       1.00      0.10      0.18        10
           2       0.22      0.31      0.26        13
           3       0.00      0.00      0.00         5
           4       0.78      0.54      0.64        13
           5       0.60      0.40      0.48        15
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         3
          11       0.12      0.25      0.17         4
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
CRISPR_gene_effect_master_df = CRISPR_gene_effect_master_df.dropna()
CRISPR_gene_effect_master_df = CRISPR_gene_effect_master_df[~CRISPR_gene_effect_master_df['primary_disease'].isin(values_to_exclude)]

# Select features and target
X = CRISPR_gene_effect_master_df.drop(['DepMap_ID', 'primary_disease'], axis=1).select_dtypes(include=np.number)
y = CRISPR_gene_effect_master_df['primary_disease']

# Encode the target variable
y = pd.factorize(y)[0]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=7, metric='manhattan', weights='distance')  # You can adjust n_neighbors
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.30
Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.86      0.27         7
           1       1.00      0.10      0.18        10
           2       0.22      0.31      0.26        13
           3       0.00      0.00      0.00         5
           4       0.78      0.54      0.64        13
           5       0.60      0.40      0.48        15
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         3
          11       0.12      0.25      0.17         4
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1

    accuracy                           0.3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
CRISPR_gene_dependency_master_df = CRISPR_gene_effect_master_df.dropna()
CRISPR_gene_dependency_master_df = CRISPR_gene_dependency_master_df[~CRISPR_gene_dependency_master_df['primary_disease'].isin(values_to_exclude)]

# Select features and target
X = CRISPR_gene_dependency_master_df.drop(['DepMap_ID', 'primary_disease'], axis=1).select_dtypes(include=np.number)
y = CRISPR_gene_dependency_master_df['primary_disease']

# Encode the target variable
y = pd.factorize(y)[0]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.25
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.71      0.20         7
           1       1.00      0.10      0.18        10
           2       0.36      0.31      0.33        13
           3       0.00      0.00      0.00         5
           4       0.71      0.38      0.50        13
           5       0.62      0.33      0.43        15
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         3
          11       0.08      0.25      0.12         4
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1

    accuracy                           0.2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

# Use the best model to predict
best_knn = grid_search.best_estimator_
y_pred_optimized = best_knn.predict(X_test)

# Evaluate the optimized model
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
print(f"Optimized Accuracy: {accuracy_optimized:.2f}")
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred_optimized))




Best parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Best cross-validation accuracy: 0.30
Optimized Accuracy: 0.30
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.86      0.27         7
           1       1.00      0.10      0.18        10
           2       0.22      0.31      0.26        13
           3       0.00      0.00      0.00         5
           4       0.78      0.54      0.64        13
           5       0.60      0.40      0.48        15
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         3
          11       0.12      0.25      0.17         4
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
CRISPR_gene_dependency_master_df = CRISPR_gene_effect_master_df.dropna()
CRISPR_gene_dependency_master_df = CRISPR_gene_dependency_master_df[~CRISPR_gene_dependency_master_df['primary_disease'].isin(values_to_exclude)]

# Select features and target
X = CRISPR_gene_dependency_master_df.drop(['DepMap_ID', 'primary_disease'], axis=1).select_dtypes(include=np.number)
y = CRISPR_gene_dependency_master_df['primary_disease']

# Encode the target variable
y = pd.factorize(y)[0]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='manhattan')
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.30
Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.86      0.27         7
           1       1.00      0.10      0.18        10
           2       0.22      0.31      0.26        13
           3       0.00      0.00      0.00         5
           4       0.78      0.54      0.64        13
           5       0.60      0.40      0.48        15
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         3
          11       0.12      0.25      0.17         4
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1

    accuracy                           0.3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
CCLE_expression_master_df = CCLE_expression_master_df.dropna()
CCLE_expression_master_df = CCLE_expression_master_df[~CCLE_expression_master_df['primary_disease'].isin(values_to_exclude)]

# Select features and target
X = CCLE_expression_master_df.drop(['DepMap_ID', 'primary_disease'], axis=1).select_dtypes(include=np.number)
y = CCLE_expression_master_df['primary_disease']

# Encode the target variable
y = pd.factorize(y)[0]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.70
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.92      0.67        12
           1       0.81      0.81      0.81        16
           2       0.73      0.66      0.69        29
           3       0.80      0.40      0.53        10
           4       0.33      0.50      0.40         6
           5       0.85      0.85      0.85        13
           7       0.80      0.80      0.80        15
           9       1.00      1.00      1.00         3
          11       1.00      0.50      0.67         2
          12       0.60      1.00      0.75         3
          13       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3

    accuracy                           0.70       114
   macro avg       0.57      0.57      0.55       114
weighted avg       0.70      0.70      0.69       114



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

# Use the best model to predict
best_knn = grid_search.best_estimator_
y_pred_optimized = best_knn.predict(X_test)

# Evaluate the optimized model
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
print(f"Optimized Accuracy: {accuracy_optimized:.2f}")
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred_optimized))




Best parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Best cross-validation accuracy: 0.69
Optimized Accuracy: 0.71
Optimized Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.83      0.71        12
           1       0.87      0.81      0.84        16
           2       0.76      0.66      0.70        29
           3       0.67      0.40      0.50        10
           4       0.27      0.50      0.35         6
           5       0.80      0.92      0.86        13
           7       0.81      0.87      0.84        15
           9       1.00      1.00      1.00         3
          11       1.00      0.50      0.67         2
          12       0.60      1.00      0.75         3
          13       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3

    accuracy                           0.71       114
   macro av

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
CCLE_expression_master_df = CCLE_expression_master_df.dropna()
CCLE_expression_master_df = CCLE_expression_master_df[~CCLE_expression_master_df['primary_disease'].isin(values_to_exclude)]

# Select features and target
X = CCLE_expression_master_df.drop(['DepMap_ID', 'primary_disease'], axis=1).select_dtypes(include=np.number)
y = CCLE_expression_master_df['primary_disease']

# Encode the target variable
y = pd.factorize(y)[0]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the KNN classifier
knn = KNeighborsClassifier(metric= 'euclidean', n_neighbors= 5, weights= 'distance')
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.71
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.83      0.71        12
           1       0.87      0.81      0.84        16
           2       0.76      0.66      0.70        29
           3       0.67      0.40      0.50        10
           4       0.27      0.50      0.35         6
           5       0.80      0.92      0.86        13
           7       0.81      0.87      0.84        15
           9       1.00      1.00      1.00         3
          11       1.00      0.50      0.67         2
          12       0.60      1.00      0.75         3
          13       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3

    accuracy                           0.71       114
   macro avg       0.57      0.58      0.56       114
weighted avg       0.71      0.71      0.70       114



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
