In [1]:
# import libraries 
import pandas as pd
import matplotlib.pyplot as plt 
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import os 
import zipfile

In [2]:
### Unzip data if needed 



# # Define paths for each zip file
# zip_files = {
#     "dataset": "FullDataset/bondugula_JDO_20230125_SLIM.csv.zip",
#     "notebook1": "FullDataset/full-data-nb.ipynb.zip",
#     "notebook2": "FullDataset/full-data-practice-041223.ipynb.zip",
#     "notebook3": "FullDataset/full-data-practice-061023.ipynb.zip",
#     "notebook4": "FullDataset/full-data-practice-20230225.ipynb.zip",
#     "unknown_csv": "FullDataset/s8-acetyl+sirt-output.csv.zip"
# }

# # Function to unzip files
# def unzip_file(zip_path, extract_to):
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         zip_ref.extractall(extract_to)
#     print(f"Extracted {zip_path} to {extract_to}")

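# # Directory to extract all files
# # NOTE(review): In [3] reads the CSV from "FullDataset/unzipped_data/dataset/...", but this
# # setting would extract it under "Unzipped_Files/dataset/..." — confirm which path is intended.
# extract_dir = "Unzipped_Files"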

# # Create the extraction directory if it doesn't exist
# os.makedirs(extract_dir, exist_ok=True)

# # Unzip each file
# for name, path in zip_files.items():
#     output_folder = os.path.join(extract_dir, name)  # Folder based on each file's label
#     os.makedirs(output_folder, exist_ok=True)
#     unzip_file(path, output_folder)

# View dataset

In [3]:
# Path to the unzipped dataset CSV
file_path = "FullDataset/unzipped_data/dataset/bondugula_JDO_20230125_SLIM.csv"

# Load the CSV, using its first column as the row index
data = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Show the available column names, then preview the first five rows
print(data.columns)
data.head(n=5)

Index(['Residue', 'E6', 'E20', 'Protein', 'No.', 'Res', 'isUnstruct', 'E6.1',
       'E20.1', 'E22', 'Vkbat', 'chou_fasman', 'sspro_5', 'gor4', 'dsc',
       'jnet', 'psipred', '# homologues', 'HAS_H', 'HAS_S', 'HAS_O', 'HAS_U',
       'ProteinID'],
      dtype='object')


Unnamed: 0,Residue,E6,E20,Protein,No.,Res,isUnstruct,E6.1,E20.1,E22,...,gor4,dsc,jnet,psipred,# homologues,HAS_H,HAS_S,HAS_O,HAS_U,ProteinID
0,D,0.926212,0.926212,2BDE,1,D,0.954544,,,,...,Other,Other,Other,Other,6,0,0,0,1,2BDE_0
1,T,1.307625,2.024379,2BDE,2,T,0.805483,,,,...,Other,Other,Other,Other,6,0,0,0,1,2BDE_0
2,H,1.321518,1.66809,2BDE,3,H,0.654102,,,,...,Other,Other,Other,Helix,6,0,0,0,1,2BDE_0
3,K,0.528085,0.845275,2BDE,4,K,0.5016,,,,...,Other,Other,Other,Helix,6,0,0,1,1,2BDE_0
4,V,0.0,0.583619,2BDE,5,V,0.280561,,,,...,Sheet,Sheet,Sheet,Helix,6,0,1,0,1,2BDE_0


We know the descriptors are:
* E6 (?)
* E20 (?)
* isUnstruct (Disorder Propensity)

The target we need to define is the switch. A row counts as a switch when the sum of HAS_H, HAS_S, HAS_O, and HAS_U is greater than 1 (for 0/1 flags, that means more than one of them is set). We need to create this column.

In [4]:
# Copy the four structure-flag columns into their own frame.
# Selecting by name (rather than by position with iloc[:, -5:-1]) keeps this
# correct if the column order of `data` ever changes, and the explicit .copy()
# makes switch_det independent of `data`, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
flag_columns = ["HAS_H", "HAS_S", "HAS_O", "HAS_U"]
switch_det = data[flag_columns].copy()

# Define a new column that detects a switch occurring:
# binary column, 1 when the flags sum to more than 1, otherwise 0.
switch_det['switch'] = (switch_det[flag_columns].sum(axis=1) > 1).astype(int)

# Preview the first rows
switch_det.head(3)

Unnamed: 0,HAS_H,HAS_S,HAS_O,HAS_U,switch
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0


In [5]:
# Tally how many rows fall into each value of the switch column
switch_counts = switch_det['switch'].value_counts()
print(switch_counts)

switch
0    914237
1    179049
Name: count, dtype: int64


In [6]:
# Add our switch column to the main frame.
# pd.concat already returns a new DataFrame and leaves `data` untouched, so no
# separate data.copy() is needed first (that copy was discarded immediately).
# Rows are aligned on the index, which switch_det shares with `data`.
data2 = pd.concat([data, switch_det['switch']], axis=1)
data2.head(5)

Unnamed: 0,Residue,E6,E20,Protein,No.,Res,isUnstruct,E6.1,E20.1,E22,...,dsc,jnet,psipred,# homologues,HAS_H,HAS_S,HAS_O,HAS_U,ProteinID,switch
0,D,0.926212,0.926212,2BDE,1,D,0.954544,,,,...,Other,Other,Other,6,0,0,0,1,2BDE_0,0
1,T,1.307625,2.024379,2BDE,2,T,0.805483,,,,...,Other,Other,Other,6,0,0,0,1,2BDE_0,0
2,H,1.321518,1.66809,2BDE,3,H,0.654102,,,,...,Other,Other,Helix,6,0,0,0,1,2BDE_0,0
3,K,0.528085,0.845275,2BDE,4,K,0.5016,,,,...,Other,Other,Helix,6,0,0,1,1,2BDE_0,1
4,V,0.0,0.583619,2BDE,5,V,0.280561,,,,...,Sheet,Sheet,Helix,6,0,1,0,1,2BDE_0,1


# Let's check that our columns have valid data

In [10]:
# Keep only the three descriptors and the target column
model_columns = ["E6", "E20", "isUnstruct", "switch"]
data3 = data2[model_columns]

# Count the missing values in each of the selected columns
missing_counts = data3.isnull().sum()
print("Missing data:")
print(missing_counts)

Missing data:
E6            3710
E20           3710
isUnstruct       0
switch           0
dtype: int64


In [11]:
# Discard every row that has a missing value in any of the selected columns
data3_cleaned = data3.dropna(axis=0, how="any")

In [None]:
# Separate the cleaned frame into predictors and the binary target
feature_columns = ["E6", "E20", "isUnstruct"]
y = data3_cleaned["switch"]

# Prepend a constant column so the fitted model includes an intercept term
X = sm.add_constant(data3_cleaned[feature_columns])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): this is a row-level shuffle, so rows sharing a ProteinID can land
# in both the training and the test set — confirm that is acceptable here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.20,
    random_state=42,
    shuffle=True,
)

# Build the logistic regression on the training rows and fit it
model = sm.Logit(y_train, X_train)
result = model.fit()

# Print the fit summary, followed by a blank line
print(result.summary())
print()

Optimization terminated successfully.
         Current function value: 0.443375
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 switch   No. Observations:               871660
Model:                          Logit   Df Residuals:                   871656
Method:                           MLE   Df Model:                            3
Date:                Sun, 10 Nov 2024   Pseudo R-squ.:                0.005681
Time:                        10:10:22   Log-Likelihood:            -3.8647e+05
converged:                       True   LL-Null:                   -3.8868e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.8059      0.005   -370.111      0.000      -1.816      -1.796
E6             0.1967      0.

In [None]:
# Evaluate the fitted Logit model (`result`) on the held-out rows.

# Step 1: Predict probabilities for the test data (X_test)
y_pred_prob = result.predict(X_test)

# Step 2: Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_prob > 0.5).astype(int)  # Using 0.5 as the threshold for now

# Step 3: Compare predicted values to actual values (y_test)
correct_predictions = (y_pred == y_test).sum()

# Step 4: Compute accuracy
accuracy = correct_predictions / len(y_test)
print(f"Accuracy: {accuracy:.4f}")

# Step 5: Put the accuracy in context. The class counts printed earlier show
# that roughly 84% of rows are non-switch, so a model that always predicts 0
# already reaches about that accuracy. Report the share of the most common
# class in y_test as the baseline the model has to beat.
majority_baseline = y_test.value_counts(normalize=True).max()
print(f"Majority-class baseline: {majority_baseline:.4f}")

# Step 6: Confusion matrix (rows = actual, columns = predicted), which shows
# whether any switch rows are being predicted at the 0.5 threshold at all.
confusion = pd.crosstab(y_test, y_pred, rownames=["actual"], colnames=["predicted"])
print(confusion)

Accuracy: 0.8359
