In [None]:
import numpy as np
from Bio.PDB import PDBParser, NeighborSearch
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
from Bio.SeqUtils.ProtParam import ProteinAnalysis



In [None]:
# 1. Calculate distance between amino acids
def calculate_aa_distance(structure, chain_id, residue1, residue2):
    """Return the Euclidean distance between the alpha carbons of two residues.

    Both residues are looked up in chain ``chain_id`` of the first model
    (``structure[0]``); the distance is measured between the ``.coord``
    vectors of their ``'CA'`` atoms.

    Args:
        structure: Indexable structure (model -> chain -> residue -> atom).
        chain_id: Key of the chain holding both residues.
        residue1: Key of the first residue within the chain.
        residue2: Key of the second residue within the chain.

    Returns:
        The norm of the difference between the two CA coordinate vectors.
    """
    residues = structure[0][chain_id]
    ca_first, ca_second = (residues[key]['CA'].coord for key in (residue1, residue2))
    return np.linalg.norm(ca_first - ca_second)



In [None]:
# 2. Identify potential binding pockets
def identify_binding_pockets(structure, probe_radius=1.4, min_size=50):
    """Cluster sparsely-surrounded atoms into candidate binding pockets.

    An atom is treated as a surface atom when a neighbour search centred on
    its own coordinate finds at most three atoms within ``probe_radius``
    (a deliberately crude surface test). The surface coordinates are then
    clustered with DBSCAN (``eps=3``), and every non-noise cluster is
    reported as one pocket.

    Args:
        structure: Object exposing ``get_atoms()``; each atom needs ``.coord``.
        probe_radius: Search radius used by the surface test.
        min_size: Passed to DBSCAN as ``min_samples``.

    Returns:
        A list of ``(n_atoms, 3)`` coordinate arrays, one per cluster, ordered
        by ascending cluster label. The list is empty when the structure has
        no atoms, no atom passes the surface test, or DBSCAN labels every
        point as noise.
    """
    atoms = list(structure.get_atoms())
    if not atoms:
        # Nothing to search; the neighbour search cannot be built on zero atoms.
        return []

    ns = NeighborSearch(atoms)
    surface_coords = [
        atom.coord
        for atom in atoms
        if len(ns.search(atom.coord, probe_radius)) <= 3  # Simplified surface detection
    ]
    if not surface_coords:
        # DBSCAN needs a 2-D array with at least one sample.
        return []

    coords = np.array(surface_coords)
    labels = DBSCAN(eps=3, min_samples=min_size).fit(coords).labels_

    # np.unique returns sorted labels, which gives a stable pocket order;
    # label -1 is DBSCAN's noise bucket and is skipped.
    return [coords[labels == label] for label in np.unique(labels) if label != -1]



In [None]:
# 3. Calculate RMSD between protein structures
def calculate_rmsd(structure1, structure2):
    """Compute the root-mean-square deviation between two structures.

    Atoms are paired purely by their position in ``get_atoms()`` order, and
    the coordinates are compared as given: no superposition or alignment is
    performed here.

    Args:
        structure1: Object exposing ``get_atoms()``; each atom needs ``.coord``.
        structure2: Second structure with the same number of atoms.

    Returns:
        ``sqrt(sum(|r1_i - r2_i|^2) / n_atoms)``.

    Raises:
        ValueError: If the atom counts differ, or if both structures are
            empty (the RMSD would otherwise be an undefined 0/0).
    """
    coords1 = np.array([atom.coord for atom in structure1.get_atoms()])
    coords2 = np.array([atom.coord for atom in structure2.get_atoms()])

    n_atoms = len(coords1)
    if n_atoms != len(coords2):
        raise ValueError("Structures have different number of atoms")
    if n_atoms == 0:
        raise ValueError("Structures contain no atoms")

    squared_deviation = np.sum((coords1 - coords2) ** 2)
    return np.sqrt(squared_deviation / n_atoms)



In [None]:
# 4. Simple molecular docking simulation
def simple_docking(protein, ligand, n_conformers=50):
    """Pick the ligand conformer whose centroid lies closest to a pocket centre.

    This is a toy scoring scheme: the ligand is never translated or rotated,
    so the score is simply the distance between each conformer's centroid
    and the centroid of the first pocket returned by
    ``identify_binding_pockets``.

    Args:
        protein: Structure passed to ``identify_binding_pockets``.
        ligand: RDKit molecule; conformers are embedded into it in place.
        n_conformers: Number of conformers requested from the embedder.

    Returns:
        ``(best_conformer_id, best_score)``. If no conformer could be
        embedded, ``(None, inf)`` is returned.

    Raises:
        ValueError: If no binding pocket is found in ``protein``.
    """
    pockets = identify_binding_pockets(protein)
    if len(pockets) == 0:
        raise ValueError("No binding pocket found in protein")
    # Assume first pocket is binding site
    binding_site_center = np.mean(pockets[0], axis=0)

    # The embedder returns the ids it actually created, which can be fewer
    # than requested; iterate over those rather than range(n_conformers).
    conformer_ids = list(AllChem.EmbedMultipleConfs(ligand, numConfs=n_conformers))
    if not conformer_ids:
        return None, float('inf')

    # One call optimises every conformer, so it belongs outside the loop.
    AllChem.MMFFOptimizeMoleculeConfs(ligand, maxIters=500)

    best_score = float('inf')
    best_conformer = None
    for conf_id in conformer_ids:
        positions = ligand.GetConformer(conf_id).GetPositions()
        score = np.linalg.norm(positions.mean(axis=0) - binding_site_center)
        if score < best_score:
            best_score = score
            best_conformer = conf_id

    return best_conformer, best_score



In [None]:
# 5. Predict protein secondary structure
def predict_secondary_structure(sequence):
    """Assign a secondary-structure letter to every residue of ``sequence``.

    Each residue is scored against three small propensity tables (helix
    ``'H'``, sheet ``'E'``, coil ``'C'``); a residue missing from a table
    scores 1.0 there. The highest-scoring state wins, and ties resolve in
    the order H, E, C, so residues absent from every table come out as 'H'.

    Returns:
        A string of 'H'/'E'/'C' characters with the same length as ``sequence``.
    """
    states = ('H', 'E', 'C')
    propensity_table = {
        'H': {'A': 1.45, 'E': 1.53, 'L': 1.34},  # Helix
        'E': {'V': 1.7, 'I': 1.6, 'T': 1.2},    # Sheet
        'C': {'G': 1.2, 'P': 1.2, 'S': 1.2}     # Coil
    }

    def best_state(residue):
        # max() keeps the first maximum, so the H, E, C order breaks ties.
        return max(states, key=lambda state: propensity_table[state].get(residue, 1.0))

    return ''.join(best_state(residue) for residue in sequence)



In [None]:
# 6. Visualize protein-ligand interactions
def visualize_protein_ligand(protein, ligand):
    """Print a fixed text placeholder for a protein-ligand picture.

    Neither argument is inspected: the output is always a three-line header
    followed by a 10x10 grid whose top five rows are 'P' cells and whose
    bottom five rows are 'L' cells. A real implementation would hand the
    structures to a molecular viewer instead.
    """
    grid_size = 10
    header_lines = (
        "Protein-Ligand Interaction Visualization",
        "P: Protein atom, L: Ligand atom",
        "----------------------------------------",
    )
    for header_line in header_lines:
        print(header_line)

    for row_index in range(grid_size):
        cell = 'P ' if row_index < 5 else 'L '
        print(cell * grid_size)




In [None]:
# 7. Calculate hydrophobicity profile
def hydrophobicity_profile(sequence, window_size=7):
    """Return the sliding-window Kyte-Doolittle hydrophobicity of a sequence.

    Args:
        sequence: Protein sequence as a string of one-letter codes.
        window_size: Width of the sliding window.

    Returns:
        The list produced by ``ProteinAnalysis.protein_scale`` for the
        Kyte-Doolittle scale (one value per window position).
    """
    # protein_scale expects the scale itself (a residue -> value mapping),
    # not its name; Biopython ships the Kyte-Doolittle table as
    # ProtParamData.kd. Imported here so this function is self-contained.
    from Bio.SeqUtils.ProtParamData import kd

    analysis = ProteinAnalysis(sequence)
    return analysis.protein_scale(param_dict=kd, window=window_size)



In [None]:
# 8. Generate conformers of a small molecule
def generate_conformers(smiles, n_conformers=50):
    """Build a hydrogen-complete molecule carrying MMFF-optimised conformers.

    Args:
        smiles: SMILES string of the molecule.
        n_conformers: Number of conformers requested from the embedder.

    Returns:
        The RDKit molecule (with explicit hydrogens) holding the conformers
        that could be embedded; this may be fewer than ``n_conformers``.

    Raises:
        ValueError: If ``smiles`` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    mol = Chem.AddHs(mol)
    conformer_ids = AllChem.EmbedMultipleConfs(mol, numConfs=n_conformers)
    if len(conformer_ids) > 0:
        AllChem.MMFFOptimizeMoleculeConfs(mol, maxIters=500)
    return mol



In [None]:
# 9. Calculate electrostatic potential on protein surface
def electrostatic_potential(structure, probe_radius=1.4):
    """Assign a crude pseudo-potential to each surface atom of a structure.

    Surface atoms are those for which a neighbour search centred on the atom
    finds at most three atoms within ``probe_radius``. Each surface atom gets
    ``charge / (|coord| + 1)``, where ``charge`` depends only on the element
    and ``|coord|`` is the atom's distance from the coordinate origin. This
    is a placeholder model, not a physical electrostatics calculation.

    Args:
        structure: Object exposing ``get_atoms()``; atoms need ``.coord``
            and ``.element``.
        probe_radius: Search radius used by the surface test.

    Returns:
        ``(surface_atoms, potentials)`` as two lists of equal length; both
        are empty when the structure has no atoms.
    """
    atoms = list(structure.get_atoms())
    if not atoms:
        # The neighbour search cannot be built on zero atoms.
        return [], []

    ns = NeighborSearch(atoms)
    surface_atoms = [
        atom
        for atom in atoms
        if len(ns.search(atom.coord, probe_radius)) <= 3  # Simplified surface detection
    ]

    # NOTE(review): oxygen is given +1 and nitrogen -1 here, which looks
    # inverted relative to the usual partial charges in proteins; kept as-is
    # so existing results do not change. Confirm the intended convention.
    element_charge = {'O': 1, 'N': -1}
    potentials = [
        element_charge.get(atom.element, 0) / (np.linalg.norm(atom.coord) + 1)
        for atom in surface_atoms
    ]

    return surface_atoms, potentials



In [None]:
# 10. Virtual screening of compound library
def virtual_screening(protein, compound_library, n_conformers=10):
    """Score every compound of a library against a protein with ``simple_docking``.

    Args:
        protein: Structure handed to ``simple_docking``.
        compound_library: Iterable of SMILES strings.
        n_conformers: Conformers generated per compound.

    Returns:
        A list of ``(smiles, score)`` tuples sorted by ascending score
        (lower is better). SMILES strings that cannot be parsed are reported
        on stdout and left out of the result, so one bad entry does not
        abort the whole screen.
    """
    results = []
    for smiles in compound_library:
        ligand = Chem.MolFromSmiles(smiles)
        if ligand is None:
            # MolFromSmiles signals a parse failure by returning None.
            print(f"Skipping unparsable SMILES: {smiles}")
            continue
        ligand = Chem.AddHs(ligand)
        _, score = simple_docking(protein, ligand, n_conformers)
        results.append((smiles, score))

    return sorted(results, key=lambda entry: entry[1])



In [None]:
# 11. Implement a function to perform fragment growing by connecting a core fragment with a library of smaller fragments, considering geometric constraints.
from rdkit import Chem
from rdkit.Chem import AllChem
import random

def generate_fragment_library(n_fragments=1000, max_heavy_atoms=10):
    """Generate a library of small random acyclic C/N/O fragments.

    Every fragment starts as a single carbon and is grown by attaching
    between 1 and ``max_heavy_atoms - 1`` further atoms (carbon, nitrogen or
    oxygen) through single bonds. New atoms are only attached to atoms that
    still have a free valence, so the final ``SanitizeMol`` call does not
    fail on over-bonded atoms.

    Args:
        n_fragments: Number of fragments to generate.
        max_heavy_atoms: Upper bound on heavy atoms per fragment. Values of
            1 or less yield single-carbon fragments.

    Returns:
        A list of sanitized RDKit molecules.
    """
    # Largest number of single bonds each allowed element may carry.
    max_bonds = {6: 4, 7: 3, 8: 2}

    fragments = []
    for _ in range(n_fragments):
        editable = Chem.RWMol(Chem.MolFromSmiles('C'))  # Start with a carbon atom
        n_additions = random.randint(1, max_heavy_atoms - 1) if max_heavy_atoms > 1 else 0
        for _ in range(n_additions):
            # A single-bonded tree always has at least one atom with spare
            # valence, so this list is never empty.
            open_atoms = [
                atom.GetIdx()
                for atom in editable.GetAtoms()
                if atom.GetDegree() < max_bonds[atom.GetAtomicNum()]
            ]
            anchor_idx = random.choice(open_atoms)
            new_idx = editable.AddAtom(Chem.Atom(random.choice([6, 7, 8])))  # C, N, or O
            editable.AddBond(anchor_idx, new_idx, Chem.BondType.SINGLE)
        mol = editable.GetMol()
        Chem.SanitizeMol(mol)
        fragments.append(mol)
    return fragments

def grow_fragment(core, fragment_library, max_attempts=100):
    """Grow a core fragment by bonding it to one fragment from a library.

    Each attempt joins a random core atom that still carries an implicit
    hydrogen to a random atom of a random library fragment through a single
    bond. The candidate is accepted when it sanitizes, can be embedded in 3D
    (with explicit hydrogens) and its MMFF minimisation converges. Note that
    this is only a geometric feasibility test, not an explicit clash check.

    Args:
        core: SMILES string of the core fragment.
        fragment_library: Sequence of RDKit molecules to attach.
        max_attempts: Maximum number of random attachments to try.

    Returns:
        The SMILES of the first accepted molecule, or ``None`` if the library
        is empty, the core has no attachable atom, or every attempt failed.

    Raises:
        ValueError: If ``core`` cannot be parsed as SMILES.
    """
    if not fragment_library:
        return None

    core_mol = Chem.MolFromSmiles(core)
    if core_mol is None:
        raise ValueError(f"Could not parse core SMILES: {core!r}")

    n_core_atoms = core_mol.GetNumAtoms()
    # Only atoms with a replaceable implicit hydrogen can accept a new bond.
    open_core_atoms = [
        atom.GetIdx() for atom in core_mol.GetAtoms() if atom.GetImplicitValence() != 0
    ]
    if not open_core_atoms:
        return None

    for _ in range(max_attempts):
        core_idx = random.choice(open_core_atoms)
        fragment = random.choice(fragment_library)
        fragment_idx = random.randrange(fragment.GetNumAtoms())

        # Fragment atoms are appended after the core atoms in the combined
        # molecule, hence the index offset.
        combined = Chem.RWMol(Chem.CombineMols(core_mol, fragment))
        combined.AddBond(core_idx, fragment_idx + n_core_atoms, Chem.BondType.SINGLE)

        try:
            candidate = combined.GetMol()
            Chem.SanitizeMol(candidate)

            # Embed and minimise a hydrogen-complete copy; the returned
            # SMILES is still written from the heavy-atom molecule.
            candidate_3d = Chem.AddHs(candidate)
            if AllChem.EmbedMolecule(candidate_3d) == -1:
                continue
            # MMFFOptimizeMolecule returns 0 only when the minimisation
            # converged (1 = needs more iterations, -1 = no force field).
            if AllChem.MMFFOptimizeMolecule(candidate_3d, maxIters=500) == 0:
                return Chem.MolToSmiles(candidate)
        except Exception:
            # Chemically invalid attachment (e.g. valence violation): retry.
            continue

    return None

# Example usage
# Seed Python's RNG so the random fragment library and the random attachment
# picks are repeatable across "Restart Kernel -> Run All".
random.seed(42)
fragment_library = generate_fragment_library()
core_fragment = "c1ccccc1"  # Benzene ring as core
grown_molecule = grow_fragment(core_fragment, fragment_library)
print(f"Grown molecule: {grown_molecule}")

In [None]:
# 12. Create a script to calculate quantum mechanical descriptors (e.g., HOMO-LUMO energies, dipole moment) for a set of small molecules using an appropriate quantum chemistry package.
from pyscf import gto, scf, dft
import pandas as pd

def calculate_qm_descriptors(smiles):
    """Calculate quantum mechanical descriptors for a molecule.

    PySCF needs element symbols with Cartesian coordinates, not a SMILES
    string, so a 3D geometry is first generated with RDKit (explicit
    hydrogens, seeded embedding, MMFF pre-optimisation). A restricted
    B3LYP/6-31G calculation is then run on that geometry.

    Args:
        smiles: SMILES string of a closed-shell molecule.

    Returns:
        A dict with the keys 'HOMO Energy', 'LUMO Energy', 'HOMO-LUMO Gap',
        'Dipole Moment' and 'Total Energy'. Energies are the values reported
        by PySCF; the dipole entry is the magnitude of the vector returned
        by ``dip_moment()``, which does not depend on how the generated
        geometry happens to be oriented.

    Raises:
        ValueError: If the SMILES cannot be parsed or embedded in 3D.
        RuntimeError: If the SCF calculation does not converge.
    """
    rd_mol = Chem.MolFromSmiles(smiles)
    if rd_mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    rd_mol = Chem.AddHs(rd_mol)
    # A fixed seed keeps the generated geometry, and hence the descriptors,
    # reproducible between runs.
    if AllChem.EmbedMolecule(rd_mol, randomSeed=42) == -1:
        raise ValueError(f"Could not generate a 3D geometry for {smiles!r}")
    AllChem.MMFFOptimizeMolecule(rd_mol, maxIters=500)

    conformer = rd_mol.GetConformer()
    geometry = []
    for atom in rd_mol.GetAtoms():
        position = conformer.GetAtomPosition(atom.GetIdx())
        geometry.append((atom.GetSymbol(), (position.x, position.y, position.z)))

    mol = gto.Mole()
    mol.build(atom=geometry, basis='6-31g', charge=Chem.GetFormalCharge(rd_mol))

    # Run DFT calculation
    mf = dft.RKS(mol)
    mf.xc = 'b3lyp'
    mf.kernel()
    if not mf.converged:
        raise RuntimeError(f"SCF did not converge for {smiles!r}")

    # Calculate descriptors. For a closed-shell system the first
    # nelectron // 2 orbitals are doubly occupied.
    n_occupied = mol.nelectron // 2
    homo_energy = mf.mo_energy[n_occupied - 1]
    lumo_energy = mf.mo_energy[n_occupied]
    dipole_vector = mf.dip_moment()

    return {
        'HOMO Energy': homo_energy,
        'LUMO Energy': lumo_energy,
        'HOMO-LUMO Gap': lumo_energy - homo_energy,
        'Dipole Moment': float(np.linalg.norm(dipole_vector)),  # magnitude
        'Total Energy': mf.e_tot
    }

def process_molecule_set(smiles_list):
    """Build a table of QM descriptors for a collection of SMILES strings.

    Each SMILES is passed to ``calculate_qm_descriptors``; the returned
    record gets an extra 'SMILES' entry. Any exception raised for a molecule
    is reported on stdout and that molecule is left out of the table.

    Returns:
        A pandas DataFrame with one row per successfully processed molecule.
    """
    def describe(smiles):
        # Returns the descriptor record, or None when the calculation failed.
        try:
            record = calculate_qm_descriptors(smiles)
            record['SMILES'] = smiles
            return record
        except Exception as err:
            print(f"Error processing {smiles}: {str(err)}")
            return None

    records = (describe(smiles) for smiles in smiles_list)
    return pd.DataFrame([record for record in records if record is not None])

# Example usage
# Small closed-shell test set, given as SMILES strings.
molecules = [
    'C',  # Methane
    'CC',  # Ethane
    'O=C=O',  # Carbon dioxide
    'N#N',  # Nitrogen
    'O'  # Water (SMILES 'O' is an oxygen atom with implicit hydrogens, not O2)
]

# One row per molecule that could be processed; failures are printed by
# process_molecule_set and skipped.
results_df = process_molecule_set(molecules)
print(results_df)

In [None]:
# 13. Develop a basic generative model (e.g., a character-level RNN) for generating SMILES strings of drug-like molecules.
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def create_smiles_dataset(smiles_list, seq_length):
    """Create next-character training pairs from a list of SMILES strings.

    Every SMILES is terminated with a newline so the model can learn where
    a string ends (``generate_smiles`` stops on that character). For each
    position ``i >= 1`` of an encoded string, the prefix ``encoded[:i]`` is
    an input and ``encoded[i]`` is its target.

    Args:
        smiles_list: Training SMILES strings.
        seq_length: Length the input prefixes are left-padded to.

    Returns:
        ``(input_sequences, output_sequences, tokenizer, total_chars)``:
        the padded integer inputs, the one-hot targets, the fitted
        character tokenizer, and the vocabulary size including the padding
        index 0.
    """
    # lower=False: SMILES is case-sensitive ('C' is aliphatic, 'c' aromatic),
    # and the Tokenizer lower-cases by default. filters='' keeps every
    # character, including the newline terminator.
    tokenizer = Tokenizer(char_level=True, lower=False, filters='')
    terminated_smiles = [smile + '\n' for smile in smiles_list]
    tokenizer.fit_on_texts(terminated_smiles)
    total_chars = len(tokenizer.word_index) + 1  # +1 for the padding index 0

    input_sequences = []
    output_sequences = []

    for encoded in tokenizer.texts_to_sequences(terminated_smiles):
        for i in range(1, len(encoded)):
            input_sequences.append(encoded[:i])
            output_sequences.append(encoded[i])

    input_sequences = pad_sequences(input_sequences, maxlen=seq_length, padding='pre')
    output_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=total_chars)

    return input_sequences, output_sequences, tokenizer, total_chars

def build_model(total_chars, seq_length):
    """Assemble and compile the character-level LSTM used for SMILES generation.

    The network is: 64-dimensional embedding -> two stacked 128-unit LSTMs
    (the first returns the full sequence) -> 64-unit ReLU layer -> softmax
    over ``total_chars`` classes. It is compiled with categorical
    cross-entropy, the Adam optimizer and an accuracy metric.
    """
    model = Sequential()
    model.add(Embedding(total_chars, 64, input_length=seq_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(total_chars, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

def generate_smiles(model, tokenizer, seq_length, start_char='C', num_chars=50):
    """Generate a new SMILES string by greedy character-by-character decoding.

    Starting from ``start_char``, the text generated so far is encoded,
    left-padded to ``seq_length`` and fed to the model; the most probable
    next character is appended. Decoding is deterministic (arg-max), so the
    same model and start character always give the same string.

    Args:
        model: Trained model exposing ``predict``.
        tokenizer: Fitted character tokenizer (``texts_to_sequences``,
            ``index_word``).
        seq_length: Input length the model was trained with.
        start_char: Seed text of the generated string.
        num_chars: Maximum number of characters to append.

    Returns:
        The generated string. Generation stops early when the model predicts
        the newline terminator or the padding index; neither is included in
        the result.
    """
    generated = start_char
    for _ in range(num_chars):
        encoded = tokenizer.texts_to_sequences([generated])[0]
        padded = pad_sequences([encoded], maxlen=seq_length, padding='pre')
        probabilities = model.predict(padded, verbose=0)[0]
        next_char_index = int(np.argmax(probabilities))
        # Index 0 is reserved for padding and has no entry in index_word.
        next_char = tokenizer.index_word.get(next_char_index)
        if next_char is None or next_char == '\n':
            break
        generated += next_char
    return generated

# Example usage
# Tiny illustrative training set of five SMILES strings.
smiles_list = [
    'CC(=O)OC1=CC=CC=C1C(=O)O',
    'CC1=C(C(=O)NO)C(=O)C2=C(C1=O)C=CC=C2',
    'CC1=C(C(=O)C2=C(C1=O)C=CC=C2)C(=O)O',
    'CC1=CC=C(C=C1)C2=C(C(=O)O)C(=O)C3=CC=CC=C3O2',
    'CC1=NC=C(C(=N1)N)C(=O)N'
]

# Length that every input prefix is padded to; also the model's input length.
seq_length = 50
input_sequences, output_sequences, tokenizer, total_chars = create_smiles_dataset(smiles_list, seq_length)

model = build_model(total_chars, seq_length)
model.fit(input_sequences, output_sequences, epochs=50, batch_size=32)

# Generate new SMILES strings
# NOTE: generate_smiles picks the arg-max character at every step and is
# always started from its default start character here, so these five calls
# should print the same string.
for _ in range(5):
    generated_smiles = generate_smiles(model, tokenizer, seq_length)
    print(generated_smiles)

In [None]:
# 14. Write a program to perform a simple QM/MM calculation on a protein-ligand complex, treating the ligand quantum mechanically and the protein with molecular mechanics.
from pyscf import gto, scf, qmmm
# PySCF's pyberny interface is the module `pyscf.geomopt.berny_solver`; it is
# bound to the name `berny_optimizer` so existing references keep working.
from pyscf.geomopt import berny_solver as berny_optimizer
import numpy as np

def _read_pdb_coords(pdb_path):
    """Return ATOM/HETATM coordinates as an (n, 3) array, or None if the file has no such records."""
    coords = []
    with open(pdb_path) as handle:
        for line in handle:
            if line.startswith(('ATOM', 'HETATM')):
                # Fixed-width PDB columns 31-38, 39-46 and 47-54 hold x, y, z.
                coords.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
    return np.array(coords) if coords else None


def _read_xyz_atoms(xyz_path):
    """Parse an XYZ file into [(symbol, (x, y, z)), ...], or return None if it lacks the atom-count header."""
    with open(xyz_path) as handle:
        lines = handle.read().splitlines()

    header = lines[0].split() if lines else []
    if len(header) != 1 or not header[0].isdigit():
        return None

    n_atoms = int(header[0])
    atoms = []
    # Line 1 is the atom count, line 2 a free-text comment; atoms follow.
    for line in lines[2:2 + n_atoms]:
        symbol, x, y, z = line.split()[:4]
        atoms.append((symbol, (float(x), float(y), float(z))))
    if len(atoms) != n_atoms:
        raise ValueError(f"{xyz_path}: expected {n_atoms} atoms, found {len(atoms)}")
    return atoms


def prepare_protein_ligand_complex(protein_pdb, ligand_xyz):
    """Prepare the protein-ligand complex for QM/MM calculation.

    The protein supplies the MM point-charge positions and the ligand is the
    QM region. No force field is set up for the protein.

    Args:
        protein_pdb: Path to the protein file. ATOM/HETATM records of a PDB
            file are read; a file without such records is loaded as a plain
            numeric table with ``np.loadtxt``.
        ligand_xyz: Path to the ligand file. A standard XYZ file (atom count,
            comment line, then ``symbol x y z`` rows) is parsed; any other
            file is loaded as a plain numeric table with ``np.loadtxt``.

    Returns:
        ``(qm_coords, mm_coords)``. For an XYZ ligand ``qm_coords`` is a list
        of ``(symbol, (x, y, z))`` tuples; for a PDB protein ``mm_coords`` is
        an ``(n_atoms, 3)`` array. In the numeric-table fallback each is the
        array returned by ``np.loadtxt``.
    """
    mm_coords = _read_pdb_coords(protein_pdb)
    if mm_coords is None:
        mm_coords = np.loadtxt(protein_pdb)

    qm_coords = _read_xyz_atoms(ligand_xyz)
    if qm_coords is None:
        qm_coords = np.loadtxt(ligand_xyz)

    return qm_coords, mm_coords

def run_qmmm(qm_coords, mm_coords):
    """Run QM/MM calculation.

    Builds the QM region with the 6-31G basis, embeds an RHF calculation in
    the field of MM point charges and optimises the QM geometry in that
    field with the berny solver.

    Args:
        qm_coords: Atom specification of the QM region, in any form accepted
            by ``gto.Mole.atom``.
        mm_coords: Positions of the MM point charges, one row per charge.

    Returns:
        The optimised QM molecule returned by the geometry optimiser.
    """
    # Imported here so the function does not depend on how the cell-level
    # geometry-optimiser import is spelled.
    from pyscf.geomopt import berny_solver

    mol = gto.Mole()
    mol.atom = qm_coords
    mol.basis = '6-31g'
    mol.build()

    charges = np.ones(len(mm_coords))  # Simplified: assume all MM atoms have +1 charge
    mf = qmmm.mm_charge(scf.RHF(mol), mm_coords, charges)

    # The optimiser takes the method object itself and drives the energy and
    # gradient evaluations; no hand-written energy callback is needed.
    mol_eq = berny_solver.optimize(mf)
    return mol_eq

def analyze_results(mol_eq):
    """Print a short summary for the optimised QM region.

    A fresh RHF calculation is run on ``mol_eq`` alone (no MM charges are
    attached here), then the coordinates, the total energy and the gap
    between orbitals ``nelectron // 2`` and ``nelectron // 2 - 1`` are
    printed.
    """
    rhf = scf.RHF(mol_eq)
    rhf.kernel()

    print("Optimized QM geometry:")
    print(mol_eq.atom_coords())
    print("\nTotal energy:", rhf.e_tot)

    homo_index = mol_eq.nelectron // 2 - 1
    lumo_index = mol_eq.nelectron // 2
    orbital_energies = rhf.mo_energy
    print("HOMO-LUMO gap:", orbital_energies[lumo_index] - orbital_energies[homo_index])

# Example usage
# Expects "protein.pdb" and "ligand.xyz" in the current working directory:
# load the coordinates, run the QM/MM geometry optimisation, print a summary.
qm_coords, mm_coords = prepare_protein_ligand_complex("protein.pdb", "ligand.xyz")
mol_eq = run_qmmm(qm_coords, mm_coords)
analyze_results(mol_eq)

In [None]:
# 15. Implement a reinforcement learning algorithm to optimize generated molecules towards desired properties.
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt

class MoleculeEnvironment:
    """Toy molecule-building environment for reinforcement learning.

    An episode starts from a single carbon. Each action appends one atom
    (0 = carbon, 1 = nitrogen, 2 = oxygen) through a single bond to a
    randomly chosen existing atom. The state is the SMILES of the current
    molecule, and the reward measures how close the molecular weight and
    logP are to their targets.
    """

    # Action index -> atomic number of the atom that action appends.
    _ACTION_ELEMENTS = {0: 6, 1: 7, 2: 8}

    def __init__(self, target_weight=500, target_logp=2.5, max_atoms=10):
        """Store the property targets and start from a single carbon.

        Args:
            target_weight: Desired molecular weight.
            target_logp: Desired logP.
            max_atoms: Atom count at which an episode ends.
        """
        self.target_weight = target_weight
        self.target_logp = target_logp
        self.max_atoms = max_atoms
        self.mol = Chem.MolFromSmiles('C')
        self.done = False

    def reset(self):
        """Restart the episode from a single carbon and return its state."""
        self.mol = Chem.MolFromSmiles('C')
        self.done = False
        return self._get_state()

    def step(self, action):
        """Apply one action and return ``(state, reward, done)``.

        A finished episode returns reward 0. An unknown action, or an
        attachment that fails sanitization, returns reward -1 and leaves the
        molecule unchanged.
        """
        if self.done:
            return self._get_state(), 0, True

        atomic_num = self._ACTION_ELEMENTS.get(action)
        if atomic_num is None:
            return self._get_state(), -1, False

        rwmol = Chem.RWMol(self.mol)
        n_existing = rwmol.GetNumAtoms()
        new_idx = rwmol.AddAtom(Chem.Atom(atomic_num))

        # Bond the new atom to a random atom that existed before it was
        # added (randint's upper bound is exclusive).
        if n_existing > 0:
            existing_idx = np.random.randint(0, n_existing)
            rwmol.AddBond(existing_idx, new_idx, Chem.BondType.SINGLE)

        try:
            Chem.SanitizeMol(rwmol)
        except Exception:
            # Invalid chemistry (e.g. valence exceeded): penalise and keep the
            # previous molecule. Exception, not a bare except, so Ctrl-C still
            # interrupts a long training run.
            return self._get_state(), -1, False
        self.mol = rwmol.GetMol()

        reward = self._calculate_reward()
        self.done = rwmol.GetNumAtoms() >= self.max_atoms or reward > 0.9

        return self._get_state(), reward, self.done

    def _get_state(self):
        """Return the SMILES of the current molecule."""
        return Chem.MolToSmiles(self.mol)

    def _calculate_reward(self):
        """Average the relative closeness of weight and logP to their targets.

        Each score is 1 at the target and falls off linearly with the
        deviation; it can become negative for large deviations.
        """
        mol_weight = Descriptors.ExactMolWt(self.mol)
        mol_logp = Descriptors.MolLogP(self.mol)

        weight_score = 1 - abs(mol_weight - self.target_weight) / self.target_weight
        # max(..., 1) avoids dividing by a near-zero logP target.
        logp_score = 1 - abs(mol_logp - self.target_logp) / max(abs(self.target_logp), 1)

        return (weight_score + logp_score) / 2

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are kept in a dict that maps a state to an array holding one
    value per action; unseen states start at all zeros.
    """

    def __init__(self, n_actions, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            n_actions: Number of discrete actions.
            learning_rate: Step size of the Q-value update.
            discount_factor: Weight of the best next-state value.
            epsilon: Probability of choosing a random action.
        """
        self.q_table = {}
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.n_actions = n_actions

    def _q_values(self, state):
        """Return the Q-value row for ``state``, creating a zero row if needed."""
        if state not in self.q_table:
            self.q_table[state] = np.zeros(self.n_actions)
        return self.q_table[state]

    def get_action(self, state):
        """Choose a random action with probability epsilon, else the greedy one."""
        q_values = self._q_values(state)

        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return np.argmax(q_values)

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Both ``state`` and ``next_state`` are added to the table on demand,
        so ``update`` is safe to call for a state that ``get_action`` has
        never seen.
        """
        q_values = self._q_values(state)
        max_next_q = np.max(self._q_values(next_state))

        current_q = q_values[action]
        q_values[action] = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)

# Training loop
# The environment and the agent both draw from NumPy's global RNG; seeding it
# makes the learning curve repeatable across "Restart Kernel -> Run All".
np.random.seed(0)
env = MoleculeEnvironment()
agent = QLearningAgent(n_actions=3)
n_episodes = 1000

episode_rewards = []
best_molecule = None
best_reward = -float('inf')

for episode in range(n_episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)
        agent.update(state, action, reward, next_state)

        state = next_state
        total_reward += reward

        # `state` is already the molecule produced by this step, i.e. the
        # one that earned `reward`.
        if reward > best_reward:
            best_reward = reward
            best_molecule = state

    episode_rewards.append(total_reward)

    if episode % 100 == 0:
        print(f"Episode {episode}, Average Reward: {np.mean(episode_rewards[-100:]):.4f}, Best Molecule: {best_molecule}")

# Plot the learning curve: total reward collected in each training episode.
fig, ax = plt.subplots()
ax.plot(range(n_episodes), episode_rewards)
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('Learning Curve')
plt.show()

# Print the best molecule and its properties
print(f"\nBest Molecule: {best_molecule}")
best_mol = Chem.MolFromSmiles(best_molecule)
print(f"Molecular Weight: {Descriptors.ExactMolWt(best_mol):.2f}")
print(f"LogP: {Descriptors.MolLogP(best_mol):.2f}")

# Visualize the best molecule
# Draw is a submodule of rdkit.Chem that `from rdkit import Chem` does not
# load, so `Chem.Draw` is not guaranteed to exist; import it explicitly.
from rdkit.Chem import Draw

img = Draw.MolToImage(best_mol)
img.save("best_molecule.png")
print("Best molecule structure saved as 'best_molecule.png'")

In [None]:
# Example usage:
# parser = PDBParser()
# structure = parser.get_structure('protein', 'protein.pdb')
# distance = calculate_aa_distance(structure, 'A', 10, 20)

# pockets = identify_binding_pockets(structure)

# structure2 = parser.get_structure('protein2', 'protein2.pdb')
# rmsd = calculate_rmsd(structure, structure2)

# ligand = Chem.MolFromSmiles('CCO')
# best_conf, score = simple_docking(structure, ligand)

# sequence = "MKWVTFISLLLLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKALVLIAFAQYLQQCPFEDHVKLVNEVTEFAKTCVADESAENCDKS"
# ss_prediction = predict_secondary_structure(sequence)

# visualize_protein_ligand(structure, ligand)

# hydrophobicity = hydrophobicity_profile(sequence)

# conformers = generate_conformers('CCO')

# surface_atoms, potentials = electrostatic_potential(structure)

# compound_library = ['CCO', 'CCC', 'CCCO']
# screening_results = virtual_screening(structure, compound_library)