<a href="https://colab.research.google.com/github/evefine/useful_personal/blob/main/analysis_funcs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math
import importlib
import gpcrmining.gpcrdb as db
import csv
import os
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, LabelSet, HoverTool
from bokeh.layouts import layout
from bokeh.io import curdoc
from multiprocessing import Pool, cpu_count
import functools
import os
import csv
import math
from multiprocessing import Pool, cpu_count
import functools

# `db` is gpcrmining.gpcrdb (imported above); get_res_info below calls db.get_residue_info.


def collect_pdb_filenames(path):
    """
    List the PDB files found directly inside a directory.

    Args:
    - path: Directory to scan. Only its immediate entries are inspected
      (os.listdir), in whatever order the operating system returns them.

    Returns:
    - A tuple (pdb_filenames, gpcr_names) of two parallel lists: every entry whose
      name ends in '.pdb', and that same name with its last 9 characters removed
      (presumably a '_XXXX.pdb' suffix holding a 4-character PDB code; confirm
      against the naming scheme of the input files).
    """
    pdb_filenames = [entry for entry in os.listdir(path) if entry.endswith('.pdb')]
    # Drop the trailing 9 characters of each file name to obtain the GPCR name.
    gpcr_names = [entry[:-9] for entry in pdb_filenames]
    return pdb_filenames, gpcr_names


def get_res_info(gpcr):
    """
    Look up the residue information of one GPCR.

    Handles a single identifier per call so it can be used as the worker
    function of a process pool.

    Args:
    - gpcr: Identifier passed unchanged to db.get_residue_info.

    Returns:
    - A tuple (gpcr, residue_info) pairing the identifier with the lookup result.
    """
    residue_info = db.get_residue_info(gpcr)
    return (gpcr, residue_info)


def parallel_get_res_info(list_gpcrs):
    """
    Fetch residue info for several GPCRs using a pool of worker processes.

    Args:
    - list_gpcrs: Iterable of GPCR identifiers; each one is handed to get_res_info.

    Returns:
    - A dict mapping each identifier to the residue info returned for it.
    """
    worker_count = cpu_count()  # one worker process per reported CPU
    with Pool(processes=worker_count) as workers:
        name_info_pairs = workers.map(get_res_info, list_gpcrs)
    # get_res_info yields (name, info) tuples, which dict() consumes directly.
    return dict(name_info_pairs)


def _split_generic_residue(res):
    """Split a spec such as '2.46', '2.46CZ' or '12.50CB' into (generic number, atom type)."""
    segment, dot, rest = res.partition('.')
    position = rest[:2]
    if dot and segment.isdigit() and len(position) == 2 and position.isdigit():
        # '<segment>.<two-digit position>' followed by an optional atom name.
        # Parsing on the dot (instead of a fixed width) keeps two-digit
        # segments such as '12.50' or '34.50' intact.
        return f'{segment}.{position}', rest[2:] or 'CA'
    # Anything else keeps the historical fixed-width split: the first four
    # characters are the generic number, the remainder (if any) the atom type.
    return res[:4], res[4:] or 'CA'


def find_matching_values(nested_list, generic_res):
    """
    Translate generic residue numbers into (residue number, atom type) pairs.

    For every entry of generic_res the first sublist of nested_list whose last
    element starts with the requested generic number is selected. The atom type
    is only parsed from the request and attached to the result; it plays no part
    in the matching.

    Args:
    - nested_list: A nested list expected to have the format
      ['string', integer, 'string', 'string'], the last string being the generic
      number label. Sublists whose last element is not a string are skipped.
    - generic_res: A list of strings with generic numbers such as '2.46', or
      '2.46CZ' / '12.50CB' to request a specific atom type ('CA' if omitted).

    Returns:
    - A list with one item per entry of generic_res, in the same order: a tuple
      (integer, atom type) taken from the matching sublist, or None if no
      sublist matched.
    """
    matching_values = []
    for res in generic_res:
        generic_number, atom_type = _split_generic_residue(res)
        for row in nested_list:
            label = row[-1]
            if isinstance(label, str) and label.startswith(generic_number):
                matching_values.append((row[1], atom_type))
                break  # first match wins
        else:
            # Keep a placeholder so positions stay aligned with generic_res.
            matching_values.append(None)
    return matching_values


def find_coordinates_once(path, res_nums_and_atom_types):
    """
    Read the coordinates of selected atoms from a PDB file in a single pass.

    Only lines starting with "ATOM" are considered. Chain and alternate-location
    fields are not inspected, so if several ATOM records share the same residue
    number and atom name, the last one in the file is the one reported.

    Args:
    - path: Path of the PDB file to read.
    - res_nums_and_atom_types: List of (residue number, atom name) tuples, e.g.
      (131, 'CA') or (131, 'NH1'). Entries may be None.

    Returns:
    - A list with one item per input entry, in the same order: an (x, y, z)
      tuple of floats, or None when the entry is None or no such atom was found.
    """
    # Set lookup keeps the per-line membership test O(1).
    wanted = {pair for pair in res_nums_and_atom_types if pair is not None}

    found_coords_map = {}
    with open(path, 'r') as pdb_file:
        for line in pdb_file:
            if not line.startswith("ATOM"):
                continue
            # Fixed-width PDB fields (0-based slices): atom name in columns
            # 13-16, residue sequence number in columns 23-26. Reading all four
            # name columns is required for names such as 'NH1' or 'OD1'.
            key = (int(line[22:26]), line[12:16].strip())
            if key in wanted:
                # x, y, z occupy columns 31-38, 39-46 and 47-54.
                found_coords_map[key] = (
                    float(line[30:38]),
                    float(line[38:46]),
                    float(line[46:54]),
                )

    # Report results in the order (and with the length) of the request list.
    return [found_coords_map.get(pair) for pair in res_nums_and_atom_types]


def compute_distance(coord1, coord2):
    """
    Compute the Euclidean distance between two points, rounded to 3 decimals.

    Parameters:
    - coord1: Coordinates (x, y, z) of the first point, or None.
    - coord2: Coordinates (x, y, z) of the second point, or None.

    Returns:
    The Euclidean distance between the two points rounded to three decimal
    places, or None if either point is None.
    """
    # Identity test rather than `== None`: equality may be overloaded by the
    # coordinate type (e.g. element-wise comparison on array-like objects).
    if coord1 is None or coord2 is None:
        return None
    squared_sum = sum((c1 - c2) ** 2 for c1, c2 in zip(coord1, coord2))
    return round(math.sqrt(squared_sum), 3)


def write_csv(lines, csv_path):
    """
    Dump rows to a CSV file, replacing any existing file at that path.

    Args:
    - lines: Nested list; each sublist becomes one row of the CSV.
    - csv_path: Path of the CSV file to create.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_path, 'w', newline='') as handle:
        csv.writer(handle).writerows(lines)


def process_file(pdb_file, path, res_info, pairs_flat):
    """
    Compute the requested distances for one PDB file.

    Takes a single file per call so it can be used as the worker function of a
    process pool.

    Args:
    - pdb_file: File name of the PDB file inside `path`. The name minus its last
      9 characters is used as the GPCR name, and the 4 characters before the
      '.pdb' extension as the PDB code.
    - path: Directory containing the file, with or without a trailing separator.
    - res_info: Dict mapping GPCR name to the residue info passed to
      find_matching_values.
    - pairs_flat: Flat list of generic residue specs, read two at a time: entries
      0 and 1 form the first pair, 2 and 3 the second, and so on. A trailing
      unpaired entry is ignored.

    Returns:
    - A list [GPCR name, PDB code, distance_1, distance_2, ...] with one distance
      per pair, as returned by compute_distance.
    """
    gpcr_name = pdb_file[:-9]
    pdb_code = pdb_file[:-4][-4:]

    targets = find_matching_values(res_info[gpcr_name], pairs_flat)
    # os.path.join instead of string concatenation, so a `path` without a
    # trailing separator still resolves to the right file.
    coords = find_coordinates_once(os.path.join(path, pdb_file), targets)

    dists = [gpcr_name, pdb_code]
    # Even-indexed coordinates are the first atom of each pair, odd-indexed the second.
    for first, second in zip(coords[0::2], coords[1::2]):
        dists.append(compute_distance(first, second))
    return dists


def get_distances(path, pairs, csv_name):
    """
    Measure residue-pair distances in every PDB file of a directory and save them as CSV.

    Args:
    - path: Directory holding the '.pdb' files; the CSV is written there too.
    - pairs: Sequence of pairs of generic residue specs, e.g.
      [('3.50', '6.30'), ('2.46CZ', '7.53')]. Only the first two items of each
      pair are used.
    - csv_name: Name of the output file without the '.csv' extension.

    The CSV has a header row 'GPCR name', 'PDB code', then one '<a>-<b>' column
    per pair, followed by one row per PDB file. Prints 'done!' when finished.
    """
    filenames, gpcr_names = collect_pdb_filenames(path)
    # Several files can belong to the same GPCR; look each one up only once.
    res_info = parallel_get_res_info(set(gpcr_names))

    # Flatten using exactly the two items that also form the column header, so
    # the distance columns always line up with the header.
    pairs_flat = [res for pair in pairs for res in (pair[0], pair[1])]
    header = ['GPCR name', 'PDB code'] + [f'{pair[0]}-{pair[1]}' for pair in pairs]

    # Pre-fill the arguments shared by all files; the pool supplies the file name.
    process_file_partial = functools.partial(
        process_file, path=path, res_info=res_info, pairs_flat=pairs_flat
    )
    with Pool(processes=cpu_count()) as pool:
        results = pool.map(process_file_partial, filenames)

    # The original referenced an undefined name here (`paht`), which raised
    # NameError after all the work was done.
    write_csv([header] + results, os.path.join(path, csv_name + '.csv'))
    print('done!')