In [None]:
import pandas as pd
import ast

# --- Configuration ---------------------------------------------------------
# NOTE(review): absolute, user-specific path kept unchanged so existing runs keep
# working; consider deriving it from a configurable data directory instead.
NUSTRU_CSV_PATH = '/Users/dominiquefastus/master_project/NuStru/nustruDB/NEW_ECOLI_FULL_uniprot_02_sec_struc_updated.csv'
MAX_ROWS = 120000  # number of CSV rows to read
TERMINAL_RESIDUES = 20  # size of the "start" and "end" windows, in protein residues
CODON_LENGTH = 3  # nucleotides per residue
TERMINAL_NUCLEOTIDES = TERMINAL_RESIDUES * CODON_LENGTH  # the same window in nucleotides (60)


def _slice_structure(structure, start, stop):
    """Return the entries of *structure* at insertion positions [start:stop] as a new dict."""
    return dict(list(structure.items())[start:stop])


# The `secondary_structure` column stores dict literals; parse them while reading.
nustru_data = pd.read_csv(
    NUSTRU_CSV_PATH,
    converters={'secondary_structure': ast.literal_eval},
    nrows=MAX_ROWS,
)

# Split the protein and nucleotide strings into start / between / end parts.
# The nucleotide window is derived from the residue window so the two cannot drift apart.
for column, window in (('protein_sequence', TERMINAL_RESIDUES),
                       ('nucleotide_sequence', TERMINAL_NUCLEOTIDES)):
    sequences = nustru_data[column]
    nustru_data[f'{column}_start'] = sequences.str[:window]
    nustru_data[f'{column}_between'] = sequences.str[window:-window]
    nustru_data[f'{column}_end'] = sequences.str[-window:]

# Same three-way split for the secondary structure dicts, sliced by insertion order.
# `secondary_structure` itself is left as the parsed dict.
structures = nustru_data['secondary_structure']
for suffix, start, stop in (('start', None, TERMINAL_RESIDUES),
                            ('between', TERMINAL_RESIDUES, -TERMINAL_RESIDUES),
                            ('end', -TERMINAL_RESIDUES, None)):
    nustru_data[f'secstru_sequence_{suffix}'] = structures.apply(_slice_structure, args=(start, stop))

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

def simplify_structure(struct_dict):
    """Reduce secondary structure codes to 'H', 'E' or '-' (none).

    Walks the values of ``struct_dict`` in the dict's iteration order. Codes
    'H' and 'E' are kept as they are; every other code is replaced by '-'.

    Returns:
        A list with one simplified code per entry of ``struct_dict``.
    """
    kept_codes = ('H', 'E')
    return [code if code in kept_codes else '-' for code in struct_dict.values()]


def encode_sequence(seq, segment_length=9, step=3):
    """Encode a nucleotide sequence as overlapping windows of integer codes.

    Nucleotides are mapped A->0, T->1, G->2, C->3, in upper or lower case.
    Any other symbol falls back to 0, so it is indistinguishable from 'A'.

    Args:
        seq: Iterable of single-character nucleotide symbols (typically a str).
        segment_length: Number of nucleotides per window. 9 is an arbitrary
            choice for this example.
        step: Offset between the starts of consecutive windows. The default of
            3 moves the window one codon at a time.

    Returns:
        A list of windows, each a list of ``segment_length`` ints. Only full
        windows are produced, so the result is empty when ``seq`` is shorter
        than ``segment_length``.
    """
    # Lower-case symbols are listed explicitly so they are not silently encoded as 'A'.
    nucleotide_mapping = {
        'A': 0, 'T': 1, 'G': 2, 'C': 3,
        'a': 0, 't': 1, 'g': 2, 'c': 3,
    }
    encoded = [nucleotide_mapping.get(nuc, 0) for nuc in seq]  # unknown symbols -> 0
    last_start = len(encoded) - segment_length  # start index of the last full window
    return [encoded[i:i + segment_length] for i in range(0, last_start + 1, step)]

# Prepare dataset
N_TRAINING_ROWS = 1000  # only the first rows are used for this example

X = []  # Features (encoded nucleotide segments)
y = []  # Labels (simplified secondary structure)

training_rows = nustru_data.head(N_TRAINING_ROWS)
for struct_dict, nucleotide_sequence in zip(training_rows['secondary_structure'],
                                            training_rows['nucleotide_sequence']):
    simplified_struct = simplify_structure(struct_dict)
    segments = encode_sequence(nucleotide_sequence)
    # Pair window i with structure entry i and truncate BOTH lists to the common
    # length. Truncating only the labels leaves X longer than y whenever the
    # structure dict has fewer entries than there are windows.
    # NOTE(review): presumably the i-th dict entry describes the residue encoded
    # by the first codon of window i -- verify against how the dicts are built.
    n_samples = min(len(segments), len(simplified_struct))
    X.extend(segments[:n_samples])
    y.extend(simplified_struct[:n_samples])

assert len(X) == len(y), "features and labels must stay aligned"

# Convert lists to arrays for machine learning processing
X = np.array(X)
y = np.array(y)

# Due to the simplification and limitations, let's further reduce the problem to predicting 'H' (helix) vs others
y = np.where(y == 'H', 'H', 'Other')

# Encode labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into training and testing sets
# NOTE(review): windows cut from the same sequence overlap, and this split is done
# per window, so near-duplicate samples can land in both train and test. The
# reported scores are therefore likely optimistic; a split grouped by sequence
# (e.g. GroupShuffleSplit) would avoid that.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize and train the RandomForest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

# The report is a multi-line string: print it so it renders as a table rather
# than as an escaped repr inside a tuple.
print(report)
accuracy


In [None]:
import matplotlib.pyplot as plt

# Scores transcribed from the model output
metrics = {
    'H': {'precision': 0.64, 'recall': 0.60, 'f1-score': 0.62},
    'Other': {'precision': 0.72, 'recall': 0.75, 'f1-score': 0.74},
}

# One list per score type, each ordered like `categories`
categories = [*metrics]
precision, recall, f1_score = (
    [metrics[cat][score_name] for cat in categories]
    for score_name in ('precision', 'recall', 'f1-score')
)

# One group of bars per category
n_categories = len(categories)
ind = np.arange(n_categories)  # x position of each group's centre
width = 0.25  # width of a single bar

# Three bars per group: precision left of centre, recall centred, F1 to the right
fig, ax = plt.subplots()
rects1, rects2, rects3 = [
    ax.bar(ind + offset, scores, width, label=legend_label)
    for offset, scores, legend_label in (
        (-width, precision, 'Precision'),
        (0, recall, 'Recall'),
        (width, f1_score, 'F1-Score'),
    )
]

# Axis label, title, category names on the x-axis, and the legend
ax.set_ylabel('Scores')
ax.set_title('Scores by group and metric')
ax.set_xticks(ind)
ax.set_xticklabels(categories)
ax.legend()

def autolabel(rects):
    """Write each bar's height as a text label centred just above the bar.

    Draws on the module-level ``ax``; *rects* is an iterable of bar patches
    such as the container returned by ``ax.bar``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height}',
            xy=(bar_centre_x, bar_height),
            xytext=(0, 3),  # shift the text 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

# Label the precision, recall and F1 bars with their values
for bar_group in (rects1, rects2, rects3):
    autolabel(bar_group)

fig.tight_layout()

plt.show()
