In [53]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the data
file_path = 'gene_files/Genes_relation.data'

# Import the data and specify the column names
column_names = ['GeneID', 'Essential', 'Class', 'Complex', 'Phenotype', 'Motif', 'Chromosome', 'Function', 'Localization']
df = pd.read_csv(file_path, names=column_names, header=0, na_values='?')

# Print the first 5 rows of the data
df.head()



Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Function,Localization
0,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00824,1.0,PROTEIN SYNTHESIS,cytoplasm
1,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,CELLULAR ORGANIZATION (proteins are localized ...,cytoplasm
2,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,PROTEIN SYNTHESIS,cytoplasm
3,G234065,Non-Essential,ATPases,,,,1.0,"CELL RESCUE, DEFENSE, CELL DEATH AND AGEING",cytoplasm
4,G234065,Non-Essential,ATPases,,,,1.0,CELLULAR ORGANIZATION (proteins are localized ...,cytoplasm


Before running any analysis we will run pre-processing on the data to make sure there are no duplicates and provide some normalization

In [54]:
print("Number of duplicated rows: {}.".format(df.duplicated().sum()))

Number of duplicated rows: 0.


There are no duplicates but there are many missing values

In [55]:
# Use the isna method to identify missing values
missing = df.isna()

# Count the number of missing values in each column
missing_counts = missing.sum()

# Print the number of missing values in each column
print(missing_counts[missing_counts > 0])

Essential      133
Class         2657
Complex       1890
Phenotype     1064
Motif         2239
Chromosome       2
dtype: int64


We will do Label encoding to allow for better normalization of the dataset

In [56]:
label_encoder = LabelEncoder()

# Encode the labels
df['GeneID'] = label_encoder.fit_transform(df['GeneID'])
df['Essential'] = label_encoder.fit_transform(df['Essential'])
df['Class'] = label_encoder.fit_transform(df['Class'])
df['Complex'] = label_encoder.fit_transform(df['Complex'])
df['Phenotype'] = label_encoder.fit_transform(df['Phenotype'])
df['Motif'] = label_encoder.fit_transform(df['Motif'])
df['Chromosome'] = label_encoder.fit_transform(df['Chromosome'])
df['Function'] = label_encoder.fit_transform(df['Function'])
df['Localization'] = label_encoder.fit_transform(df['Localization'])


# Print the first 5 rows of the data
df.head()

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Function,Localization
0,0,1,6,46,12,175,0,10,2
1,0,1,6,46,12,176,0,4,2
2,0,1,6,46,12,176,0,10,2
3,1,2,0,51,12,235,0,1,2
4,1,2,0,51,12,235,0,4,2


In [57]:
# Normalize the data using Min-Max normalization
df_norm = (df - df.min()) / (df.max() - df.min())

# The normalized data is stored in the df_norm dataframe
df_norm.head()

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Function,Localization
0,0.0,0.333333,0.26087,0.901961,1.0,0.744681,0.0,0.833333,0.142857
1,0.0,0.333333,0.26087,0.901961,1.0,0.748936,0.0,0.333333,0.142857
2,0.0,0.333333,0.26087,0.901961,1.0,0.748936,0.0,0.833333,0.142857
3,0.001161,0.666667,0.0,1.0,1.0,1.0,0.0,0.083333,0.142857
4,0.001161,0.666667,0.0,1.0,1.0,1.0,0.0,0.333333,0.142857


In [58]:
df_norm.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GeneID,4345.0,0.442877,0.288978,0.0,0.184669,0.414634,0.66899,1.0
Essential,4345.0,0.567242,0.17934,0.0,0.333333,0.666667,0.666667,1.0
Class,4345.0,0.797158,0.314503,0.0,0.695652,1.0,1.0,1.0
Complex,4345.0,0.759476,0.295526,0.0,0.529412,0.882353,1.0,1.0
Phenotype,4345.0,0.512294,0.336698,0.0,0.25,0.416667,0.916667,1.0
Motif,4345.0,0.689717,0.380467,0.0,0.306383,1.0,1.0,1.0
Chromosome,4345.0,0.489744,0.288273,0.0,0.1875,0.5,0.75,1.0
Function,4345.0,0.417031,0.294864,0.0,0.25,0.333333,0.666667,1.0
Localization,4345.0,0.509683,0.297474,0.0,0.142857,0.714286,0.714286,1.0


We will now split the dataset into a training set and a test set with a 20% hold out

In [59]:
# Split the data into a training set and a validation set
train_data, val_data, train_labels, val_labels = train_test_split(df_norm.drop('Localization', axis=1), df_norm['Localization'], test_size=0.2)

# Print the number of rows in each set
print("Training set: {} samples".format(train_data.shape[0]))
print("Validation set: {} samples".format(val_data.shape[0]))

Training set: 3476 samples
Validation set: 869 samples
