## Load Necessary Python Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport # for data definitions
# could use missingno package to visualize missing values in columns?

## Data Collection

Now I will load the p53 mutants raw data file from 2010 and inspect the dataset structure.

In [2]:
filepath = "C:/Users/Shakuntala Mitra/p53_Mutants/data/external/k8.csv"
k8_file = pd.read_csv(filepath, header=None, low_memory=False)

In [3]:
print(k8_file.head())

     0       1       2       3       4       5      6      7      8     \
0  -0.161  -0.014   0.002  -0.036  -0.033  -0.093  0.025  0.005  0.000   
1  -0.158  -0.002  -0.012  -0.025  -0.012  -0.106  0.013  0.005  0.000   
2       ?       ?       ?       ?       ?       ?      ?      ?      ?   
3  -0.169  -0.025  -0.010  -0.041  -0.045  -0.069  0.038  0.014  0.008   
4  -0.183  -0.051  -0.023  -0.077  -0.092  -0.015  0.071  0.027  0.020   

     9     ...    5400   5401   5402    5403    5404    5405   5406    5407  \
0  -0.015  ...   0.013  0.021   0.02   0.016  -0.011   0.003   0.01  -0.007   
1  -0.002  ...  -0.008  0.007  0.015  -0.008  -0.011  -0.004  0.013   0.005   
2       ?  ...       ?      ?      ?       ?       ?       ?      ?       ?   
3  -0.014  ...    0.01  0.025  0.025   0.021  -0.012   0.006  0.016  -0.018   
4  -0.019  ...   0.012   0.05  0.038   0.051  -0.015   0.017  0.027  -0.049   

       5408 5409  
0  inactive  NaN  
1  inactive  NaN  
2  inactive  NaN  
3  i

Here I will load and inspect the labels for the k8 dataset. The labels file contains the names of each mutant protein. The names of the mutant proteins inherently contain the locations of the mutations along the protein's amino acid chain. Changes in amino acids may alter the folding of the protein, and thus lead to loss of function in the p53 protein.

In [4]:
filepath2 = "C:/Users/Shakuntala Mitra/p53_Mutants/data/external/k8_labels.csv"
k8_labels = pd.read_csv(filepath2, header=None)
k8_labels.drop(index=0, inplace=True)
k8_labels.reset_index(inplace=True)
k8_labels.head()

Unnamed: 0,index,0,1
0,1,a119e,inactive
1,2,a119e_l125p,inactive
2,3,a119e_r283k_a353v,inactive
3,4,a161t,inactive
4,5,c135y,inactive


## Data Cleaning

In [5]:
k8_labels.columns

Index(['index', 0, 1], dtype='object')

In [6]:
#print(k8_instance_tags.info)
k8_labels.info

<bound method DataFrame.info of        index                  0         1
0          1              a119e  inactive
1          2        a119e_l125p  inactive
2          3  a119e_r283k_a353v  inactive
3          4              a161t  inactive
4          5              c135y  inactive
...      ...                ...       ...
16767  16768  y220c_t230c_n239y    active
16768  16769  y220c_y234f_n239l    active
16769  16770              y234c  inactive
16770  16771        y234c_a119e  inactive
16771  16772        y234f_n239l    active

[16772 rows x 3 columns]>

In [7]:
# concat k8_file with k8_labels
dfs = [k8_file, k8_labels]
k8_labelled = pd.concat(dfs, axis=1).reindex(k8_file.index)
k8_labelled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5403,5404,5405,5406,5407,5408,5409,index,0.1,1.1
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.016,-0.011,0.003,0.01,-0.007,inactive,,1,a119e,inactive
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,-0.008,-0.011,-0.004,0.013,0.005,inactive,,2,a119e_l125p,inactive
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,inactive,,3,a119e_r283k_a353v,inactive
3,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.021,-0.012,0.006,0.016,-0.018,inactive,,4,a161t,inactive
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.051,-0.015,0.017,0.027,-0.049,inactive,,5,c135y,inactive


In [8]:
#print(k8_labelled.iloc[:, [5408, 5409, 5410, 5411]])
k8_labelled = k8_labelled.drop(k8_labelled.iloc[:, [5409, 5410]], axis = 1)
#k8_labelled = k8_labelled.drop(k8_labelled.loc['index'], axis=1)
#k8_labelled.reset_index(inplace=True)

In [9]:
k8_labelled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5401,5402,5403,5404,5405,5406,5407,5408,0.1,1.1
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive,a119e,inactive
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive,a119e_l125p,inactive
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,inactive,a119e_r283k_a353v,inactive
3,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive,a161t,inactive
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive,c135y,inactive


In [12]:
# check if the target columns (5408 and 1) match each other
# if yes, drop the duplicate column
print(k8_labelled.iloc[5408] == k8_labelled.iloc[5410]) # True! can drop
k8_labelled.drop([5408],axis=1, inplace=True) #drop that column

0       False
1       False
2       False
3       False
4       False
        ...  
5406    False
5407    False
5408     True
0       False
1        True
Length: 5411, dtype: bool


In [13]:
k8_labelled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5400,5401,5402,5403,5404,5405,5406,5407,0.1,1.1
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,a119e,inactive
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,a119e_l125p,inactive
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,a119e_r283k_a353v,inactive
3,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,a161t,inactive
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,c135y,inactive


In [14]:
# change the ?s to NaN to make them easier to drop
k8_labelled = k8_labelled.replace('?', np.NaN)

In [15]:
k8_labelled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5400,5401,5402,5403,5404,5405,5406,5407,0.1,1.1
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,a119e,inactive
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,a119e_l125p,inactive
2,,,,,,,,,,,...,,,,,,,,,a119e_r283k_a353v,inactive
3,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,a161t,inactive
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,c135y,inactive


In [29]:
# check for rows that are all NaN
k8_NaNs = k8_labelled[k8_labelled.isna().any(axis=1)]
k8_NaNs.head()
# these rows can be dropped

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5400,5401,5402,5403,5404,5405,5406,5407,0.1,1.1
2,,,,,,,,,,,...,,,,,,,,,a119e_r283k_a353v,inactive
16,,,,,,,,,,,...,-0.014,-0.013,0.006,-0.035,-0.012,-0.011,0.008,0.026,c141y_d228a_n235k_n239m,inactive
187,,,,,,,,,,,...,-0.011,-0.008,0.008,-0.028,-0.011,-0.009,0.01,0.02,g245s_a161r,inactive
189,,,,,,,,,,,...,0.021,0.021,0.024,0.024,-0.01,0.007,0.009,-0.01,g245s_a161t,inactive
191,,,,,,,,,,,...,-0.024,-0.025,-0.002,-0.055,-0.011,-0.016,0.007,0.04,g245s_a161w,inactive


In [26]:
k8_NaNs.shape # this means there are 180 rows to drop

(180, 5410)

In [32]:
# Now we're going to drop the rows with NaNs using df.dropna()
k8_temp = k8_labelled.dropna()
# Make sure to reset the index after dropping the NaN rows
k8_temp = k8_temp.reset_index(drop=True)

In [33]:
k8_temp.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5400,5401,5402,5403,5404,5405,5406,5407,0.1,1.1
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,a119e,inactive
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,a119e_l125p,inactive
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,a161t,inactive
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,c135y,inactive
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,c135y_e285m,inactive


In [35]:
# Here, we're going to double check that all NaNs were dropped
k8_temp.shape

# There are only 16592 rows left! This is the original 16772 minus the 180 rows that contained NaNs

(16592, 5410)

In [36]:
# Now let's look for duplicate rows!
# strategy: check for duplicated mutant names and then check if the corresponding rows are equivalent to each other
# drop only the duplicates, not the originals
k8_dup_names = k8_labels[k8_labels.duplicated()]  
print(k8_dup_names)

# Empty DataFrame! This tells us that there are no duplicated rows in the k8_labels, so no duplicated mutant names. 


Empty DataFrame
Columns: [index, 0, 1]
Index: []


In [37]:
# Let's cross check that by checking k8_file for duplicates in the numerical values.
k8_file_dups = k8_file[k8_file.duplicated()]
print(k8_file_dups)
# Now we can see that the duplicated rows were the rows for which there were no measurements (empty rows). We already dropped all the empty rows, so there are no more duplicated rows to deal with.

      0    1    2    3    4    5    6    7    8    9     ... 5400 5401 5402  \
402      ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
1323     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
1342     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
1514     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
3193     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
3516     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
3592     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
4420     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
5259     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
6948     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
7480     ?    ?    ?    ?    ?    ?    ?    ?    ?    ?  ...    ?    ?    ?   
8107     ?    ?    ?    ?    ?    ?    ?    ?    ?  

In [38]:
# check datatypes
k8_temp.dtypes

# They're all object datatypes
# I will be leaving them as "object" because it is less memory-intensive for computations than using floats

0       object
1       object
2       object
3       object
4       object
         ...  
5405    object
5406    object
5407    object
0       object
1       object
Length: 5410, dtype: object

In [39]:
# save as a new file and then continue analysis on that file (k8_clean)
#k8_clean_file = k8_temp.to_csv(r'C:\Users\Shakuntala Mitra\p53_Mutants\data\interim\k8_clean_data.csv', index=False, header=False)


## Data Definition

In [2]:
filepath3 = r'C:\Users\Shakuntala Mitra\p53_Mutants\data\interim\k8_clean_data.csv'
df = pd.read_csv(filepath3, header=None, low_memory=False)

In [3]:
# https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/big_data.html
# "Disclaimer: This profiling report was generated using a sample of 100 rows of the original dataset."
sample = df.sample(n=100)
profile = sample.profile_report(minimal=True, sort='None', html={'style':{'full_width': True}}, progress_bar=False, title='p53 Mutants Pandas Profiling Report')

In [None]:
profile
#profile.to_file("profile.html")