In [0]:
from google.colab import drive
# Mount Google Drive into the Colab runtime so that the data files under
# /content/gdrive are reachable; this prompts for an authorization code.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# Combining data

Here, we will combine all seven of our input files into a single data frame.

In [0]:
import pandas as pd
import gc
import os

# Directory on the mounted Google Drive that holds the raw per-sample CSV files.
DATA_DIR = '/content/gdrive/My Drive/ip_files/data'

# Names of the seven input CSV files. The order matters: combine_files
# numbers the classes by position in this list (first file -> class 1, ...).
INPUT_FILE_NAMES = [
    '061_HEK293T_human_embryonic_kidney_csv.csv',
    '065_HEK293T_human_embryonic_kidney_csv.csv',
    '066_HEK293T_human_embryonic_kidney_csv.csv',
    '067_HEK293T_human_embryonic_kidney_csv.csv',
    '068_HEK293T_human_embryonic_kidney_csv.csv',
    '073_HEK293T-human_embryonic_kidney_matcsv.csv',
    '074_HEK293T-human_embryonic_kidney_csv.csv',
]

# Full paths, built from the single DATA_DIR constant so the data location
# only has to be changed in one place.
input_files = ['{}/{}'.format(DATA_DIR, file_name) for file_name in INPUT_FILE_NAMES]

Define a function for transposing the data.

Because of the transposition, we also need to set new column names: the values of the original first column become the column names.

In [0]:
def transpose_and_set_column_names(df):
    """
    Flip the frame so that rows become columns, then promote the first row
    of the flipped frame to the header.

    After the transpose, row 0 holds the values of the original first
    column. Those values become the new column names and the row itself is
    dropped from the data.

    :param df: pandas.core.frame
    :return df: pandas.core.frame
    """

    flipped = df.transpose()

    # row 0 of the flipped frame carries the future column labels
    header = flipped.iloc[0]
    body = flipped.iloc[1:]
    body.columns = header

    return body

Function for combining the files.

In [0]:
import gc

def combine_files(files):
    """
    Read every CSV file, transpose it, label it and stack the results.

    Each file is read with pandas, passed through
    transpose_and_set_column_names, and given a 'class' column holding the
    1-based position of the file in `files` (first file -> 1, second -> 2,
    and so on). The labelled frames are then concatenated row-wise into a
    single frame with a fresh integer index.

    :param files: list of paths; any non-zero number of files is accepted
    :return: pandas.core.frame
    :raises ValueError: if `files` is empty
    """
    files = list(files)
    if not files:
        raise ValueError('combine_files needs at least one input file')

    labelled_frames = []
    for class_number, path in enumerate(files, start=1):
        frame = pd.read_csv(path, index_col=False)
        frame = transpose_and_set_column_names(frame)

        # the class label is the position of the file in the input list
        frame['class'] = class_number
        labelled_frames.append(frame)

    # release any unreachable intermediates before the concat allocates
    # the combined frame
    gc.collect()

    return pd.concat(labelled_frames, axis=0, ignore_index=True)

This function deletes the columns that contain only zeros.

In [0]:
def filter_zeros(df):
    """
    Keep only the columns that hold at least one value different from zero.

    :param df: pandas.core.frame
    :return: filtered pandas.core.frame
    """
    # True wherever a cell differs from zero
    differs_from_zero = df.ne(0)

    # a column survives if any of its cells differs from zero
    columns_to_keep = differs_from_zero.any(axis=0)

    return df.loc[:, columns_to_keep]


Here we run the combining step and check the shape of the result.

In [0]:
# Build the combined frame from all input files and print its
# (rows, columns) shape.
df = combine_files(input_files)
print(df.shape)

(17079, 31222)


Now we will delete the columns in which every value is zero.

In [0]:
# Drop the columns that contain nothing but zeros and print the new shape.
# NOTE: `df` is rebound here, so the unfiltered frame is no longer available.
df = filter_zeros(df)
print(df.shape)

(17079, 22251)


Save the data frame to a CSV file.

In [0]:
# Persist the combined, zero-filtered frame; index=False keeps the row index
# out of the CSV file.
df.to_csv('/content/gdrive/My Drive/ip_files/data/combined_data.csv', index=False)

# Outliers

Here we will filter outliers out of our data, using the Local Outlier Factor (LOF) algorithm.

In [0]:
from sklearn.neighbors import LocalOutlierFactor

def remove_outliers(d_frame, n_neighbors=5, outlier_factor=1.8):
    """
    Split a data frame into inliers and outliers with Local Outlier Factor.

    A LocalOutlierFactor model is fitted on every column of d_frame, so any
    label column that is left in the frame takes part in the neighbour
    search as well. A row counts as an outlier when its
    negative_outlier_factor_ is below -outlier_factor.

    :param d_frame: pandas.core.frame
    :param n_neighbors: number of neighbours passed to LocalOutlierFactor
    :param outlier_factor: cut-off; rows whose negative outlier factor is
        smaller than -outlier_factor are reported as outliers
    :return: tuple (cluster_df, outlier_df): the rows kept as inliers and
        the rows detected as outliers
    """

    lof = LocalOutlierFactor(n_neighbors=n_neighbors)
    lof.fit(d_frame)
    lof_factor = lof.negative_outlier_factor_

    # the scores are negative: the more negative, the more outlying the row
    is_inlier = lof_factor >= -outlier_factor
    is_outlier = lof_factor < -outlier_factor

    cluster_df = d_frame[is_inlier]
    outlier_df = d_frame[is_outlier]

    return cluster_df, outlier_df


Read the combined data file.

In [0]:
import pandas as pd

# Reload the combined data from the CSV file; index_col=False tells pandas
# not to use the first column as the index.
df = pd.read_csv('/content/gdrive/My Drive/ip_files/data/combined_data.csv', index_col=False)
print(df.shape)

(17079, 22251)


In [0]:
# Split the frame into the rows kept as inliers (`df`) and the detected
# outliers, then print both shapes.
# NOTE: `df` is overwritten here, so re-running this cell applies the outlier
# filter to the already-filtered frame.
df, outliers = remove_outliers(df)
print(df.shape)
print(outliers.shape)



(16997, 22251)
(82, 22251)


Save the cleaned data frame and the outliers.

In [0]:
# Write the inlier rows and the outlier rows to separate CSV files, without
# the row index.
df.to_csv('/content/gdrive/My Drive/ip_files/data/data_without_outliers.csv', index=False)
outliers.to_csv('/content/gdrive/My Drive/ip_files/data/outliers.csv', index=False)

## change class values

The class values had to be numeric for the Local Outlier Factor step, but now we will convert them to string labels like this:
- 1 -> class1
- 2 -> class2
- etc.

In [0]:
import pandas as pd

# Reload the inlier and outlier frames from their CSV files.
df = pd.read_csv('/content/gdrive/My Drive/ip_files/data/data_without_outliers.csv', index_col=False)
outliers_df = pd.read_csv('/content/gdrive/My Drive/ip_files/data/outliers.csv', index_col=False)


In [0]:

# Map the numeric class ids 1..7 to the labels 'class1'..'class7'.
class_labels = {class_id: 'class{}'.format(class_id) for class_id in range(1, 8)}

# Replace the whole 'class' column in one assignment, for both frames.
# Assigning the column at once avoids writing strings element-wise into an
# integer column, which relies on implicit dtype upcasting that newer pandas
# versions warn about or reject.
# Values that are not in the mapping (including labels that were already
# converted) are left untouched, so this cell can be re-run safely.
for frame in (df, outliers_df):
    frame['class'] = frame['class'].map(lambda value: class_labels.get(value, value))


In [0]:
# Write both frames back to their CSV files (the existing files are
# overwritten), without the row index.
df.to_csv('/content/gdrive/My Drive/ip_files/data/data_without_outliers.csv', index=False)
outliers_df.to_csv('/content/gdrive/My Drive/ip_files/data/outliers.csv', index=False)

## test data and outliers


In [0]:
import pandas as pd

# Reload the data without outliers and report its (rows, columns) shape.
df = pd.read_csv('/content/gdrive/My Drive/ip_files/data/data_without_outliers.csv', index_col=False)
print('data dataframe dimensions: {}'.format(df.shape))


data dataframe dimensions: (16997, 22251)


In [0]:
# Class label of every row in the data without outliers.
classes = df['class']

In [0]:
from collections import Counter

# Count the rows per class label and print the (label, count) pairs sorted
# by label.
print('data classes count:')
print(sorted(Counter(classes).items()))

data classes count:
[('class1', 793), ('class2', 66), ('class3', 577), ('class4', 4133), ('class5', 7853), ('class6', 3060), ('class7', 515)]


In [0]:
import pandas as pd

# Reload the outlier rows and report their (rows, columns) shape.
outliers_df = pd.read_csv('/content/gdrive/My Drive/ip_files/data/outliers.csv', index_col=False)
print('outliers data frame dimensions: {}'.format(outliers_df.shape))


outliers data frame dimensions: (82, 22251)


In [0]:
# Class label of every outlier row.
outliers_classes = outliers_df['class']

In [0]:
from collections import Counter

# Count the outlier rows per class label and print the (label, count) pairs
# sorted by label.
print('outliers classes count:')
print(sorted(Counter(outliers_classes).items()))

outliers classes count:
[('class1', 7), ('class2', 1), ('class3', 4), ('class4', 31), ('class5', 22), ('class6', 6), ('class7', 11)]
