In [1]:
# Notebook for running the project on Google Colab

1- Add the data to your Google Drive and link it to this notebook by running:

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# The Julia cells further down read and write files under
# /content/gdrive/MyDrive/..., so this cell has to run first.
from google.colab import drive
drive.mount('/content/gdrive')

2- Installation of Julia

In [1]:
%%capture
%%shell
# Install Julia 1.7.2 under /usr/local, but only when no `julia` binary is
# already on PATH. Both stdout and stderr of the probe are discarded.
if ! command -v julia > /dev/null 2>&1
then
    wget -q 'https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz' \
        -O /tmp/julia.tar.gz
    # --strip-components 1 drops the top-level julia-1.7.2/ directory so that
    # bin/, lib/, ... land directly in /usr/local.
    tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1
    rm /tmp/julia.tar.gz
fi
# Add the IJulia package (the Jupyter kernel for Julia) and precompile it.
julia -e 'using Pkg; pkg"add IJulia; precompile;"'
echo 'Done'

You need to *save* the Julia runtime setting:
- Edit
- Notebook settings
- Verify that runtime type = julia 1.7
- Save


3- Download Packages

In [None]:
using Pkg;

In [None]:
# Install every package the notebook loads in a single `Pkg.add` call, so the
# environment is resolved once instead of once per package.
import Pkg
Pkg.add([
    "DataFrames",
    "MLJ",
    "MLJLinearModels",
    "CSV",
    "Random",
    "CUDA",
    "MLJMultivariateStatsInterface",
    "Flux",
    "MLJFlux",
    "ComputationalResources",
])

In [None]:
using Random, Statistics
using CSV, CUDA, ComputationalResources, DataFrames, Flux, MLJ, MLJFlux, MLJLinearModels,
    MLJMultivariateStatsInterface

4- Add your functions

In [None]:
"""
    fit_and_evaluate(training_data, validation_data, test_data, validation_test)

Fit an L1-penalised `MultinomialClassifier` (`lambda = 1e-7`) and measure its
misclassification rate on the training and test sets.

# Arguments
- `training_data`: training features.
- `validation_data`: training labels.
- `test_data`: test features.
- `validation_test`: test labels.

# Returns
- `mach`: the fitted machine.
- a one-row `DataFrame` with columns `trainin_error` and `test_error`, each the
  fraction of predictions that differ from the corresponding labels.
"""
function fit_and_evaluate(training_data, validation_data, test_data, validation_test)
    classifier = MultinomialClassifier(penalty = :l1, lambda = 1e-7)
    # Unpenalised alternative: MultinomialClassifier(penalty = :none)
    mach = fit!(gpu(machine(classifier, training_data, validation_data)))

    train_pred = predict_mode(mach, training_data)
    test_pred = predict_mode(mach, test_data)

    errors = DataFrame(
        trainin_error = mean(train_pred .!= validation_data),
        test_error = mean(test_pred .!= validation_test),
    )
    return mach, errors
end

"""
    data_split(data, y, idx_train, idx_test; shuffle = true)

Split `data` and its labels `y` into a training part and a test part.

# Arguments
- `data`: table or matrix holding one observation per row.
- `y`: labels; only the first column is read.
- `idx_train`: positions (within the row ordering) that form the training set.
- `idx_test`: positions (within the row ordering) that form the test set.

# Keywords
- `shuffle`: when `true` the row ordering is a random permutation drawn with
  `randperm`; when `false` the original order `1:size(data, 1)` is used.

# Returns
A named tuple `(train, train_valid, test, test_valid)` holding the training
rows, training labels, test rows and test labels.
"""
function data_split(data, y, idx_train, idx_test; shuffle = true)
    n_rows = size(data, 1)
    order = shuffle ? randperm(n_rows) : 1:n_rows

    train_rows = order[idx_train]
    test_rows = order[idx_test]

    return (
        train = data[train_rows, :],
        train_valid = y[train_rows, 1],
        test = data[test_rows, :],
        test_valid = y[test_rows, 1],
    )
end

"""
    multinom_class(x_train, x_test, y)

Fit an L2-penalised `MultinomialClassifier` (`lambda = 1e-3`) on `x_train`/`y`,
predict the classes of `x_test`, print them, and hand them to `kaggle_submit`
under the title `"RidgeClassifier_pca_8000"`.

# Arguments
- `x_train`: training features (without labels).
- `x_test`: test features to predict.
- `y`: training labels.

# Returns
The fitted machine.
"""
function multinom_class(x_train, x_test, y)
    ridge = MultinomialClassifier(penalty = :l2, lambda = 1e-3)
    mach = fit!(gpu(machine(ridge, x_train, y)))

    pred = predict_mode(mach, x_test)
    println(pred)
    kaggle_submit(pred, "RidgeClassifier_pca_8000")

    return mach
end


"""
    lasso_classifier(x_train, x_test, y; seed = 0, goal, lower, upper)

Tune the `lambda` of an L1-penalised `MultinomialClassifier` with 5-fold
cross-validation, predict `x_test` with the tuned machine and pass the
predictions to `kaggle_submit` under the title `"LassoClassifier_6_12"`.

# Arguments
- `x_train`: training features (without labels).
- `x_test`: test features to predict.
- `y`: training labels.

# Keywords
- `seed`: value passed to `Random.seed!` before tuning.
- `goal`: forwarded to `Grid(goal = goal)` (size of the search grid).
- `lower`: smallest `lambda` of the search range.
- `upper`: largest `lambda` of the search range.

# Returns
The fitted tuning machine.
"""
function lasso_classifier(x_train, x_test, y; seed = 0, goal, lower, upper)
    Random.seed!(seed)

    lasso = MultinomialClassifier(penalty = :l1)
    # Candidate lambdas are spread on a log10 scale between `lower` and `upper`.
    lambda_range = range(lasso, :lambda; lower = lower, upper = upper, scale = :log10)
    tuned_lasso = TunedModel(;
        model = lasso,
        resampling = CV(nfolds = 5),
        tuning = Grid(goal = goal),
        range = lambda_range,
        measure = MisclassificationRate(),
    )

    mach_lasso = fit!(gpu(machine(tuned_lasso, x_train, y)))
    pred = predict_mode(mach_lasso, x_test)
    kaggle_submit(pred, "LassoClassifier_6_12")
    return mach_lasso
end

"""
    ridge_classifier(x_train, x_test, y; seed = 0, goal, lower, upper)

Tune the `lambda` of an L2-penalised `MultinomialClassifier` with 5-fold
cross-validation, predict `x_test` with the tuned machine and pass the
predictions to `kaggle_submit` under the title `"RidgeClassifier_30_11"`.

# Arguments
- `x_train`: training features (without labels).
- `x_test`: test features to predict.
- `y`: training labels.

# Keywords
- `seed`: value passed to `Random.seed!` before tuning.
- `goal`: forwarded to `Grid(goal = goal)` (size of the search grid).
- `lower`: smallest `lambda` of the search range.
- `upper`: largest `lambda` of the search range.

# Returns
The fitted tuning machine.
"""
function ridge_classifier(x_train, x_test, y; seed = 0, goal, lower, upper)
    Random.seed!(seed)

    ridge = MultinomialClassifier(penalty = :l2)
    mach_ridge = machine(
        TunedModel(;
            model = ridge,
            resampling = CV(nfolds = 5),
            tuning = Grid(goal = goal),
            # log10-spaced candidates between `lower` and `upper`
            range = range(ridge, :lambda; lower = lower, upper = upper, scale = :log10),
            measure = MisclassificationRate(),
        ),
        x_train,
        y,
    )
    fit!(gpu(mach_ridge))

    pred = predict_mode(mach_ridge, x_test)
    kaggle_submit(pred, "RidgeClassifier_30_11")
    return mach_ridge
end

In [None]:
"""
    load_data(path)

Read the CSV file located at `path` and return its content as a `DataFrame`.

# Arguments
- `path`: location of the CSV file to read.
"""
function load_data(path)
    table = CSV.File(path)
    return DataFrame(table)
end

"""
    remove_constant_predictors(df)

Return a copy of `df` without its constant columns, i.e. without the columns
whose standard deviation equals zero.

# Arguments
- `df`: data to clean.
"""
function remove_constant_predictors(df)
    varies = Bool[std(column) != 0 for column in eachcol(df)]
    return df[:, varies]
end

"""
    remove_prop_predictors(df)

Return a copy of `df` in which, for every pair of columns whose Pearson
correlation is approximately `1`, the column with the larger index is dropped.

Only correlations close to `+1` are matched; columns correlated at `-1` are
kept.

# Arguments
- `df`: data to clean.
"""
function remove_prop_predictors(df)
    correlations = cor(Matrix(df))
    # Keep only the strict lower triangle so that the diagonal (always 1) is
    # ignored and each correlated pair is reported once, through its later column.
    pairs_at_one = filter(I -> I[1] > I[2], findall(≈(1), correlations))
    redundant = unique(getindex.(pairs_at_one, 1))
    return df[:, Not(redundant)]
end

"""
    remove_low_call_rates(df)

Return `df` restricted to the columns whose call rate is above 1 %. The call
rate of a column is the percentage of its entries that are strictly positive.

When the `PlotlyJS` module is loaded in the session, a histogram of the
retained call rates is displayed as well; otherwise plotting is skipped.

# Arguments
- `df`: data to clean.
"""
function remove_low_call_rates(df)
    n_obs = nrow(df)
    rates = Float64[(sum(x -> x > 0, column) / n_obs) * 100 for column in eachcol(df)]

    kept = DataFrame(index = names(df), rates = rates)
    kept = kept[kept.rates .> 1, :]
    df_clean = select(df, kept.index)

    # PlotlyJS is not among the packages this notebook installs, so the
    # histogram is only drawn when the user has loaded it.
    if @isdefined(PlotlyJS)
        display(
            PlotlyJS.plot(
                kept,
                PlotlyJS.Layout(
                    title_text = "Histogram of call-rates",
                    xaxis_title_text = "Pourcentage of call rates in all gene",
                    yaxis_title_text = "Number of gene",
                );
                x = :rates,
                kind = "histogram",
                nbinsx = 20,
            ),
        )
    end
    return df_clean
end

"""
    coef_info(beta_df, x_train)

Transpose `beta_df` (its first column supplies the new column names) and append
three columns to the result:

- `stds`: standard deviation of each column of `x_train`;
- `t_value`: second column of the transposed table divided by `stds`;
- `abs_t`: absolute value of `t_value`.

# Returns
The transposed table with the three extra columns.
"""
function coef_info(beta_df, x_train)
    tbl = permutedims(beta_df, 1)
    tbl[!, :stds] = std.(eachcol(x_train))
    tbl[!, :t_value] = tbl[:, 2] ./ tbl.stds
    tbl[!, :abs_t] = abs.(tbl.t_value)
    return tbl
end

"""
    get_names_len(X, y, len)

Rank the predictors of `X` with an unpenalised `MultinomialClassifier` and
return the names of the `len` highest-ranked ones.

For every class, `coef_info` is applied to that class's row of fitted
coefficients and its `abs_t` column is kept. A predictor's score is the maximum
of these values over all classes.

# Arguments
- `X`: training features.
- `y`: training labels.
- `len`: number of predictor names to return.
"""
function get_names_len(X, y, len)
    mach = machine(MultinomialClassifier(penalty = :none), X, y)
    fit!(mach, verbosity = 0)
    params = fitted_params(mach)

    # One row per class: the class label followed by one coefficient per predictor.
    df = hcat(DataFrame(titles = levels(params.classes)), DataFrame(params.coefs))

    info = DataFrame(genes = names(df[:, 2:end]))
    # Iterate over however many classes were fitted instead of assuming three.
    for i in 1:nrow(df)
        info = hcat(info, coef_info(DataFrame(df[i, :]), X).abs_t, makeunique = true)
    end

    # After transposing, every predictor is a column holding one score per class.
    info = permutedims(info, 1)
    maxs = DataFrame(
        genes = names(info[:, 2:end]),
        maxs = maximum.(eachcol(info[:, 2:end])),
    )
    sort!(maxs, :maxs, rev = true)
    return maxs[1:len, :].genes
end

"""
    norm(x_train, x_test)

Standardise `x_train` and `x_test` together: a `Standardizer` is fitted on the
vertical concatenation of both tables, applied to it, and the result is split
back at the number of rows of `x_train`.

# Arguments
- `x_train`: training features (without labels).
- `x_test`: test features.

# Returns
A tuple `(normalised_train, normalised_test)`.
"""
function norm(x_train, x_test)
    # Split at the real training size instead of a hard-coded 5000 rows.
    n_train = size(x_train, 1)
    total_data = vcat(x_train, x_test)
    mach = fit!(machine(Standardizer(), total_data))
    norm_data = MLJ.transform(mach, total_data)
    return norm_data[1:n_train, :], norm_data[(n_train + 1):end, :]
end

"""
    clean_data(train_df, test_df; normalised = false, from_index = true)

Select the predictors to keep in the training and test tables and extract the
training labels.

# Arguments
- `train_df`: training table, including the `labels` column. Its `labels`
  column is coerced to `Multiclass` in place.
- `test_df`: test table.

# Keywords
- `normalised`: when `true`, both tables are passed through `norm`.
- `from_index`: when `true`, the names of the columns to keep are read from a
  previously saved CSV file (faster). When `false`, they are recomputed with
  `remove_constant_predictors`, `remove_prop_predictors` and
  `remove_low_call_rates`, and the resulting names are written to
  `./data/indexes.csv`.

# Returns
A tuple `(x_train, x_test, y)`: cleaned training features, cleaned test
features and training labels.
"""
function clean_data(train_df, test_df; normalised = false, from_index = true)
    if from_index
        # Column names computed by an earlier run of the slow path below.
        kept = load_data("/content/gdrive/MyDrive/Colab Notebooks/MLProject/data/indexes_old.csv")
        x_train = select(train_df, kept.index)
        x_test = select(test_df, kept.index)
    else
        # Each filter is applied to one table and then to the other, and the
        # surviving names are re-selected so both tables keep identical columns.
        x_train = remove_constant_predictors(select(train_df, Not(:labels)))
        x_test = remove_constant_predictors(select(test_df, names(x_train)))
        x_train = select(train_df, names(x_test))

        x_test = remove_prop_predictors(x_test)
        x_train = remove_prop_predictors(select(train_df, names(x_test)))
        x_test = select(test_df, names(x_train))

        x_test = remove_low_call_rates(x_test)
        x_train = remove_low_call_rates(select(train_df, names(x_test)))
        x_test = select(test_df, names(x_train))
    end

    y = coerce!(train_df, :labels => Multiclass).labels

    if normalised
        x_train, x_test = norm(x_train, x_test)
    end

    if !from_index
        CSV.write("./data/indexes.csv", DataFrame(index = names(x_train)))
    end

    return x_train, x_test, y
end

"""
    kaggle_submit(df_prediction, title)

Write a Kaggle submission file with two columns: `id` (1, 2, ... in the order
of `df_prediction`) and `prediction`. The file name is `Submission_<title>.csv`
inside the project's `Submission` folder on the mounted drive.

# Arguments
- `df_prediction`: predictions for the test set.
- `title`: suffix used in the name of the file to save.
"""
function kaggle_submit(df_prediction, title)
    submission = DataFrame(id = collect(1:length(df_prediction)))
    submission[!, :prediction] = df_prediction

    out_path = "/content/gdrive/MyDrive/Colab Notebooks/MLProject/Submission/Submission_$(title).csv"
    return CSV.write(out_path, submission)
end

"""
    pca(df, dimension)

Fit a `PCA` model with `maxoutdim = dimension` on `df` and return `df`
projected onto the fitted components.

# Arguments
- `df`: data on which to run the PCA.
- `dimension`: maximum number of output dimensions.
"""
function pca(df, dimension)
    reducer = machine(PCA(maxoutdim = dimension), df)
    fit!(reducer)
    return MLJ.transform(reducer, df)
end

"""
    pca_cumvar_plot(training_data)

Fit a `PCA` with default settings on `training_data` and return the
`principalvars` entry of its report.

NOTE(review): despite its name, this function neither plots nor accumulates
anything. The original also computed `principalvars ./ tvar` without using it,
so confirm which quantity callers actually expect.
"""
function pca_cumvar_plot(training_data)
    pca_gene = fit!(machine(PCA(), training_data), verbosity = 0)
    return report(pca_gene).principalvars
end

"""
    call_rates(df, pourcent)

Return the names of the columns of `df` whose call rate is strictly above
`pourcent`. The call rate of a column is the percentage of its entries that
are strictly positive.

# Arguments
- `df`: data to inspect.
- `pourcent`: call-rate threshold, in percent.
"""
function call_rates(df, pourcent)
    n_obs = size(df, 1)
    rates = Float64[(sum(x -> x > 0, column) / n_obs) * 100 for column in eachcol(df)]
    table = DataFrame(index = names(df), rates = rates)
    return table.index[table.rates .> pourcent]
end

5- Load data

In [None]:
# Read the gzipped train and test tables from the mounted drive; each table is
# passed through `gpu` (intended to make use of the Colab GPU).
train_df = gpu(DataFrame(CSV.File("/content/gdrive/MyDrive/Colab Notebooks/MLProject/data/train.csv.gz")))
test_df = gpu(DataFrame(CSV.File("/content/gdrive/MyDrive/Colab Notebooks/MLProject/data/test.csv.gz")))

6- Run your code

In [None]:
# Clean the data: keep the pre-computed column selection and standardise.
x_train, x_test, y = gpu(clean_data(train_df, test_df; normalised = true, from_index = true))

In [None]:
# Mean-difference selection followed by a joint PCA of the train and test data.

# Mean of every gene within each of the three classes.
mean_CBP = mean.(eachcol(x_train[(y .== "CBP"), :]))
mean_KAT5 = mean.(eachcol(x_train[(y .== "KAT5"), :]))
mean_eGFP = mean.(eachcol(x_train[(y .== "eGFP"), :]))

# Absolute gap between the class means, for each of the three class pairs.
results_mean = DataFrame(
    gene = names(x_train),
    CBP = mean_CBP,
    KAT5 = mean_KAT5,
    eGFP = mean_eGFP,
    diff1 = abs.(mean_CBP - mean_eGFP),
    diff2 = abs.(mean_eGFP - mean_KAT5),
    diff3 = abs.(mean_CBP - mean_KAT5),
)

# For each class pair keep the (at most) 6000 genes with the largest gap.
# `first` does not fail when fewer than 6000 genes are available.
sort!(results_mean, [:diff1], rev = true)
selection1 = first(results_mean, 6000)
sort!(results_mean, [:diff2], rev = true)
selection2 = first(results_mean, 6000)
sort!(results_mean, [:diff3], rev = true)
selection3 = first(results_mean, 6000)

x_train2 = select(x_train, unique([selection1.gene; selection2.gene; selection3.gene]))

Random.seed!(0)

# No PCA is applied to x_train2 alone here: it would replace the gene columns
# with component columns, and the selection of the same genes in x_test below
# would then fail.
x_test = select(x_test, names(x_train2))
data = vcat(x_train2, x_test)
# Do a PCA to reduce the features to 3000
data = MLJ.transform(fit!(machine(PCA(maxoutdim = 3000), data)), data)

In [None]:
# Split the jointly transformed table back into train and test rows. The train
# rows come first in `data` and there is one label in `y` per train row, so the
# boundary is length(y); the test part starts on the row right after it.
n_train = length(y)
x_train2 = data[1:n_train, :]
x_test = data[(n_train + 1):end, :]

In [None]:
# Neural-network classifier with one hidden layer of 128 units (relu, 50 %
# dropout), trained with ADAM for 2000 epochs on mini-batches of 128 rows.
model = NeuralNetworkClassifier(;
    builder = MLJFlux.Short(; n_hidden = 128, σ = relu, dropout = 0.5),
    optimiser = ADAM(),
    batch_size = 128,
    epochs = 2000,
    alpha = 0.25,
)

mach = machine(model, x_train2, y)
fit!(mach; verbosity = 1)

In [None]:
# Predict a class for every row of x_test with the fitted machine, then write
# the predictions to a submission file through kaggle_submit (title "NN_mean").
pred = predict_mode(mach, x_test)
kaggle_submit(pred, "NN_mean")