In [None]:
# Настраиваем импорты.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Input configuration.
# Both CSV files live in the same directory, so the prefix is kept in one place.
dataDir = 'data/exam_module_4/1_variant_dna_sequence_mutation_prediction/input'
trainPath = dataDir + '/train.csv'
testPath = dataDir + '/test.csv'
# Fraction of the training CSV held out as the cross-validation split.
cvFraction = 0.15
# Seed for the pseudo-random train/CV sampling. "Ceed" is a typo for "seed";
# the name is kept because the split cell refers to it.
randomCeed = 777

In [None]:
# 1. Определяем тип задачи.

In [None]:
# Step 1 answer: state the candidate model families for this task.
print('Task type: logistic regression or SVM')

In [None]:
# 2. Создаём фреймы и выделяем часть датасета на CV.

In [None]:
# Load the original training data and inspect it: column summary first,
# then the first rows (the last expression is rendered as the cell output).
originalTrainDf = pd.read_csv(trainPath)
originalTrainDf.info()
originalTrainDf.head()

In [None]:
# Load the original test data and inspect it: column summary first,
# then the first rows (the last expression is rendered as the cell output).
originalTestDf = pd.read_csv(testPath)
originalTestDf.info()
originalTestDf.head()

In [None]:
# Build the train / CV frames from one seeded pseudo-random sample.
# The sample is drawn once and both the feature frame and the target frame
# are derived from it, so they are guaranteed to cover the same rows.
trainSample = originalTrainDf.sample(frac=(1 - cvFraction), random_state=randomCeed)
# Everything that was not sampled for training becomes the CV split.
cvSample = originalTrainDf.drop(trainSample.index)

trainDf = trainSample.drop('mutation', axis=1)
trainDfTarget = trainSample[['ID', 'mutation']]
cvDf = cvSample.drop('mutation', axis=1)
cvDfTarget = cvSample[['ID', 'mutation']]
# The test set is used as loaded (same object, not a copy).
testDf = originalTestDf

# The two splits must partition the original training rows.
assert len(trainDf) + len(cvDf) == len(originalTrainDf)

In [None]:
# Shape of the training CSV. Reuses the frame already loaded into
# originalTrainDf instead of parsing the file a second time.
print('Original train data:')
originalTrainDf.shape

In [None]:
# Shape of the test CSV. Reuses the frame already loaded into
# originalTestDf instead of parsing the file a second time.
print('Original test data:')
originalTestDf.shape

In [None]:
# (rows, columns) of the training features after the split.
print('Train data:')
trainDf.shape

In [None]:
# (rows, columns) of the training target frame ('ID' and 'mutation').
print('Train target data:')
trainDfTarget.shape

In [None]:
# (rows, columns) of the cross-validation features.
print('CV data:')
cvDf.shape

In [None]:
# (rows, columns) of the cross-validation target frame ('ID' and 'mutation').
print('CV target data:')
cvDfTarget.shape

In [None]:
# (rows, columns) of the test frame.
print('Test data:')
testDf.shape

In [None]:
# 3. Определяем тип переменных в датасете.

In [None]:
# Step 3: column dtypes and non-null counts of the training features and
# of the training target frame.
print('Train data types:')
trainDf.info()
print('Train target data types:')
trainDfTarget.info()

In [None]:
# 4. Если это необходимо, провести препроцессинг данных. Нужно ли применять алгоритмы понижения размерности?
# Нужно ли убирать аномалии?

In [None]:
# One-hot encode the object (text) columns; pandas appends the generated
# dummy columns at the end of the frame.
# NOTE(review): if 'ID' is stored as text it is one-hot encoded here too -
# confirm that is intended.
dummieCounter = 0
for col in trainDf.columns:
    if trainDf[col].dtypes == object:
        # Compute the distinct values once and reuse the count.
        uniqueCount = len(trainDf[col].unique())
        dummieCounter += uniqueCount
        print('Unique in ' + str(col) + ': ' + str(uniqueCount))
print('Dummie columns: ' + str(dummieCounter))

trainDf = pd.get_dummies(trainDf)
# Encoding each frame on its own yields a different column set whenever a
# category is missing from (or exclusive to) one of the splits. Align CV and
# test to the training columns: dummies absent from a split are filled with 0,
# dummies unseen during training are dropped.
cvDf = pd.get_dummies(cvDf).reindex(columns=trainDf.columns, fill_value=0)
testDf = pd.get_dummies(testDf).reindex(columns=trainDf.columns, fill_value=0)

In [None]:
# (rows, columns) of the training features after one-hot encoding.
print("TrainDF: ")
trainDf.shape

In [None]:
# (rows, columns) of the cross-validation features after one-hot encoding.
print('CvDF: ')
cvDf.shape

In [None]:
# (rows, columns) of the test features after one-hot encoding.
print("TestDF: ")
testDf.shape

In [None]:
def time_series_plot(df):
    """Plot every numeric column as a daily, monthly and yearly time series.

    For each datetime64 column of ``df`` the numeric columns are indexed by
    that column, resampled at each frequency with ``sum``, and every numeric
    column is drawn on its own figure. Nothing is plotted when ``df`` has no
    datetime64 column.

    Args:
        df: pandas DataFrame to plot.

    Returns:
        None. Figures are emitted through ``plt.show()``.
    """
    print("\nTo check time series of numeric data  by daily, monthly and yearly frequency")
    datetime_cols = df.select_dtypes(include='datetime64').columns
    if len(datetime_cols) == 0:
        return

    numeric_cols = list(df.select_dtypes(include=np.number).columns)
    # NOTE(review): recent pandas releases deprecate the 'M' / 'Y' aliases in
    # favour of 'ME' / 'YE' - confirm against the pandas version in use.
    frequencies = (
        ('D', "Plotting daily data"),
        ('M', "Plotting monthly data"),
        ('Y', "Plotting yearly data"),
    )

    def _thousands(value, _position):
        # Tick label as an integer with thousands separators.
        return format(int(value), ',')

    for date_col in datetime_cols:
        # Only the numeric columns are plotted, so only they are resampled:
        # summing text or other datetime columns is meaningless and can raise.
        indexed = df.set_index(date_col)[numeric_cols]
        for rule, message in frequencies:
            print(message)
            # The resampled frame is the same for every numeric column, so it
            # is computed once per frequency instead of once per plot.
            resampled = indexed.resample(rule).sum()
            for num_col in numeric_cols:
                ax = resampled[[num_col]].plot()
                ax.set_ylim(bottom=0)
                ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(_thousands))
                plt.show()


def numeric_eda(df, hue=None):
    """Show descriptive statistics and distribution plots for numeric columns.

    Displays ``df.describe()`` (transposed), one box plot per numeric column,
    a violin plot for every (numeric column, 'category' column) pair, and a
    pair plot of the numeric columns.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name used to colour the pair plot.

    Returns:
        None. Figures are emitted through ``plt.show()``.
    """
    print("\nTo check: \nDistribution of numeric data")
    display(df.describe().T)

    numeric_df = df.select_dtypes(include=np.number)
    columns = numeric_df.columns
    if len(columns) == 0:
        # A subplot grid with zero columns cannot be created and there is
        # nothing to pair-plot, so stop instead of raising.
        print("\nNo numeric columns found")
        return

    # One row of box plots, one axes per numeric column. squeeze=False keeps
    # `axes` two-dimensional even when there is a single column.
    figure, axes = plt.subplots(1, len(columns), figsize=(20, 10), squeeze=False)
    for ax, col in zip(axes[0], columns):
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'}, ax=ax)
    figure.tight_layout()
    plt.show()

    # Distribution of every numeric column within each 'category' column.
    category_cols = df.select_dtypes(include='category').columns
    for col_num in columns:
        for col in category_cols:
            grid = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            grid.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None:
        sns.pairplot(numeric_df)
    else:
        # A numeric hue column is already part of numeric_df; joining it again
        # would fail on the overlapping column name.
        if hue in numeric_df.columns:
            pair_data = numeric_df
        else:
            pair_data = numeric_df.join(df[[hue]])
        sns.pairplot(pair_data, hue=hue)
    plt.show()


def top5(df):
    """Print the five most frequent values of every object/category column.

    For each selected column a small table is printed with the value in a
    column named after the source column and its frequency in ``Count``.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. The tables are written with ``print``.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        # str() keeps this working for non-string column labels.
        print("Top 5 unique values of " + str(col))
        top_counts = df[col].value_counts().head(5)
        # Name the index and the value column explicitly. The default names
        # produced by reset_index() differ between pandas versions, which made
        # the previous rename label the columns incorrectly on newer pandas.
        print(top_counts.rename_axis(col).reset_index(name="Count"))
        print(" ")


def categorical_eda(df, hue=None):
    """Summarise the non-numeric columns of ``df`` and plot category counts.

    Prints the number of distinct values of every object/category column,
    delegates to ``top5`` for the most frequent values, then draws one count
    plot per column of 'category' dtype.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name forwarded to ``sns.catplot`` as ``hue``.

    Returns:
        None.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # Count distribution of each 'category' column.
    # NOTE(review): object-dtype columns are summarised above but not plotted
    # here - confirm that is intended.
    category_names = df.select_dtypes(include='category').columns
    for name in category_names:
        grid = sns.catplot(x=name, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()


def eda(df):
    """Run the full exploratory data analysis on a DataFrame.

    Shows a preview, the ``info()`` summary, rows containing nulls (with a
    missingno matrix), duplicate statistics, and then runs the categorical,
    numeric and time-series analyses.

    Args:
        df: pandas DataFrame (or subclass) to analyse. It is not modified;
            blank-string cells are replaced with NaN on a copy.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    # isinstance also accepts DataFrame subclasses, unlike an exact type check.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() writes its report itself and returns None; wrapping it in print()
    # appended a stray "None" line to the output.
    df.info()

    # generate preview of entries with null values
    null_mask = df.isnull()
    if null_mask.any(axis=None):
        print("\nPreview of data with null values:")
        display(df[null_mask.any(axis=1)].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    print('cat')
    # EDA of categorical data
    categorical_eda(df)

    print('num')
    # EDA of numeric data
    numeric_eda(df)
    print('time')
    # Plot time series plot of numeric data
    time_series_plot(df)

In [None]:
# Run the EDA on the raw rows of the training split rather than on trainDf:
# trainDf has already been one-hot encoded, which leaves the categorical EDA
# with nothing to analyse and runs the numeric plots over dummy columns.
# trainDf keeps the row index of originalTrainDf, so .loc selects the same rows.
eda(originalTrainDf.loc[trainDf.index])

In [None]:
# 5. Провести EDA и вывести какие-то умозаключения и посмотреть на распределения признаков, на корреляции, на выбросы.

In [None]:
# 6. Подумать над вариантом модели, для того чтобы решить задачу (либо ансамблем моделей)

In [None]:
# Step 6 answer: model choice. Fixes the "SMV" typo in the printed message.
print('Так как n << m, лучше использовать логистическую регрессию, либо SVM without kernel.')

In [None]:
# 7. Подумать, нужно ли применять Unsupervised learning подход для решения задачи.
# Необходима ли дополнительная информация?

In [None]:
# 8. Обучить модель и вывести валидационный скор по метрике качества.