### Закрепление на практике темы "Feature engineering and preprocessing".

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

import category_encoders as ce

plt.style.use('seaborn-colorblind')
%matplotlib inline

In [3]:
use_cols = ['color', 'director_name', 'num_critic_for_reviews', 'duration', 'actor_2_name', 'gross', 'genres', 'num_user_for_reviews',
          'language', 'country', 'content_rating', 'budget', 'title_year', 'imdb_score', 'movie_facebook_likes']

In [19]:
data = pd.read_csv('MovieAssignmentData.csv', usecols=use_cols)
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000
1,Color,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0
2,Color,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000
3,Color,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000
4,,Doug Walker,,,Rob Walker,,Documentary,,,,,,,7.1,0


In [20]:
print(data.shape)

(5043, 15)


## Пропущенные значения

### Проверка пропущенных значений

In [5]:
def check_missing(data, output_path=None):
    result = pd.concat([data.isnull().sum(), data.isnull().mean()], axis=1)
    result = result.rename(index=str, columns={0:'total missing',1:'proportion'})
    if output_path is not None:
        result.to_csv(output_path+'missing.csv')
        print(output_path, 'missing.csv')
    return result

In [10]:
check_missing(data=data)

Unnamed: 0,total missing,proportion
color,19,0.003768
director_name,104,0.020623
num_critic_for_reviews,50,0.009915
duration,15,0.002974
actor_2_name,13,0.002578
gross,884,0.175292
genres,0,0.0
num_user_for_reviews,21,0.004164
language,12,0.00238
country,5,0.000991


### Удаление пропущенных значений

In [15]:
def drop_missing(data, axis=0):
    data_copy = data.copy()
    data_copy = data_copy.dropna(axis=axis, inplace=False)
    return data_copy

In [16]:
data2 = drop_missing(data=data)
data2.shape

(3833, 15)

### Добавление переменной оценки пропущенных значений

In [32]:
def add_var_denote_NA(data, NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(), 1, 0)
        else:
              warn("Нет пропущенных значений" % i)          
    return data_copy
    

In [34]:
data3 = add_var_denote_NA(data=data,NA_col=['budget'])
print(data3.budget_is_NA.value_counts())
data3.head(5)

0    4551
1     492
Name: budget_is_NA, dtype: int64


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,ggg,budget_is_NA
0,Color,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,0
1,Color,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0,0,0
2,Color,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000,0,0
3,Color,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000,0,0
4,,Doug Walker,,,Rob Walker,,Documentary,,,,,,,7.1,0,1,1


### Заполнение пропусков выборочным значением

In [38]:
def impute_NA_with_arbitrary(data, impute_value, NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy

In [39]:
data4 = impute_NA_with_arbitrary(data=data, impute_value = -999999, NA_col=['budget'])
data4.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,ggg,budget_-999999
0,Color,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,237000000.0
1,Color,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0,0,300000000.0
2,Color,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000,0,245000000.0
3,Color,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000,0,250000000.0
4,,Doug Walker,,,Rob Walker,,Documentary,,,,,,,7.1,0,1,-999999.0


### Заполнение пропущенных значений средним/медианой/модой

In [48]:
def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            if strategy=='mean':
                data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
            elif strategy=='median':
                data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
            elif strategy=='mode':
                data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy  

In [49]:
print(data.budget.median())
data5 = impute_NA_with_avg(data=data, strategy='median', NA_col=['budget'])
data5.head(8)

20000000.0


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,ggg,budget_impute_median
0,Color,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,237000000.0
1,Color,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0,0,300000000.0
2,Color,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000,0,245000000.0
3,Color,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000,0,250000000.0
4,,Doug Walker,,,Rob Walker,,Documentary,,,,,,,7.1,0,1,20000000.0
5,Color,Andrew Stanton,462.0,132.0,Samantha Morton,73058679.0,Action|Adventure|Sci-Fi,738.0,English,USA,PG-13,263700000.0,2012.0,6.6,24000,0,263700000.0
6,Color,Sam Raimi,392.0,156.0,James Franco,336530303.0,Action|Adventure|Romance,1902.0,English,USA,PG-13,258000000.0,2007.0,6.2,0,0,258000000.0
7,Color,Nathan Greno,324.0,100.0,Donna Murphy,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,387.0,English,USA,PG,260000000.0,2010.0,7.8,29000,0,260000000.0


### Заполнение пропусков значением из "хвоста" распределения

In [50]:
def impute_NA_with_end_of_distribution(data, NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy 

In [51]:
data6 = impute_NA_with_end_of_distribution(data=data,NA_col=['budget'])
data6.head(8)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,ggg,budget_impute_end_of_distri
0,Color,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,237000000.0
1,Color,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0,0,300000000.0
2,Color,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000,0,245000000.0
3,Color,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000,0,250000000.0
4,,Doug Walker,,,Rob Walker,,Documentary,,,,,,,7.1,0,1,658097300.0
5,Color,Andrew Stanton,462.0,132.0,Samantha Morton,73058679.0,Action|Adventure|Sci-Fi,738.0,English,USA,PG-13,263700000.0,2012.0,6.6,24000,0,263700000.0
6,Color,Sam Raimi,392.0,156.0,James Franco,336530303.0,Action|Adventure|Romance,1902.0,English,USA,PG-13,258000000.0,2007.0,6.2,0,0,258000000.0
7,Color,Nathan Greno,324.0,100.0,Donna Murphy,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,387.0,English,USA,PG,260000000.0,2010.0,7.8,29000,0,260000000.0


### Заполнение пропусков случайными значениями

In [52]:
def impute_NA_with_random(data,NA_col=[],random_state=0):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_random'] = data_copy[i]
            random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
            random_sample.index = data_copy[data_copy[i].isnull()].index
            data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy 

In [53]:
data7 = impute_NA_with_random(data=data,NA_col=['budget'])
data7.head(8)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,ggg,budget_random
0,Color,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,237000000.0
1,Color,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0,0,300000000.0
2,Color,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000,0,245000000.0
3,Color,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000,0,250000000.0
4,,Doug Walker,,,Rob Walker,,Documentary,,,,,,,7.1,0,1,12000000.0
5,Color,Andrew Stanton,462.0,132.0,Samantha Morton,73058679.0,Action|Adventure|Sci-Fi,738.0,English,USA,PG-13,263700000.0,2012.0,6.6,24000,0,263700000.0
6,Color,Sam Raimi,392.0,156.0,James Franco,336530303.0,Action|Adventure|Romance,1902.0,English,USA,PG-13,258000000.0,2007.0,6.2,0,0,258000000.0
7,Color,Nathan Greno,324.0,100.0,Donna Murphy,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,387.0,English,USA,PG,260000000.0,2010.0,7.8,29000,0,260000000.0


## Выбросы

### Детекция с помощью выборочных значений

In [62]:
def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
    para = (upper_fence, lower_fence)
    tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))    
    return outlier_index, para

In [73]:
index,para = outlier_detect_arbitrary(data=data,col='budget',upper_fence=200000000,lower_fence=1000000)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 369
Доля выбросов: 0.07317073170731707
Верхняя граница: 200000000 
Нижняя граница: 1000000


In [74]:
data.loc[index,'budget'].sort_values()

4799    2.180000e+02
5042    1.100000e+03
5040    1.400000e+03
5036    3.250000e+03
5026    4.500000e+03
            ...     
2334    2.127520e+09
2323    2.400000e+09
3005    2.500000e+09
3859    4.200000e+09
2988    1.221550e+10
Name: budget, Length: 369, dtype: float64

### Интерквартильное расстояние

In [75]:
def outlier_detect_IQR(data,col,threshold=3):    
    IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
    Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
    Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
    para = (Upper_fence, Lower_fence)
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [81]:
index,para = outlier_detect_IQR(data=data,col='budget',threshold=5)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 27
Доля выбросов: 0.005353955978584176
Верхняя граница: 240000000.0 
Нижняя граница: -189000000.0


### Среднее-среднеквадратичное отклонение

In [82]:
def outlier_detect_mean_std(data,col,threshold=3):
    Upper_fence = data[col].mean() + threshold * data[col].std()
    Lower_fence = data[col].mean() - threshold * data[col].std()   
    para = (Upper_fence, Lower_fence)   
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [83]:
index,para = outlier_detect_mean_std(data=data,col='budget',threshold=3)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 9
Доля выбросов: 0.001784651992861392
Верхняя граница: 658097315.7824392 
Нижняя граница: -578592074.9096639


### Замена выброса выборочным значением

In [87]:
def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
    data_copy = data.copy(deep=True)
    for i in col:
        data_copy.loc[outlier_index,i] = value
    return data_copy

In [93]:
data2 = impute_outlier_with_arbitrary(data=data,outlier_index=index,value=-9999999,col=['budget'])

### Удаление выбросов

In [94]:
def drop_outlier(data,outlier_index):
    data_copy = data[~outlier_index]
    return data_copy

In [95]:
data4 = drop_outlier(data=data,outlier_index=index)
print(data4.budget.max())
print(data4.budget.min())

600000000.0
218.0


### Замена выбросов средним/медианой/модой

In [96]:
def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
    data_copy = data.copy(deep=True)
    if strategy=='mean':
        data_copy.loc[outlier_index,col] = data_copy[col].mean()
    elif strategy=='median':
        data_copy.loc[outlier_index,col] = data_copy[col].median()
    elif strategy=='mode':
        data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]           
    return data_copy

In [97]:
data5 = impute_outlier_with_avg(data=data,col='budget', outlier_index=index,strategy='mean')

## Шкалирование данных

In [99]:
X_train, X_test, y_train, y_test = train_test_split(data, data.imdb_score, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((4034, 16), (1009, 16))

### Нормализация данных

In [101]:
ss = StandardScaler().fit(X_train[['budget']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['budget_zscore'] = ss.transform(X_train_copy[['budget']])
print(X_train_copy.head(3))

      color    director_name  num_critic_for_reviews  duration  \
3052  Color  Gregory Poirier                    75.0      95.0   
528   Color      Michael Bay                   122.0     136.0   
1119  Color     John Schultz                    82.0      86.0   

        actor_2_name        gross                                  genres  \
3052      Jake Busey   13558739.0                                  Comedy   
528    Michael Biehn  134006721.0               Action|Adventure|Thriller   
1119  Carter Jenkins   25200412.0  Adventure|Comedy|Family|Fantasy|Sci-Fi   

      num_user_for_reviews language country content_rating      budget  \
3052                 121.0  English     USA              R  11000000.0   
528                  415.0  English     USA              R  75000000.0   
1119                  55.0  English     USA             PG  45000000.0   

      title_year  imdb_score  movie_facebook_likes  ggg  budget_zscore  
3052      2001.0         5.3                   449    0 

In [102]:
print(X_train_copy['budget_zscore'].mean())
print(X_train_copy['budget_zscore'].std())

1.5960599124605062e-16
1.0001373154829645


### Мин-Макс шкалирование

In [103]:
mms = MinMaxScaler().fit(X_train[['budget']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['budget_minmax'] = mms.transform(X_train_copy[['budget']])
print(X_train_copy.head(3))

      color    director_name  num_critic_for_reviews  duration  \
3052  Color  Gregory Poirier                    75.0      95.0   
528   Color      Michael Bay                   122.0     136.0   
1119  Color     John Schultz                    82.0      86.0   

        actor_2_name        gross                                  genres  \
3052      Jake Busey   13558739.0                                  Comedy   
528    Michael Biehn  134006721.0               Action|Adventure|Thriller   
1119  Carter Jenkins   25200412.0  Adventure|Comedy|Family|Fantasy|Sci-Fi   

      num_user_for_reviews language country content_rating      budget  \
3052                 121.0  English     USA              R  11000000.0   
528                  415.0  English     USA              R  75000000.0   
1119                  55.0  English     USA             PG  45000000.0   

      title_year  imdb_score  movie_facebook_likes  ggg  budget_minmax  
3052      2001.0         5.3                   449    0 

In [104]:
print(X_train_copy['budget_minmax'].max())
print(X_train_copy['budget_minmax'].min())

1.0
0.0


## Энкодинг переменных

### One-Hot encoding

In [117]:
data10 = data.copy(deep=True)
data10 = data[['language', 'color']]

In [118]:
data10

Unnamed: 0,language,color
0,English,Color
1,English,Color
2,English,Color
3,English,Color
4,,
...,...,...
5038,English,Color
5039,English,Color
5040,English,Color
5041,English,Color


In [119]:
data9 = pd.get_dummies(data10,drop_first=True)
data9

Unnamed: 0,language_Arabic,language_Aramaic,language_Bosnian,language_Cantonese,language_Chinese,language_Czech,language_Danish,language_Dari,language_Dutch,language_Dzongkha,...,language_Spanish,language_Swahili,language_Swedish,language_Tamil,language_Telugu,language_Thai,language_Urdu,language_Vietnamese,language_Zulu,color_Color
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Oversampling/undersampling

In [194]:
df = pd.read_csv('data.csv')

In [195]:
df['diagnosis'] = [1 if each == "M" else 0 for each in df.diagnosis]

In [196]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [197]:
df['diagnosis'] = df['diagnosis'].dropna()

In [198]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [199]:
df = df.drop(['Unnamed: 32'], axis=1)

In [200]:
X_train, X_test, y_train, y_test = train_test_split(df, df.diagnosis, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((455, 32), (114, 32))

In [201]:
sm = SMOTE(random_state=42)

In [202]:
X_res, y_res = sm.fit_resample(X_train, y_train)

In [203]:
len(y_res)

580

In [204]:
cn = CondensedNearestNeighbour(random_state=42)

In [205]:
X_res, y_res = cn.fit_resample(X_train, y_train)

In [206]:
len(y_res)

255