<a href="https://colab.research.google.com/github/doyoung1122/24-2/blob/Introduction-of-A.I/Predicting_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import datasets
import xgboost as xgb
import seaborn as sns

In [2]:
def nulls_by_col(df):
# Calculate the number and percent of null values in each column.
    num_missing = df.isnull().sum()
    rows = df.shape[0]
    pct_missing = num_missing/rows
    cols_missing = pd.DataFrame({'num_rows_missing': num_missing, 'pct_rows_missing': pct_missing})
    return cols_missing

In [3]:
def nulls_by_row(df):
# Calculate the number of percent of null values in each row.
    num_cols_missing = df.isnull().sum(axis=1)
    pct_cols_missing = df.isnull().sum(axis=1)/df.shape[1]*100
    rows_missing = pd.DataFrame({'num_cols_missing': num_cols_missing, 'pct_cols_missing': pct_cols_missing}).reset_index().groupby(['num_cols_missing','pct_cols_missing']).count().rename(index=str, columns={'index': 'num_rows'}).reset_index()
    return rows_missing

In [4]:
def df_summary(df):
# Print information about the data including its shape, datatypes, number of values,
# number of null values in each row and column, the number of unique rows, etc.
    print('--- Shape: {}'.format(df.shape))
    print('\n--- Info')
    display(df.info())
    print('\n--- Descriptions')
    display(df.describe(include='all'))
    print('\n--- Nulls By Column')
    display(nulls_by_col(df))
    print('\n--- Nulls By Row')
    display(nulls_by_row(df))
    print('\n---Unique Rows')
    display(df.apply(lambda x: x.nunique()))

In [5]:
def get_scaled_df(df):
# Return a dataframe that contains only numeric data so that we can scale it for XGBoost.
# This is not necessary for this data as it is already scaled, but it is part of a
# pre-existing function that I wrote so I am leaving it here.
    numerics = ['int64', 'float64', 'float']
    scaled_df = df.select_dtypes(include=numerics)
    col = scaled_df.columns
    scaled_df = preprocessing.scale(scaled_df)
    scaled_df = pd.DataFrame(scaled_df, columns=col)
    return scaled_df

In [14]:
def xgb_rank(df,target_variable,feature_percent=80,mode='gain'):
    '''
    This function receives a dataframe and the target variable, and then returns
    a sorted feature list, a sorted scaled feature list, and a dataframe.

    For the input parameters:
        - feature_percent is the optional cut-off (default is 80 percent) for features
        - mode is optional. The default value is 'gain' which shows the importance.
          Another possible value for mode is 'weight.'

     For the returned:
        - feature_list, scaled_features: lists of features, both including those that
          satisfy the cumulative percentage limit.
        - scaled_df: dataframe that has all features in decending order
        - importance_df: dataframe showing all cumulative percent rankings
    '''

    scaled_df = get_scaled_df(df)
    xgb_params = {'max_depth': 8,'seed' : 123}
    # Convert scaled_df.columns.values to a list of strings
    feature_names = list(scaled_df.columns.values.astype(str))
    dtrain = xgb.DMatrix(scaled_df, target_variable, feature_names=feature_names)
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)
    importance_dict = model.get_score(importance_type=mode)
    sorted_importance_dict = sorted(importance_dict.items(), key=lambda kv: kv[1])
    importance_df = pd.DataFrame.from_dict(sorted_importance_dict)
    importance_df.columns = ['feature',mode]
    importance_df.sort_values(mode, inplace = True)
    importance_df['rank'] = importance_df[mode].rank(ascending = False)
    importance_df.sort_values('rank', inplace = True)
    importance_df.set_index('rank', inplace = True)
    importance_df.reset_index(inplace=True)
    importance_df[mode] = importance_df[mode].apply(lambda x: round(x, 2))
    importance_df['cum_sum'] = round(importance_df[mode].cumsum(),2)
    importance_df['cum_perc'] = round(100*importance_df.cum_sum/importance_df[mode].sum(),2)
    feature_list = []
    scaled_features = []

    for i in range((importance_df.shape[0])):

        feature_name = importance_df.iloc[i,1].replace('scaled_','')
        scaled_name = 'scaled_' + feature_name
        importance_df.iloc[i,1] = feature_name
        cum_percent = importance_df.iloc[i,4]

        if cum_percent > feature_percent:
            break
        else:
            feature_list.append(feature_name)
            scaled_features.append(scaled_name)
    return feature_list, scaled_features, scaled_df, importance_df

In [15]:
diabetes = datasets.load_diabetes() # load data
data = np.c_[diabetes.data, diabetes.target]
columns = np.append(diabetes.feature_names, 'target')
df = pd.DataFrame(data, columns=columns)
print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [16]:
df_summary(df)

--- Shape: (442, 11)

--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


None


--- Descriptions


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0



--- Nulls By Column


Unnamed: 0,num_rows_missing,pct_rows_missing
age,0,0.0
sex,0,0.0
bmi,0,0.0
bp,0,0.0
s1,0,0.0
s2,0,0.0
s3,0,0.0
s4,0,0.0
s5,0,0.0
s6,0,0.0



--- Nulls By Row


Unnamed: 0,num_cols_missing,pct_cols_missing,num_rows
0,0,0.0,442



---Unique Rows


Unnamed: 0,0
age,58
sex,2
bmi,163
bp,100
s1,141
s2,302
s3,63
s4,66
s5,184
s6,56


In [17]:
X = df.drop(columns=['target'])
y = df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [18]:
feature_list, scaled_features, scaled_df, importance_df = xgb_rank(X_train, y_train)

Parameters: { "silent" } are not used.



In [19]:
print('feature_list: ', feature_list, '\n')
print('scaled_features: ', scaled_features, '\n')
print('\nscaled_df:')
display(scaled_df.head())
print('\ny_train:')
display(y_train.head())
print('\nimportance_df:')
display(importance_df)

feature_list:  ['s5', 'bmi', 'bp', 's3'] 

scaled_features:  ['scaled_s5', 'scaled_bmi', 'scaled_bp', 'scaled_s3'] 


scaled_df:


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.243962,1.025829,0.063645,1.210733,0.590736,0.383248,0.879258,-0.830724,0.075103,1.492971
1,-2.231571,-0.974821,-1.567897,-0.545554,-1.891763,-2.024327,0.567203,-1.598664,-0.86972,-0.10202
2,0.544026,1.025829,-0.70803,0.698483,-0.227028,0.05071,-0.212936,-0.062784,-0.306027,-1.025437
3,-0.131119,1.025829,0.041597,-1.78959,-0.227028,-0.335034,1.035286,-0.830724,-0.841247,-1.780959
4,1.369204,1.025829,0.438459,-0.106482,0.444707,0.57612,-0.056908,-0.062784,0.175782,0.065873



y_train:


Unnamed: 0,target
74,85.0
26,137.0
45,53.0
389,51.0
154,197.0



importance_df:


Unnamed: 0,rank,feature,gain,cum_sum,cum_perc
0,1.0,s5,5616.95,5616.95,47.28
1,2.0,bmi,1726.68,7343.63,61.81
2,3.0,bp,876.9,8220.53,69.19
3,4.0,s3,755.18,8975.71,75.55
4,5.0,s2,649.69,9625.4,81.01
5,6.0,s1,609.39,10234.79,86.14
6,7.0,s6,540.7,10775.49,90.69
7,8.0,sex,458.25,11233.74,94.55
8,9.0,s4,333.9,11567.64,97.36
9,10.0,age,313.6,11881.24,100.0


In [20]:
full_scaled_df = scaled_df.copy()
full_scaled_df['target'] = preproc

NameError: name 'preproc' is not defined