In [1]:
import os
from os import listdir
import pandas as pd
import numpy as np
import glob
import tqdm
from typing import Dict
import matplotlib.pyplot as plt
%matplotlib inline

#plotly

import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

#color
from colorama import Fore, Back, Style

import seaborn as sns
sns.set(style="whitegrid")

#pydicom
import pydicom

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# Settings for pretty nice plots
plt.style.use('fivethirtyeight')
plt.show()

In [12]:
###(bs) Column names and dtypes check ###
# Load the train and test tables, then report each table's columns,
# non-null counts and dtypes.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
train_df.info()
print("====================================================================")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        1549 non-null   object 
 1   Weeks          1549 non-null   int64  
 2   FVC            1549 non-null   int64  
 3   Percent        1549 non-null   float64
 4   Age            1549 non-null   int64  
 5   Sex            1549 non-null   object 
 6   SmokingStatus  1549 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 84.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4   Age            5 non-null      int64  
 5   Sex            5 

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training table.
train_df.describe()

Unnamed: 0,Weeks,FVC,Percent,Age
count,1549.0,1549.0,1549.0,1549.0
mean,31.861846,2690.479019,77.672654,67.188509
std,23.24755,832.770959,19.823261,7.057395
min,-5.0,827.0,28.877577,49.0
25%,12.0,2109.0,62.8327,63.0
50%,28.0,2641.0,75.676937,68.0
75%,47.0,3171.0,88.621065,72.0
max,133.0,6399.0,153.145378,88.0


In [4]:
###(bs) Missing-value check: number of null entries per column in the training table ###
train_df.isnull().sum()

Patient          0
Weeks            0
FVC              0
Percent          0
Age              0
Sex              0
SmokingStatus    0
dtype: int64

In [5]:
# Same missing-value check for the test table: number of null entries per column.
test_df.isnull().sum()

Patient          0
Weeks            0
FVC              0
Percent          0
Age              0
Sex              0
SmokingStatus    0
dtype: int64

In [28]:
# Distinct patient IDs in the training table, in order of first appearance.
train_df['Patient'].unique()

array(['ID00007637202177411956430', 'ID00009637202177434476278',
       'ID00010637202177584971671', 'ID00011637202177653955184',
       'ID00012637202177665765362', 'ID00014637202177757139317',
       'ID00015637202177877247924', 'ID00019637202178323708467',
       'ID00020637202178344345685', 'ID00023637202179104603099',
       'ID00025637202179541264076', 'ID00026637202179561894768',
       'ID00027637202179689871102', 'ID00030637202181211009029',
       'ID00032637202181710233084', 'ID00035637202182204917484',
       'ID00038637202182690843176', 'ID00042637202184406822975',
       'ID00047637202184938901501', 'ID00048637202185016727717',
       'ID00051637202185848464638', 'ID00052637202186188008618',
       'ID00060637202187965290703', 'ID00061637202188184085559',
       'ID00062637202188654068490', 'ID00067637202189903532242',
       'ID00068637202190879923934', 'ID00072637202198161894406',
       'ID00073637202198167792918', 'ID00075637202198610425520',
       'ID000766372021990

In [35]:
# Number of rows recorded for each patient in the training table,
# sorted from most rows to fewest.
train_df['Patient'].value_counts()

ID00119637202215426335765    10
ID00400637202305055099402    10
ID00421637202311550012437    10
ID00099637202206203080121    10
ID00105637202208831864134    10
                             ..
ID00052637202186188008618     7
ID00128637202219474716089     7
ID00276637202271694539978     7
ID00047637202184938901501     6
ID00267637202270790561585     6
Name: Patient, Length: 176, dtype: int64

In [36]:
# Number of distinct patients in the training table.
train_df['Patient'].nunique()



176

In [8]:
# Tally every file and every directory found anywhere below the training
# image folder (one folder per patient is assumed; confirm against the dataset).
path = "../input/train/"

files = 0
folders = 0
# os.walk yields a (dirpath, dirnames, filenames) triple for each directory it
# visits; the directory path itself is not needed, hence the throwaway "_".
for _, subdir_names, file_names in os.walk(path):
    folders += len(subdir_names)
    files += len(file_names)

# Colour the two totals, resetting the terminal style after each one.
files_text = Fore.YELLOW + f'{files:,}'
folders_text = "files/images, " + Fore.BLUE + f'{folders:,}'
print(files_text, Style.RESET_ALL, folders_text, Style.RESET_ALL, 'folders/patients')

[33m33,026 [0m files/images, [34m176 [0m folders/patients


In [31]:
# Build patient_df: one row per patient folder, holding the number of files in
# that folder plus the Age / Sex / SmokingStatus values taken from the
# patient's first row in train_df.
train_dir = '../input/train/'
test_dir = '../input/test/'

# Every entry under train_dir is treated as a patient folder. Sorting makes the
# row order deterministic, since os.listdir returns entries in arbitrary order.
patient_ids = sorted(os.listdir(train_dir))

# Index train_df by patient once instead of re-filtering the whole table for
# every patient. drop_duplicates keeps each patient's first row, which is the
# same row a per-patient filter followed by [0] selects.
first_rows = train_df.drop_duplicates(subset='Patient').set_index('Patient')
# .loc with a list of labels raises KeyError if a folder has no row in train_df.
first_rows = first_rows.loc[patient_ids]

# Per-patient columns, kept as plain lists aligned with patient_ids.
no_of_instances = [
    len(os.listdir(os.path.join(train_dir, patient_id)))
    for patient_id in patient_ids
]
age = first_rows['Age'].tolist()
sex = first_rows['Sex'].tolist()
smoking_status = first_rows['SmokingStatus'].tolist()

# Assemble the per-patient table.
patient_df = pd.DataFrame({
    'Patient': patient_ids,
    'no_of_instances': no_of_instances,
    'Age': age,
    'Sex': sex,
    'SmokingStatus': smoking_status,
})
# info() prints its own report and returns None, so it is not wrapped in print().
patient_df.info()
patient_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Patient          176 non-null    object
 1   no_of_instances  176 non-null    int64 
 2   Age              176 non-null    int64 
 3   Sex              176 non-null    object
 4   SmokingStatus    176 non-null    object
dtypes: int64(2), object(3)
memory usage: 7.0+ KB
None


Unnamed: 0,Patient,no_of_instances,Age,Sex,SmokingStatus
0,ID00007637202177411956430,30,79,Male,Ex-smoker
1,ID00009637202177434476278,394,69,Male,Ex-smoker
2,ID00010637202177584971671,106,60,Male,Ex-smoker
3,ID00011637202177653955184,31,72,Male,Ex-smoker
4,ID00012637202177665765362,49,65,Male,Never smoked


In [40]:
# Exploring the SmokingStatus column: number of unique patients in each
# category, most common first.
patient_df['SmokingStatus'].value_counts()

Ex-smoker           118
Never smoked         49
Currently smokes      9
Name: SmokingStatus, dtype: int64

In [56]:
# Share of each SmokingStatus category among the unique patients, shown as
# whole-number percentage strings in a one-column table.
status_counts = patient_df['SmokingStatus'].value_counts()
# Named total_patients rather than "sum" so the builtin sum() is not shadowed
# for every cell that runs afterwards.
total_patients = status_counts.sum()
smokepercent = [round((count / total_patients) * 100) for count in status_counts]
smokepercent_0 = [str(perc) + "%" for perc in smokepercent]
# Row labels come from the counted categories themselves, so each percentage
# stays attached to the right category whatever order value_counts() returns.
sm_df = pd.DataFrame(data=smokepercent_0, index=status_counts.index.tolist(), columns=['A'])
sm_df

Unnamed: 0,A
ex-smoker,67%
Never smoked,28%
Currently smokes,5%


In [57]:
# Interactive bar chart: number of unique patients per SmokingStatus category.
smoking_status_counts = patient_df['SmokingStatus'].value_counts()

# Appearance options, kept apart from the labels that describe the data.
bar_style = dict(
    linecolor='black',
    opacity=0.7,
    color='blue',
    theme='pearl',
    bargap=0.5,
    gridcolor='white',
)

smoking_status_counts.iplot(
    kind='bar',
    yTitle='Counts',
    title='Distribution of the SmokingStatus column in the Unique Patient Set',
    **bar_style,
)

In [58]:
# Weeks column: number of training rows recorded at each week value,
# most frequent week first.
train_df['Weeks'].value_counts()

 8     45
 12    44
 18    42
 10    41
 6     40
       ..
 95     1
 93     1
 91     1
 88     1
-2      1
Name: Weeks, Length: 112, dtype: int64

In [59]:
# Interactive horizontal bar chart: number of training rows per week value.
rows_per_week = train_df['Weeks'].value_counts()

# Appearance options, kept apart from the labels that describe the data.
barh_style = dict(
    linecolor='black',
    opacity=0.7,
    color='#FB8072',
    theme='pearl',
    bargap=0.2,
    gridcolor='white',
)

rows_per_week.iplot(
    kind='barh',
    xTitle='Counts(Weeks)',
    title='Distribution of the Weeks in the training set',
    **barh_style,
)

In [62]:
# Interactive histogram of the raw Weeks values in the training table.
weeks = train_df['Weeks']

# Appearance options, kept apart from the labels that describe the data.
hist_style = dict(
    linecolor='black',
    opacity=0.5,
    color='#FB8072',
    theme='pearl',
    bargap=0.2,
    gridcolor='white',
)

weeks.iplot(
    kind='hist',
    xTitle='Weeks',
    yTitle='Counts',
    title='Distribution of the Weeks in the training set',
    **hist_style,
)

In [63]:
# Scatter plot with one point per training row: Weeks on the x axis,
# Age on the y axis, points coloured by Sex.
fig = px.scatter(
    data_frame=train_df,
    x="Weeks",
    y="Age",
    color='Sex',
)
fig.show()