In [140]:
# import library
import pandas as pd

# Pandas Series

In [141]:
# Build a labelled Series from a label -> value mapping (mixed ints and strings)
groceries = pd.Series({'eggs': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})

print(groceries)

eggs       30
apples      6
milk      Yes
bread      No
dtype: object


## Analysis of Pandas Series

In [142]:
# Basic structural attributes of the groceries Series

# shape: tuple with the length along each axis
print(f"groceries has shape: {groceries.shape}")

# ndim: number of dimensions
print(f"groceries has dimensions: {groceries.ndim}")

# size: total number of elements
print(f"groceries has a total of {groceries.size} elements")

goceries has shape: (4,)
groceries has dimensions: 1
groceries has a total of 4 elements


In [143]:
# to print index labels (shown as a pandas Index object)
print(f"The index of groceries is: {groceries.index}")

# to print the values (the stored data without the labels, shown as an array)
print(f"The data in groceries is: {groceries.values}")

The index of groceries is: Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')
The data in groceries is: [30 6 'Yes' 'No']


In [144]:
# checking if an index label exists in our panda series
# (`in` on a Series tests membership among its index labels)
x = 'bananas' in groceries
y = 'bread' in groceries

# Report both membership results
print(f"Is banana an index label in groceries: {x}")
print(f"Is bread an index label in groceries: {y}")

Is banana an index label in groceries: False
Is bread an index label in groceries: True


## Accessing, Modifying & Deleting elements in Pandas Series

In [145]:
# Accessing values in pandas series
# A single label returns the stored value itself
print(f"How many eggs do we need to buy: {groceries['eggs']}")

# A list of labels returns a smaller Series holding just those entries
print(f"Do we need milk and bread: \n{groceries[['milk', 'bread']]}")

How many eggs do we need to buy: 30
Do we need milk and bread: 
milk     Yes
bread     No
dtype: object


In [146]:
# Using .loc (labelled index)
# .loc selects strictly by index label; a list of labels yields a Series
print(f"How many eggs and apples do we need to buy: \n{groceries.loc[['eggs', 'apples']]}")

How many eggs and apples do we need to buy: 
eggs      30
apples     6
dtype: object


In [147]:
# Using index position
# Positional selection goes through .iloc. Plain [] with integer keys on a
# Series whose labels are strings relies on a positional fallback that pandas
# has deprecated and plans to treat as a label lookup, which would fail here.
print(f"How many eggs and apples do we need to buy: \n{groceries.iloc[[0, 1]]}")

How many eggs and apples do we need to buy: 
eggs      30
apples     6
dtype: object


In [148]:
# Using .iloc (integer location)
# .iloc selects by zero-based position; positions 2 and 3 are 'milk' and 'bread'
print(f"Do we need milk and bread: \n{groceries.iloc[[2, 3]]}")

Do we need milk and bread: 
milk     Yes
bread     No
dtype: object


In [149]:
# modifying values in pandas series
# Assigning through a label overwrites that entry of groceries in place
groceries['eggs'] = 2

print(groceries)

eggs        2
apples      6
milk      Yes
bread      No
dtype: object


In [150]:
# deleting elements using drop method (Out of place)
# drop() returns a new Series without the label; groceries itself is not changed

print(f"We remove apples out of place: \n{groceries.drop('apples')}")

print('\n')

# Printing the original again shows that 'apples' is still present
print(f'Original pandas series: \n{groceries}')

We remove apples out of place: 
eggs       2
milk     Yes
bread     No
dtype: object


Original pandas series: 
eggs        2
apples      6
milk      Yes
bread      No
dtype: object


In [151]:
# deleting elements using drop method (In place)
# inplace=True mutates groceries itself instead of returning a new Series.
# errors='ignore' keeps the cell re-runnable: on a second execution 'apples'
# is already gone, and drop would otherwise raise KeyError.
groceries.drop('apples', inplace=True, errors='ignore')

print(f'We remove apples in place: \n{groceries}')

We remove apples in place: 
eggs       2
milk     Yes
bread     No
dtype: object


## Arithmetic operations on Pandas Series

In [152]:
# Integer-valued Series keyed by fruit name, used for the arithmetic examples
fruits = pd.Series({'apples': 10, 'oranges': 6, 'bananas': 3})

print(fruits)

apples     10
oranges     6
bananas     3
dtype: int64


In [153]:
# Arithmetic with a scalar is applied to every element of the Series

# addition
print(f"fruits + 2: \n{fruits + 2}\n")

# subtraction
print(f"fruits - 2: \n{fruits - 2}\n")

# division (the result is float64 even though the inputs are integers)
print(f"fruits / 2: \n{fruits / 2}\n")

# multiplication
print(f"fruits * 2: \n{fruits * 2}")

fruits + 2: 
apples     12
oranges     8
bananas     5
dtype: int64

fruits - 2: 
apples     8
oranges    4
bananas    1
dtype: int64

fruits / 2: 
apples     5.0
oranges    3.0
bananas    1.5
dtype: float64

fruits * 2: 
apples     20
oranges    12
bananas     6
dtype: int64


In [154]:
# Mathematical functions from numpy
# NumPy functions applied to a Series work element-wise and keep the index labels.
# NOTE(review): this import sits mid-notebook; consider moving it next to the
# pandas import in the first cell.
import numpy as np

# Exponentiation
print(f"Exp(x): \n{np.exp(fruits)}\n")

# Square root
print(f"Sqrt(x): \n{np.sqrt(fruits)}\n")

# Power
print(f"Pow(x,2): \n{np.power(fruits, 2)}")

Exp(x): 
apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

Sqrt(x): 
apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

Pow(x,2): 
apples     100
oranges     36
bananas      9
dtype: int64


# Pandas Dataframe

In [155]:
# creating a simple dataframe
# Each Series becomes one column; rows are aligned on the item labels, and an
# item missing from one person's Series shows up as NaN in that column.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glass': 110, 'bike': 500, 'pants': 45}),
}

print(type(items))

shopping_cart = pd.DataFrame(items)

print('\n')

print(shopping_cart)
print("\n NaN stands for not a number")

<class 'dict'>


         Bob  Alice
bike   245.0  500.0
book     NaN   40.0
glass    NaN  110.0
pants   25.0   45.0
watch   55.0    NaN

 NaN stands for not a number


In [156]:
# creating dataframe without index
# With no explicit index each Series gets default integer labels, and the
# shorter column is padded with NaN where it has no matching row.
data = {
    'Alice': pd.Series([40, 110, 500, 45]),
    'Bob': pd.Series([245, 25, 55]),
}
data_without_index = pd.DataFrame(data)

print(data_without_index)

   Alice    Bob
0     40  245.0
1    110   25.0
2    500   55.0
3     45    NaN


In [157]:
# Restricting a DataFrame at construction time with the columns / index keywords

# keep only Bob's column
bob_shopping_cart = pd.DataFrame(data=items, columns=['Bob'])
print(bob_shopping_cart)
print('\n')

# keep only the rows for the selected items
sel_shopping_cart = pd.DataFrame(data=items, index=['pants', 'book'])
print(sel_shopping_cart)

# keep one column and a chosen set of rows at the same time
print('\n')
alice_shopping_cart = pd.DataFrame(data=items, columns=['Alice'], index=['glass', 'bike'])
print(alice_shopping_cart)

       Bob
bike   245
pants   25
watch   55


        Bob  Alice
pants  25.0     45
book    NaN     40


       Alice
glass    110
bike     500


In [158]:
# Loading data into Pandas dataframe
# The path is relative, so 'diabetes.csv' must be in the notebook's working directory.

diabetes_df = pd.read_csv('diabetes.csv')

# Confirm that read_csv produced a DataFrame
print(type(diabetes_df))

<class 'pandas.core.frame.DataFrame'>


## Analysis of Pandas Dataframe

In [159]:
# to print head of a dataframe (the first five rows when no count is given)
print(diabetes_df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [160]:
# to print tail of a dataframe (the last five rows when no count is given)
print(diabetes_df.tail())

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
763                     0.171   63        0  
764                     0.340   27        0  
765                     0.245   30        0  
766                     0.349   47        1  
767                     0.315   23        0  


In [161]:
# count values column-wise (number of non-null entries in each column)
print(diabetes_df.count())

Pregnancies                 768
Glucose                     768
BloodPressure               768
SkinThickness               768
Insulin                     768
BMI                         768
DiabetesPedigreeFunction    768
Age                         768
Outcome                     768
dtype: int64


In [162]:
# count values based on labels
# (number of rows for each distinct value of the 'Outcome' column)

print(diabetes_df.value_counts('Outcome'))

Outcome
0    500
1    268
dtype: int64


In [163]:
# to print info of a dataframe
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [164]:
# to check shape: a (rows, columns) tuple
print(f"The shape is: {diabetes_df.shape}")

The shape is: (768, 9)


In [165]:
# to check data types (one dtype per column)
print(f"The data types are: \n{diabetes_df.dtypes}")

The data types are: 
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


In [166]:
# to check dimension (number of axes of the dataframe)
print(f"The dimension is: {diabetes_df.ndim}")


The dimension is: 2


In [167]:
# to check size (total number of cells, i.e. rows x columns)
print(f"The size is: {diabetes_df.size}")


The size is: 6912


In [168]:
# to get only values (the data as a 2-D array, without row or column labels)
# NOTE(review): this cell inspects shopping_cart rather than diabetes_df, unlike
# the neighbouring cells; confirm that is intended.
print(f"The values are: \n{shopping_cart.values}")

The values are: 
[[245. 500.]
 [ nan  40.]
 [ nan 110.]
 [ 25.  45.]
 [ 55.  nan]]


In [169]:
# to get only index (the row labels of the dataframe)
print(f"The index values are: {diabetes_df.index}")

The index values are: RangeIndex(start=0, stop=768, step=1)


In [170]:
# to get only columns (the column labels of the dataframe)
print(f"The columns are: \n{diabetes_df.columns}")

The columns are: 
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [171]:
# number of unique values in each column
print(diabetes_df.nunique())

Pregnancies                  17
Glucose                     136
BloodPressure                47
SkinThickness                51
Insulin                     186
BMI                         248
DiabetesPedigreeFunction    517
Age                          52
Outcome                       2
dtype: int64


In [172]:
# to check mean value - column wise (one mean per column)

print(diabetes_df.mean())

Pregnancies                   3.845052
Glucose                     120.894531
BloodPressure                69.105469
SkinThickness                20.536458
Insulin                      79.799479
BMI                          31.992578
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64


In [173]:
# to check standard deviation - column wise (one value per column)

print(diabetes_df.std())

Pregnancies                   3.369578
Glucose                      31.972618
BloodPressure                19.355807
SkinThickness                15.952218
Insulin                     115.244002
BMI                           7.884160
DiabetesPedigreeFunction      0.331329
Age                          11.760232
Outcome                       0.476951
dtype: float64


In [174]:
# minimum value of each column

print(diabetes_df.min())

Pregnancies                  0.000
Glucose                      0.000
BloodPressure                0.000
SkinThickness                0.000
Insulin                      0.000
BMI                          0.000
DiabetesPedigreeFunction     0.078
Age                         21.000
Outcome                      0.000
dtype: float64


In [175]:
# maximum value of each column

print(diabetes_df.max())

Pregnancies                  17.00
Glucose                     199.00
BloodPressure               122.00
SkinThickness                99.00
Insulin                     846.00
BMI                          67.10
DiabetesPedigreeFunction      2.42
Age                          81.00
Outcome                       1.00
dtype: float64


In [176]:
# to print descriptive statistics of a dataframe
# (count, mean, std, min, quartiles and max for each column)
print(diabetes_df.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [177]:
# to check how the columns of a dataframe are correlated with each other
# (corr() returns the pairwise correlation matrix of the columns)
print(diabetes_df.corr())

                          Pregnancies   Glucose  BloodPressure  SkinThickness  \
Pregnancies                  1.000000  0.129459       0.141282      -0.081672   
Glucose                      0.129459  1.000000       0.152590       0.057328   
BloodPressure                0.141282  0.152590       1.000000       0.207371   
SkinThickness               -0.081672  0.057328       0.207371       1.000000   
Insulin                     -0.073535  0.331357       0.088933       0.436783   
BMI                          0.017683  0.221071       0.281805       0.392573   
DiabetesPedigreeFunction    -0.033523  0.137337       0.041265       0.183928   
Age                          0.544341  0.263514       0.239528      -0.113970   
Outcome                      0.221898  0.466581       0.065068       0.074752   

                           Insulin       BMI  DiabetesPedigreeFunction  \
Pregnancies              -0.073535  0.017683                 -0.033523   
Glucose                   0.331357  0.221

In [178]:
# to find missing values
# isnull().any() gives one boolean per column: True if that column has any missing value
print(diabetes_df.isnull().any())

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool


In [179]:
# to print info of a dataframe (the non-null counts also reveal missing values)
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [180]:
# to find number of missing values (count of nulls in each column)
print(diabetes_df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [181]:
# Mean of every other column, computed separately for each value of 'Outcome'
print(diabetes_df.groupby('Outcome').mean())

         Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
Outcome                                                                      
0           3.298000  109.980000      68.184000      19.664000   68.792000   
1           4.865672  141.257463      70.824627      22.164179  100.335821   

               BMI  DiabetesPedigreeFunction        Age  
Outcome                                                  
0        30.304200                  0.429734  31.190000  
1        35.142537                  0.550500  37.067164  


## Accessing, Deleting & Modifying Pandas Dataframe

In [182]:
# Two stores stocking the same four products; each Series becomes a column
store = {
    'store1': pd.Series({'bikes': 155, 'pants': 10, 'watches': 2, 'glasses': 3}),
    'store2': pd.Series({'bikes': 200, 'pants': 15, 'watches': 7, 'glasses': 1}),
}

store_df = pd.DataFrame(store)
print(store_df)

         store1  store2
bikes       155     200
pants        10      15
watches       2       7
glasses       3       1


In [183]:
# Accessing column in a dataframe
# (indexing with a single column name returns that column as a Series)

print(store_df['store1'])

bikes      155
pants       10
watches      2
glasses      3
Name: store1, dtype: int64


In [184]:
# Accessing row in a dataframe
# By label with .loc; the list selector keeps the result a one-row DataFrame
print(store_df.loc[['watches']])

# By zero-based position with .iloc; position 3 is the 'glasses' row
print(store_df.iloc[[3]])

         store1  store2
watches       2       7
         store1  store2
glasses       3       1


In [185]:
# Accessing row of a column in a dataframe
# List selectors on both axes keep the result a 1x1 DataFrame rather than a scalar
store_df.loc[['watches'],['store1']]

Unnamed: 0,store1
watches,2


In [186]:
# Updating values based on labels
# Show the frame before the change
print(store_df)

# Overwrite the 'watches' row in place: the two values go to store1 and store2
store_df.loc[['watches']] = [4, 5]
print("\n")
# Show the frame after the change
print(store_df)

         store1  store2
bikes       155     200
pants        10      15
watches       2       7
glasses       3       1


         store1  store2
bikes       155     200
pants        10      15
watches       4       5
glasses       3       1


In [187]:
# drop rows or columns
# Both calls return a new frame and leave store_df itself unchanged

# axis = 0 drops by row label
print(store_df.drop(['bikes'], axis = 0))

# axis = 1 drops by column label
print(store_df.drop(['store1'], axis = 1))

         store1  store2
pants        10      15
watches       4       5
glasses       3       1
         store2
bikes       200
pants        15
watches       5
glasses       1


In [188]:
# renaming columns
# rename() returns a renamed copy; the result is only displayed here, not
# assigned, so store_df keeps its 'store2' column name afterwards

store_df.rename(columns={'store2':'Spar Mall'})

Unnamed: 0,store1,Spar Mall
bikes,155,200
pants,10,15
watches,4,5
glasses,3,1


In [189]:
# Correlation (pairwise correlation matrix between the store columns)
store_df.corr()

Unnamed: 0,store1,store2
store1,1.0,0.999781
store2,0.999781,1.0


## Dealing with NaN

In [190]:
# to count total number of NaN values
# The first sum() counts nulls per column; the second adds those counts into one total
x = diabetes_df.isnull().sum().sum()
print(x)

0


In [191]:
# to drop NaN values (axis = 0/1 for row/column)
# The result is only displayed, not assigned, so diabetes_df is left as it was
diabetes_df.dropna(axis=0)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [192]:
# Replace NaN with some other value
# The result is only displayed, not assigned, so diabetes_df is left as it was
diabetes_df.fillna(0)

# We can use different methods to fill NaN values
# df.fillna(method='ffill', axis=0/1) : fills each NaN with the previous value
# df.fillna(method='backfill', axis=0/1) : fills each NaN with the value that comes next
# NOTE(review): newer pandas releases deprecate the `method=` argument in favour
# of df.ffill() / df.bfill(); confirm against the installed version.

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
