In [3]:
import pandas as pd

In [2]:
# Column-oriented sample data: each key is a column name and each list
# holds that column's values, one entry per row.
myDataSet = dict(
    Name=['Arya', 'Dustin', 'Ron'],
    Age=[14, 12, 14],
)

In [3]:
# Turn the dict of columns into a DataFrame and show it.
df = pd.DataFrame(data=myDataSet)
print(df)

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14


Series

In [29]:
# Series built from a list: pandas assigns the default integer labels 0, 1, 2.
x = [10, 20, 30]
s = pd.Series(data=x)
# Show the whole Series first, then the element stored under label 0.
print(s, s[0], sep="\n")

0    10
1    20
2    30
dtype: int64
10


In [12]:
# Same values, but with explicit string labels instead of the default 0..2 index.
x = [10, 20, 30]
labels = ["x", "y", "z"]
s = pd.Series(data=x, index=labels)
print(s)

x    10
y    20
z    30
dtype: int64


In [30]:
# Series built from a dict: the keys become the index labels.
x = dict(day1=10, day2=20, day3=30)
s = pd.Series(data=x)
print(s)

day1    10
day2    20
day3    30
dtype: int64


DataFrame

In [4]:
# Eight-row sample frame. The last Name is None, which gives the frame one
# missing value.
data = dict(
    Name=['Arya', 'Dustin', 'Ron', 'Emma', 'Harry', 'El', 'Steve', None],
    Age=[14, 12, 14, 13, 11, 12, 20, 20],
)
df = pd.DataFrame(data=data)
print(df)

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11
5      El   12
6   Steve   20
7    None   20


In [18]:
# .loc with a single label returns that one row (shown as a Series of its column values).
first_row = df.loc[0]
print(first_row)

Name    Arya
Age       14
Name: 0, dtype: object


In [19]:
# Passing a list of labels (hence the double square brackets) returns those rows as a DataFrame.
wanted_labels = [0, 1]
print(df.loc[wanted_labels])

     Name  Age
0    Arya   14
1  Dustin   12


In [25]:
# head() returns the column headers plus the first n rows, counted from the top.
# n defaults to 5; it is spelled out here for clarity.
print(df.head(n=5))

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11


In [42]:
# tail() returns the last n rows, counted from the bottom (n defaults to 5).
print(df.tail(n=5))

    Name  Age
3   Emma   13
4  Harry   11
5     El   12
6  Steve   20
7   None   20


In [28]:
# info() summarises the data set: the index range, each column's name and
# dtype, and how many non-null values each column holds.
# It prints the report itself and returns None, so it is called directly;
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    7 non-null      object
 1   Age     7 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 244.0+ bytes
None


Cleaning Data

Cleaning Empty Cells

In [48]:
# dropna() builds a new DataFrame without the rows that contain empty cells;
# df itself is left unchanged.
r = df.dropna(axis=0, how='any')
# to_string() renders the entire DataFrame, with no truncation.
cleaned_text = r.to_string()
print(cleaned_text)

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11
5      El   12
6   Steve   20


In [45]:
# Render the whole frame as text, without truncation, and print it.
full_text = df.to_string()
print(full_text)
      

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11
5      El   12
6   Steve   20
7    None   20


In [49]:
# inplace=True removes every row containing a NULL value from the original
# DataFrame. In that mode dropna() returns None, so there is nothing useful
# to assign and the return value is deliberately not captured.
df.dropna(inplace=True)
print(df.to_string())

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11
5      El   12
6   Steve   20


In [None]:
 # Remove rows with a NULL value in the "Name" column
# Only the "Name" column is checked for NULLs. With inplace=True the call
# mutates df and returns None, so the return value is not assigned.
df.dropna(subset=["Name"], inplace=True)

In [None]:
# fillna() replaces every empty cell with the given value;
# inplace=True applies the change to df itself.
df.fillna(value='Unknown', inplace=True)

In [None]:
# Replace empty cells in one specific column.
# The filled column is assigned back instead of calling
# fillna(inplace=True) on df["Name"]: the selected column can be a temporary
# object, so an in-place fill on it is not guaranteed to reach df (pandas
# warns about this chained assignment, and with Copy-on-Write it never
# updates df).
df["Name"] = df["Name"].fillna('Unknown')

In [51]:
# columns holds the labels of all columns in a DataFrame
df.columns

Index(['Name', 'Age'], dtype='object')

In [None]:
# mean() -> the average value (the sum of all values divided by the number of values)
x = df['Age'].mean()
# Assign the filled column back rather than using inplace=True on the
# selected column: an in-place fill on df['Age'] is a chained assignment
# that is not guaranteed to update df.
df['Age'] = df['Age'].fillna(x)

In [None]:
# median() -> the value in the middle once all values are sorted in ascending order
x = df['Age'].median()
# Assign the filled column back rather than using inplace=True on the
# selected column: an in-place fill on df['Age'] is a chained assignment
# that is not guaranteed to update df.
df['Age'] = df['Age'].fillna(x)

In [None]:
# mode() -> the value that appears most frequently.
# mode() returns a Series because several values can tie for most frequent;
# [0] picks the first of them.
x = df['Age'].mode()[0]
# Assign the filled column back rather than using inplace=True on the
# selected column: an in-place fill on df['Age'] is a chained assignment
# that is not guaranteed to update df.
df['Age'] = df['Age'].fillna(x)

Cleaning Wrong Data

In [53]:
# Overwrite a single cell: the row labelled 6, column 'Age', becomes 19.
row_label, column_name = 6, 'Age'
df.loc[row_label, column_name] = 19
print(df)

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11
5      El   12
6   Steve   19


In [55]:
# Cap every Age above 15 at 15. A boolean mask selects all matching rows and
# updates them in one vectorised assignment, instead of visiting the index
# label by label in a Python loop (which also breaks if a label is repeated,
# because df.loc[label, 'Age'] then yields several values).
df.loc[df['Age'] > 15, 'Age'] = 15
print(df)

     Name  Age
0    Arya   14
1  Dustin   12
2     Ron   14
3    Emma   13
4   Harry   11
5      El   12
6   Steve   15


In [None]:
# Remove every row whose Age is above 15. The labels of all offending rows
# are collected first and removed with a single drop() call, rather than
# dropping from df while looping over its index.
# drop() removes rows (or, with columns=..., columns) by label, not by position.
too_old = df.index[df['Age'] > 15]
df.drop(index=too_old, inplace=True)

Removing Duplicates

In [None]:
# Flag repeated rows with one boolean per row of df.
df.duplicated()   # True for every row that duplicates another row, otherwise False

In [None]:
# Remove repeated rows from df itself, keeping the first occurrence of each
# (keep='first' is the default, written out here for clarity).
df.drop_duplicates(keep='first', inplace=True)

Other methods

In [7]:
# Total of all values in the Age column.
x = df.loc[:, 'Age'].sum()
print(x)

116


In [10]:
# Sorted copy of df with the highest Age first; without inplace=True,
# df itself keeps its current row order.
x = df.sort_values('Age', ascending=False)
print(x)

     Name  Age
6   Steve   20
7    None   20
0    Arya   14
2     Ron   14
3    Emma   13
1  Dustin   12
5      El   12
4   Harry   11


In [11]:
# Add a new column to the existing df by assigning a list of values.
# The list needs exactly one entry per row of df (8 values here).
genders = ['M', 'M', 'M', 'F', 'M', 'F', 'M', 'F']
df['Gender'] = genders

In [12]:
# Show the current contents of df.
print(df)

     Name  Age Gender
0    Arya   14      M
1  Dustin   12      M
2     Ron   14      M
3    Emma   13      F
4   Harry   11      M
5      El   12      F
6   Steve   20      M
7    None   20      F


In [26]:
# Add a new row to the existing df.
new_row = ['Lexi', 11, 'F']
# Choose a label that is guaranteed to be unused. len(df) is only safe while
# the index is exactly 0..n-1; once rows have been dropped it can equal an
# existing label, and .loc would then overwrite that row instead of adding
# one. Assumes integer index labels, as used throughout this notebook.
next_label = int(df.index.max()) + 1 if len(df) else 0
df.loc[next_label] = new_row
# Note: every re-run of this cell appends another copy of the row.
print(df)

      Name  Age Gender
0     Arya   14      M
1   Dustin   12      M
2      Ron   14      M
3     Emma   13      F
4    Harry   11      M
5       El   12      F
6    Steve   20      M
7     None   20      F
8     Lexi   11      F
9     Lexi   11      F
10    Lexi   11      F


In [27]:
# len() of a DataFrame is its number of rows.
row_count = len(df)
print(row_count)

11


In [30]:
# shape is a (number of rows, number of columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(11, 3)


In [19]:
# groupby() on its own only builds a GroupBy object; printing it shows the
# object's repr rather than any data.
x = df.groupby(['Gender'], as_index=False)
print(x)
# To see meaningful output, apply an aggregation or transformation to the
# grouped data. sum() adds up Age per gender and, for the text column Name,
# joins the strings together.
x = x.sum()
print(x)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E851829A00>
  Gender                     Name  Age
0      F                   EmmaEl   45
1      M  AryaDustinRonHarrySteve   71


In [23]:
# as_index defaults to True, so the group keys (Gender) become the index of the result.
by_gender = df.groupby('Gender')
x = by_gender.sum()
print(x)

                           Name  Age
Gender                              
F                        EmmaEl   45
M       AryaDustinRonHarrySteve   71


In [21]:
# Show only the Gender and Age columns of the per-gender totals.
# Age is selected before aggregating, so sum() never has to process the text
# column Name (which it would otherwise concatenate, only for the result to
# be thrown away). as_index=False keeps Gender as a regular column.
selected_col = df.groupby('Gender', as_index=False)['Age'].sum()
print(selected_col)


  Gender  Age
0      F   45
1      M   71
