In [2]:
# Grade values kept in a plain Python list.
# A list is good for general data manipulation, but it is not optimized
# for numerical analysis.
data = [
    50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82,
    62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64,
]
print(data)

[50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82, 62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64]


In [4]:
import numpy as np

# A second set of data for the same students: the number of hours each one
# spent studying.
study_hours = [
    10.0, 11.5, 9.0, 16.0, 9.25, 1.0, 11.5, 9.0, 8.5, 14.5, 15.5,
    13.75, 9.0, 8.0, 15.5, 8.0, 9.0, 6.0, 10.0, 12.0, 12.5, 12.0,
]

# Turn the plain list of grades into a NumPy array
grades = np.array(data)

# Combine both sequences into one 2D array (an array of arrays):
# row 0 holds the study hours, row 1 holds the grades
student_data = np.array([study_hours, grades])

# Show the resulting array
student_data

array([[10.  , 11.5 ,  9.  , 16.  ,  9.25,  1.  , 11.5 ,  9.  ,  8.5 ,
        14.5 , 15.5 , 13.75,  9.  ,  8.  , 15.5 ,  8.  ,  9.  ,  6.  ,
        10.  , 12.  , 12.5 , 12.  ],
       [50.  , 50.  , 47.  , 97.  , 49.  ,  3.  , 53.  , 42.  , 26.  ,
        74.  , 82.  , 62.  , 37.  , 15.  , 70.  , 27.  , 36.  , 35.  ,
        48.  , 52.  , 63.  , 64.  ]])

In [5]:
import pandas as pd

# Build a DataFrame with a Name column plus the two rows of student_data:
# row 0 becomes StudyHours and row 1 becomes Grade.
df_students = pd.DataFrame(
    {
        'Name': [
            'Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky',
            'Frederic', 'Jimmie', 'Rhonda', 'Giovanni', 'Francesca',
            'Rajab', 'Naiyana', 'Kian', 'Jenny', 'Jakeem', 'Helena',
            'Ismat', 'Anila', 'Skye', 'Daniel', 'Aisha',
        ],
        'StudyHours': student_data[0],
        'Grade': student_data[1],
    }
)

# Display the DataFrame
df_students

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0
6,Frederic,11.5,53.0
7,Jimmie,9.0,42.0
8,Rhonda,8.5,26.0
9,Giovanni,14.5,74.0


# Finding and Filtering Data


In [11]:
# Get the data for index value 5
# (loc looks a row up by its index label, so this returns the row labelled 5)
df_students.loc[5]



Name          Vicky
StudyHours        1
Grade             3
Name: 5, dtype: object

In [10]:
# Get the rows with index values from 0 to 5
df_students.loc[0:5]
# loc locates data items based on index labels rather than positions, and a
# label slice includes both end points: rows labelled 0, 1, 2, 3, 4 and 5 (six rows)

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0


In [13]:
# Get the data in the first five rows
df_students.iloc[0:5]
# Notice that iloc returns the rows at the positions in the range 0 to 5,
# and since integer ranges don't include the upper-bound value, this covers positions 0, 1, 2, 3, and 4 (five rows)

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0


In [16]:
df_students.iloc[0,[1,2]]

# iloc identifies data values in a DataFrame by position, and this extends beyond rows to columns.
# Here it finds the values for the columns in positions 1 and 2 (StudyHours and Grade) in row 0.

StudyHours    10
Grade         50
Name: 0, dtype: object

In [21]:
# loc is used to locate data items based on index values rather than positions.
# In the absence of an explicit index column, the rows in our dataframe are indexed as integer values,
# but the columns are identified by name, so this reads the 'Grade' value of the row labelled 0.

df_students.loc[0,'Grade']

50.0

In [22]:
# Use dataframe filtering: keep the rows whose Name value equals 'Aisha'
df_students.loc[df_students['Name']=='Aisha']


# Alternative: a DataFrame query expression
#df_students.query('Name=="Aisha"')

# Alternative: refer to the column as an attribute (df_students.Name)
# instead of as a named index value (df_students['Name'])
#df_students[df_students.Name == 'Aisha']

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


# Loading a DataFrame from a file


In [26]:
# Load the student records from grades.csv (comma-delimited, header row
# inferred). This rebinds df_students, replacing the DataFrame built by hand.
df_students = pd.read_csv(
    'grades.csv',
    delimiter=',',
    header='infer',
)

# Preview the first rows of the loaded data
df_students.head()

# Alternative: print a per-column summary (non-null counts and dtypes) instead.
# NOTE(review): the output saved under this cell looks like info() output
# rather than head() output - re-run the cell to refresh it.
#df_students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        24 non-null     object 
 1   StudyHours  23 non-null     float64
 2   Grade       22 non-null     float64
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [24]:
# You can use the isnull method to identify which individual values are null:
# the result holds True where a value is missing and False everywhere else
df_students.isnull()

Unnamed: 0,Name,StudyHours,Grade
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [27]:
# Summing the True/False result of isnull gives the number of missing values in each column
df_students.isnull().sum()

Name          0
StudyHours    1
Grade         2
dtype: int64

In [28]:
# To see the missing values in context, keep only the rows in which at least
# one of the columns (axis 1 of the DataFrame) is null.
df_students.loc[df_students.isnull().any(axis=1)]


Unnamed: 0,Name,StudyHours,Grade
22,Bill,8.0,
23,Ted,,
