### This is an introductory notebook for the pandas library and DataFrames. In this notebook you will become familiar with:
- How to store and manipulate single dimensional indexed data in the Series object
- Querying a Series
- DataFrame Data Structure



In [1]:
import pandas as pd 


In [2]:
# Three student names held in an ordinary Python list
students = ["Alice", "Jack", "Molly"]

# Hand the list to pd.Series; pandas supplies the integer index automatically
pd.Series(students)


0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
# A Series pairs every value with an index entry, so the display has two
# columns: the integer index on the left and the data on the right. For a list
# of strings the dtype is object.

# The values do not have to be strings. With whole numbers the dtype is
# int64 instead. The pandas library is built on top of the NumPy library.

# A small list of integers
numbers = [1, 2, 3]

pd.Series(numbers)


0    1
1    2
2    3
dtype: int64

In [4]:
# How pandas and NumPy deal with missing data

# Python uses None to signal that a value is absent. When a list of strings
# contains a None, pandas keeps it as None and stores the data with dtype object.

# None versus NaN: NaN can be used as a numerical value,
# whereas None means "nonexistent" or "empty".

students = ["Alice", "Jack", None]
pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [5]:
# NaN example: a None inside a list of numbers is shown as NaN and the dtype becomes float64
numbers = [1, 2, None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
# pandas represents NaN as a floating-point number, and because integers can be
# typecast to floats, pandas went ahead and converted our integers to floats.

# NaN is not equivalent to None: comparing the two with == evaluates to False.


import numpy as np

np.nan == None

False

In [7]:
# NaN is not even equal to itself, so == can never be used to detect it;
# use pd.isna (or np.isnan) to test for missing values instead.
np.nan == np.nan

False

In [10]:
# More about pandas Series:
# a Series can be created directly from dictionary data; the keys become the index.

students_scores = dict(Alice="Physics", Jack="Chemistry", Molly="English")

s = pd.Series(students_scores)
print(s)

# Show the index that was built from the dictionary keys
print(s.index)

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object
Index(['Alice', 'Jack', 'Molly'], dtype='object')


In [12]:
# More complex data: a list of (first name, last name) tuples.
# Each tuple is stored as a single element of the Series.
students = [("Alice", "Brown"), ("Jack", "White"), ("Molly", "Green")]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [14]:
# The index can also be created separately from the data by passing it
# explicitly, as a list, through the index argument of the Series
s = pd.Series(
    data=["Physics", "Chemistry", "English"],
    index=["Alice", "Jack", "Molly"],
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [15]:
# What happens when the labels passed as the index do not line up with the
# keys of the dictionary used to create the Series?
# pandas lets the explicit index win and uses exactly the labels you provided:
# dictionary keys that are not in your index are ignored, and any index label
# that is not a key of the dictionary gets a None or NaN value.

# Example
students_scores = {
    "Alice": "Physics",
    "Jack": "Chemistry",
    "Molly": "English",
}

s = pd.Series(students_scores, index=["Alice", "Molly", "Sam"])

print(s)

Alice    Physics
Molly    English
Sam          NaN
dtype: object


### Querying a Series

In [16]:
# A pandas Series can be queried either by index position or by index label.
# Label-based queries go through the loc attribute.

students_scores = {
    "Alice": "Physics",
    "Jack": "Chemistry",
    "Molly": "English",
    "Sam": "History",
}

s = pd.Series(students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [17]:
# To query the fourth entry by position (positions start at 0, so it is position 3):
s.iloc[3]

'History'

In [18]:
# To query Molly's entry by its index label:
s.loc['Molly']

'English'

In [19]:
# iloc and loc are attributes, not methods, so they are queried with square
# brackets (the indexing operator) rather than parentheses.

# The indexing operator can also be applied to the Series itself to look up a
# label. Older pandas versions additionally let an integer such as s[3] fall
# back to a positional lookup when the index is not made of integers, but that
# fallback is deprecated, so positions are queried through iloc.

print(s.iloc[3])
print(s['Molly'])

History
English


In [20]:
# What happens if the index itself is a list of integers? Then it is best to
# use loc and iloc explicitly.

# Example where classes are indexed by class codes
class_code = {
    99: 'Physics',
    100: 'Chemistry',
    101: 'English',
    102: 'History',
}

s = pd.Series(class_code)

# s[0] is looked up as the label 0, not as the first position. No class has
# code 0, so the lookup raises KeyError; catch it and show the message so the
# cells below still run under Restart & Run All.
try:
    s[0]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 0

In [22]:
# But iloc queries by position, so position 1 is the second entry (class code 100)
s.iloc[1]


'Chemistry'

In [23]:
# Getting data out of a Series.
# One approach is to walk over the items one at a time and apply the
# operation of interest to each; here the grades are added up and averaged.

grades = pd.Series([90, 80, 70, 60])

total = 0
for grade in grades:
    total = total + grade
print(total / len(grades))

75.0


In [24]:
# The loop works, but iterating element by element is slow on large data.
# It is better to use vectorization: np.sum adds up the whole Series in one call.

total = np.sum(grades)
print(total/len(grades))

75.0


In [31]:
# Combining Series.
# pd.concat stacks the Series it is given, in order, into one new Series.
# Index labels do not have to be unique: 'Kelly' appears twice below,
# once for each of her classes.

students_classes = pd.Series({'Alice': 'Physics',
                              'Jack': 'Chemistry',
                              'Molly': 'English',
                              'Sam': 'History'})

kelly_classes = pd.Series(['Philosophy', 'Arts'], index=['Kelly', 'Kelly'])
all_students = pd.concat([students_classes, kelly_classes])
print(all_students)


Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
dtype: object


## DataFrame Data Structure

In [33]:
# Example: three student records, each stored as its own Series whose index
# holds the field names
record1 = pd.Series({"Name": "Alice", "CLass": "Physics", "Score": 85})
record2 = pd.Series({"Name": "Jack", "CLass": "Chemistry", "Score": 82})
record3 = pd.Series({"Name": "Helen", "CLass": "Biology", "Score": 90})

In [34]:
# Like a Series, the DataFrame object is indexed. Each record becomes one row,
# and the index list supplies the row labels (a label may repeat).
df = pd.DataFrame([record1, record2, record3], index=['School1', 'School2', 'School1'])

df.head()

Unnamed: 0,Name,CLass,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [36]:
# head() displays the index, as for a Series, together with the rest of the content.

# The same DataFrame can also be built in an alternative way,
# from a list of dictionaries with one dictionary per row.

students = [
    {"Name": "Alice", "CLass": "Physics", "Score": 85},
    {"Name": "Jack", "CLass": "Chemistry", "Score": 82},
    {"Name": "Helen", "CLass": "Biology", "Score": 90},
]

# Pass the list of dictionaries to the DataFrame constructor along with the row labels
df = pd.DataFrame(students, index=["school1", "school2", "school1"])
df.head()

Unnamed: 0,Name,CLass,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [39]:
# To select the data associated with school2, query the .loc attribute
# with one parameter: the row label
print(df.loc['school2'])

# Check the data type of the result: the single matching row comes back as a Series
type(df.loc['school2'])

Name          Jack
CLass    Chemistry
Score           82
Name: school2, dtype: object


pandas.core.series.Series

In [40]:
# Let's try this with school1. Two rows carry that label, so both are returned.
print(df.loc['school1'])

          Name    CLass  Score
school1  Alice  Physics     85
school1  Helen  Biology     90


In [41]:
# If we are only interested in school1's student names, pass the row label
# and the column label together
df.loc['school1', 'Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [43]:
# What do we do if we want to select a single column instead?
# There are multiple ways. First, we could transpose the DataFrame with .T,
# which pivots all of the rows into columns and all of the columns into rows.
df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Jack,Helen
CLass,Physics,Chemistry,Biology
Score,85,82,90


In [44]:
# Then .loc on the transpose selects the 'Name' row, i.e. the student names only
df.T.loc['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [47]:
# A column can also be selected by putting its label in square brackets
# directly on the DataFrame
print(df['Name'])

# df.loc['Name'] does not work here: a single label given to .loc is looked up
# among the row labels, and 'Name' is a column label, not a row label.

type(df['Name'])

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object


pandas.core.series.Series

In [50]:
# You can specify the rows you want and then the specific column requested.
# This chains two indexing operations: .loc['school1'] first, then ['Name'] on its result.
print(df.loc['school1']['Name'])  # This yields a Series
print(df.loc['school1']) # This yields a DataFrame

school1    Alice
school1    Helen
Name: Name, dtype: object
          Name    CLass  Score
school1  Alice  Physics     85
school1  Helen  Biology     90


In [53]:
# If we want to select all rows, we can use a colon to indicate a full slice

# Example: ask for the names and scores of all schools
print(df.loc[:,['Name', 'Score']]) # rows, columns

          Name  Score
school1  Alice     85
school2   Jack     82
school1  Helen     90


In [57]:
# To remove data from a DataFrame, use its drop method. Called with just a
# label, it drops the rows carrying that index label.

df.drop('school1') # drop rows with school1

Unnamed: 0,Name,CLass,Score
school2,Jack,Chemistry,82


In [58]:
# The original DataFrame still has every row: drop returned a copy instead of changing df
df

Unnamed: 0,Name,CLass,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [60]:
# drop has two optional parameters of interest. The first is called inplace:
# if it is set to True, the DataFrame is updated in place instead of a copy being returned.
# The second parameter is axis, which selects what is dropped. By default this value is 0,
# indicating the row axis; to drop a column, set axis to 1.

copy_df = df.copy()

# Drop the Name column from the copy
copy_df.drop('Name', inplace=True, axis=1)
copy_df

Unnamed: 0,CLass,Score
school1,Physics,85
school2,Chemistry,82
school1,Biology,90


In [63]:
# A second way to drop a column is directly through the indexing operator,
# using the del keyword. This way of dropping data takes immediate effect
# on the DataFrame and does not return anything.
del copy_df['CLass']
copy_df

Unnamed: 0,Score
school1,85
school2,82
school1,90


In [64]:
# To add a column, assign to a new column label with the indexing operator.
# Assigning None gives every row the same empty value.

df['ClassRanking'] = None
df

Unnamed: 0,Name,CLass,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school1,Helen,Biology,90,
