## Series Data Structure

In [None]:
import pandas as pd

In [None]:
students = ['Gwyn', 'Jonas', 'Hillary', 'Cecilia']
pd.Series(students)

0       Gwyn
1      Jonas
2    Hillary
3    Cecilia
dtype: object

In [None]:
numbers= [1, 2, 3, 4, 5]

pd.Series(numbers)

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [None]:
# A list of strings with one missing entry. As the output shows, the None is
# kept as None and the Series has dtype object.
students = ['Gwyn', 'Jonas', None]

pd.Series(students)

0     Gwyn
1    Jonas
2     None
dtype: object

In [None]:
# A list of numbers with one missing entry. As the output shows, the None is
# displayed as NaN and the whole Series is float64 rather than int64.
numbers = [1, 2, None]

pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [None]:
import numpy as np

np.nan == None

False

In [None]:
np.nan == np.nan

False

In [None]:
np.isnan(np.nan)

True

In [None]:
student_scores = {'Gwyn' : 'Physics',
                  'Jonas' : 'Math',
                  'Potato': 'English'}
s = pd.Series(student_scores)

print(s)

Gwyn      Physics
Jonas        Math
Potato    English
dtype: object


In [None]:
students = [("Gwyn", "Hotdog"),
            ("Jonas", "Burger"),
            ("Red", "Fries")]
pd.Series(students)

0     (Gwyn, Hotdog)
1    (Jonas, Burger)
2       (Red, Fries)
dtype: object

In [None]:
s = pd.Series(['Physics', 'Math', 'English'],
              index = ['Alice', 'Jack', 'Molly'])
s

Alice    Physics
Jack        Math
Molly    English
dtype: object

In [None]:
# Build a Series from a dict while also passing an explicit index.
# Per the printed result, only the listed labels appear: 'Jonas' is left out,
# and 'Alice', which has no entry in the dict, shows NaN.
student_scores = {'Gwyn' : 'Physics',
                  'Jonas' : 'Math',
                  'Potato': 'English'}
s = pd.Series(student_scores, index = ['Alice', 'Gwyn', 'Potato'])

print(s)

Alice         NaN
Gwyn      Physics
Potato    English
dtype: object


## Querying a Series


In [None]:
import pandas as pd

students_classes = {'Jack': 'Python',
                    'Jonas' : 'Java',
                   'Bear' : 'Dart',
                    'Josh' : 'Ruby'}
s = pd.Series(students_classes)

print(s)

Jack     Python
Jonas      Java
Bear       Dart
Josh       Ruby
dtype: object


In [None]:
# To see the fourth entry, use the iloc attribute with position 3 (positions start at 0).
s.iloc[3]

'Ruby'

In [None]:
s.loc['Bear']

'Dart'

In [None]:
# iloc and loc are attributes, not methods, so query them with square brackets rather than parentheses.
# The square brackets '[]' are called the indexing operator.

In [None]:
# The indexing operator with an integer on this string-labelled Series returns
# the entry at position 2 ('Dart' in the output).
# NOTE(review): recent pandas releases reportedly deprecate this positional
# fallback; s.iloc[2] is the explicit form — confirm against the installed version.
s[2]

'Dart'

In [None]:
s['Jonas']

'Java'

In [None]:
# sample using class and their classcode information, where classes are indexed by classcode, in the form of integers

class_code = {99: 'Java',
              100: 'Python',
              101: 'JavaScript',
              102: 'C#',
              103: 'Assembly'}
s = pd.Series(class_code)

print(s)

99           Java
100        Python
101    JavaScript
102            C#
103      Assembly
dtype: object


In [None]:
# s[0] raises KeyError: the index here is made of integer class codes (99-103),
# so 0 is looked up as a label, and no entry carries the label 0.

s[0]

KeyError: 0

In [None]:
s.loc[100]

'Python'

In [None]:
# Five sample grades used to compare ways of computing an average.
grades = pd.Series([90, 80, 70, 100, 60])

# Manual approach: accumulate each value in a Python loop, then divide by the count.
total = 0
for grade in grades:
  total += grade
print(total/len(grades))

80.0


In [None]:
import numpy as np

total = np.sum(grades)
print(total/len(grades))

80.0


In [None]:
numbers= pd.Series(np.random.randint(0,1000,10000))

numbers.head()

0    254
1    689
2    346
3    634
4    495
dtype: int64

In [None]:
len(numbers)

In [None]:
%%timeit -n 100
# Baseline for the speed comparison: average the 10,000-element `numbers`
# Series with a plain Python loop, so the timing is measured on the same data
# as the np.sum(numbers) cell that follows.
total = 0
for number in numbers:
  total += number
total/len(numbers)

3.42 µs ± 176 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

60.8 µs ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
numbers.head()

0    254
1    689
2    346
3    634
4    495
dtype: int64

In [None]:
numbers += 2
numbers.head()

0    256
1    691
2    348
3    636
4    497
dtype: int64

In [None]:
# Slow, label-by-label update kept for comparison with the vectorised
# `numbers += 2` form. Series.iteritems() was removed in pandas 2.0 and
# Series.set_value() in pandas 1.0, so iterate with .items() and write each
# scalar through the .at accessor instead.
for label, value in numbers.items():
  numbers.at[label] = value + 2
numbers.head()

AttributeError: 'Series' object has no attribute 'iteritems'

In [None]:
%%timeit -n 10
# Fresh 1,000-element Series of random integers in [0, 1000) for each timing run.
s = pd.Series(np.random.randint(0, 1000, 1000))

# Add 2 to every element one label at a time (the slow path being timed).
# .items() replaces Series.iteritems(), which was removed in pandas 2.0.
for label, value in s.items():
  s.loc[label] = value+2

AttributeError: 'Series' object has no attribute 'iteritems'

In [None]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 1000))
s+=2

323 µs ± 155 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
s = pd.Series([1, 2, 3, 4])
s.loc['C#'] = 102

s

0       1
1       2
2       3
3       4
C#    102
dtype: int64

In [None]:
student_class = pd.Series ({'Jonas' : 'Physics',
                            'Jack' : 'Chemistry',
                            'Molly' : 'English',
                            'Sam' : 'History'})
student_class

Jonas      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [None]:
# All three classes belong to the same student, so the three index labels must
# be identical; a trailing space in one label would make it a different key and
# hide that row from a .loc['Gwynn'] lookup.
gwyn_class = pd.Series(['Logic', 'Arts', 'Biology'], index = ['Gwynn','Gwynn','Gwynn'])

gwyn_class

Gwynn       Logic
Gwynn        Arts
Gwynn     Biology
dtype: object

In [None]:
all_students = pd.concat([student_class, gwyn_class])
print(all_students)

Jonas       Physics
Jack      Chemistry
Molly       English
Sam         History
Gwynn         Logic
Gwynn          Arts
Gwynn       Biology
dtype: object


In [None]:
student_class

Jonas      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [None]:
all_students.loc['Gwynn']

Gwynn    Logic
Gwynn     Arts
dtype: object

## DataFrame Data Structure


In [None]:
import pandas as pd


In [None]:
# One Series per student: the values are the row's data and the index labels
# ('Name', 'Class', 'Score') act as the field names.
record1 = pd.Series(['Gwyn', 'Logic', 'A'], index=['Name', 'Class', 'Score'])
record2 = pd.Series(['Jonas', 'Arts', 'B'], index=['Name', 'Class', 'Score'])
record3 = pd.Series(['Potato', 'Biology', 'C'], index=['Name', 'Class', 'Score'])

In [None]:
df = pd.DataFrame([record1, record2, record3], index = ['student1', 'student2', 'student3'])
df.head()

Unnamed: 0,Name,Class,Score
student1,Gwyn,Logic,A
student2,Jonas,Arts,B
student3,Potato,Biology,C


In [None]:
# Alternative: build the frame from a list of dictionaries, one dictionary per row of data.

students = [
    {'Name': 'Kyla', 'Class': 'Logic', 'Score': '90'},
    {'Name': 'Ding', 'Class': 'Arts', 'Score': '89'},
    {'Name': 'Muffin', 'Class': 'Biology', 'Score': '96'},
]

# The index list supplies one row label per dictionary, in order.
df = pd.DataFrame(students, index=['student1', 'student2', 'student3'])
df.head()


Unnamed: 0,Name,Class,Score
student1,Kyla,Logic,90
student2,Ding,Arts,89
student3,Muffin,Biology,96


In [None]:
df.loc['student2']

Name     Ding
Class    Arts
Score      89
Name: student2, dtype: object

In [None]:
type(df.loc ['student2'])

In [None]:
df.loc['student1', 'Name']

'Kyla'

In [None]:
df.T

Unnamed: 0,student1,student2,student3
Name,Kyla,Ding,Muffin
Class,Logic,Arts,Biology
Score,90,89,96


In [None]:
df.T.loc['Name']

student1      Kyla
student2      Ding
student3    Muffin
Name: Name, dtype: object

In [None]:
df['Name']

student1      Kyla
student2      Ding
student3    Muffin
Name: Name, dtype: object

In [None]:
df.loc['Name']

KeyError: 'Name'

In [None]:
type(df['Name'])

In [None]:
df.loc['student1']['Name']

'Kyla'

In [None]:
print(type(df.loc['student1']))
print(type(df.loc['student1']['Name']))

<class 'pandas.core.series.Series'>
<class 'str'>


In [None]:
# Example: select the Name and Score columns for every student using .loc.
# The colon selects all rows; the list names the columns to return.

df.loc[:, ['Name', 'Score']]

Unnamed: 0,Name,Score
student1,Kyla,90
student2,Ding,89
student3,Muffin,96


In [None]:
# The colon means that we want to get all of the rows, and the list in the second argument
# position is the list of columns we want to get back.

In [None]:
# Dropping data

df.drop('student1')

Unnamed: 0,Name,Class,Score
student2,Ding,Arts,89
student3,Muffin,Biology,96


In [None]:
df

Unnamed: 0,Name,Class,Score
student1,Kyla,Logic,90
student2,Ding,Arts,89
student3,Muffin,Biology,96


In [None]:
copy_df = df.copy()
copy_df.drop('Name', inplace= True, axis = 1)
copy_df

Unnamed: 0,Class,Score
student1,Logic,90
student2,Arts,89
student3,Biology,96


In [None]:
del copy_df['Class']
copy_df

Unnamed: 0,Score
student1,90
student2,89
student3,96


In [None]:
# Adding column

df['Ranking'] = 1
df

Unnamed: 0,Name,Class,Score,Ranking
student1,Kyla,Logic,90,1
student2,Ding,Arts,89,1
student3,Muffin,Biology,96,1


## DataFrame Indexing and Loading

In [None]:
!cat datasets/Admission_Predict.csv

cat: datasets/Admission_Predict.csv: No such file or directory


In [None]:
import pandas as pd

df = pd.read_csv('datasets/Admission_Predict.csv')
df.head()

In [None]:
df = pd.read_csv('datasets/Admission_Predict.csv', index_col = 0)
df.head()

In [None]:
# Rename columns through a mapping; rename() returns a new frame, kept as new_df.
# Most entries map a name to itself; only 'SOP' and 'LOR' are given new names.
# NOTE(review): a later cell renames 'LOR ' (with a trailing space), which
# suggests the 'LOR' key here does not match the CSV header — confirm against the file.
new_df=df.rename(columns={'GRE Score' : 'GRE Score', 'TOEFL Score': 'TOEFL Score',
                          'University Rating': 'University Rating', 'SOP' : 'Statement of Purpose',
                          'LOR' : 'Letter of Recommendation', 'CGPA' : 'CGPA', 'Research':'Research',
                          'Chance of Admit': 'Chance of Admit'})
new_df.head()

NameError: name 'df' is not defined

In [None]:
new_df.columns

In [None]:
new_df = new_df.rename(columns = {'LOR ': 'Letter of Recommendation'})
new_df.head()

In [None]:
# Strip stray whitespace from every column name in one pass: rename() calls
# str.strip on each label along the columns axis and returns a new frame.
new_df = new_df.rename(mapper = str.strip, axis = 'columns')

new_df.head()

In [None]:
df.columns

In [None]:
cols = list(df.columns)
cols = [x.lower().strip() for x in cols]

df.columns = cols
df.head()

## Querying a DataFrame

In [None]:
import pandas as pd

df = pd.read_csv('datasets/Admission_Predict.csv', index_col = 0)
df.columns = [x.lower().strip() for x in df.columns]

df.head()

In [None]:
admit_mask = df['chance of admit'] > 0.7
admit_mask

In [None]:
df.where(admit_mask).head()

In [None]:
df.where(admit_mask).dropna().head()

In [None]:
df[df['chance of admit'] > 0.7].head()

In [None]:
df["gre score"].head()

In [None]:
df[["gre score", "toefl score"]].head()

In [None]:
df[df["gre score"] > 320].head()

In [None]:
# Deliberate failure: Python's `and` asks each Series for a single truth value,
# which pandas refuses with a ValueError. The following cell uses the
# element-wise `&` operator instead.
(df['chance of admit'] > 0.7) and (df['chance of admit'] < 0.9)

In [None]:
(df['chance of admit'] > 0.7) & (df['chance of admit'] < 0.9)

In [None]:
df['chance of admit'].gt(0.7) & df['chance of admit'].lt(0.9)

In [None]:
# Boolean mask for rows whose chance of admit lies strictly between 0.7 and 0.9.
# Chaining .gt(0.7).lt(0.9) does not express this: .lt() would be applied to
# the boolean result of .gt(), not to the original values.
df['chance of admit'].between(0.7, 0.9, inclusive='neither')

## Indexing DataFrame

In [None]:
import pandas as pd

df= pd.read_csv('datasets/Admission_Predict.csv', index_col = 0)
df.head()

In [None]:
df['Serial Number'] = df.index
df = df.set_index('Chance of Admit ')
df.head()

In [None]:
df = df.reset_index()
df.head()

In [None]:
df = pd.read_csv('datasets/census.csv')
df.head()

In [None]:
df['SUMLEV'].unique()

In [None]:
df=df[df['SUMLEV'] == 50]
df.head()

In [None]:
columns_to_keep = ['STNAME', 'CTYNAME', 'BIRTHS2010']

df = df[columns_to_keep]
df.head()

In [None]:
df =df.set_index(['STNAME', 'CTYNAME'])
df.head()

In [None]:
df.loc['Michigan', 'Washtenaw County']

In [None]:
df.loc[[('Michigan', 'Washtenaw County'),
        ('Michigan', 'Wayne County')]]

## Missing Values

In [None]:
import pandas as pd


In [None]:
df = pd.read_csv('datasets/class.csv')
df.head(10)

In [None]:
mask=df.isnull()
mask.head(10)

In [None]:
df.dropna().head(10)

In [None]:
df.fillna(0, inplace=True)
df.head(10)

In [None]:
df = pd.read_csv('datasets/class_grades.csv')
df.head(20)

In [None]:
df = df.set_index('time')
df = df.sort_index()
df.head(20)

In [None]:
df = df.reset_index()
df = df.set_index(['time', 'student'])
df

In [None]:
# Forward-fill: each missing value takes the last valid value above it in the
# same column. ffill() is the direct spelling; fillna(method=...) is deprecated
# in recent pandas releases.
df = df.ffill()
df.head()

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['a', 'b', 'c']})
df

Unnamed: 0,A,B,C
0,1,4,a
1,2,5,b
2,3,6,c


In [None]:
df.replace([1,3], [100, 300])

Unnamed: 0,A,B,C
0,100,4,a
1,2,5,b
2,300,6,c


In [None]:
df = pd.read_csv('datasets/class_grades.csv')
df.head(20)

In [None]:
# Replace every value ending in ".html" with 'webpage'. The dot is escaped so it
# matches a literal "." rather than any character, and the pattern is a raw
# string so the backslash reaches the regex engine untouched.
df.replace(to_replace= r".*\.html$", value= 'webpage', regex=True)

## Manipulating DataFrame

In [None]:
import pandas as pd

# read_csv lives in the pandas namespace; calling it bare raises NameError.
# NOTE(review): the following cells read a 'President' column, while the census
# file was used earlier for SUMLEV/STNAME/CTYNAME/BIRTHS2010 — confirm this is
# the intended dataset.
df = pd.read_csv('datasets/census.csv')
df.head()

In [None]:
# Copy the full name into "First", then cut everything from the first space
# onward so only the first name remains. The result is written back to "First"
# (not a misspelled sibling column) so the later `del(df["First"])` cleans it up.
df["First"]= df['President']
df["First"] = df["First"].replace("[ ].*", "", regex=True)

df.head()

In [None]:
del(df["First"])

def splitname(row):
  """Add 'First' and 'Last' entries to a row and return it.

  The values are the first and last whitespace-separated words of the row's
  'President' entry. The row is modified in place and also returned, which is
  the shape DataFrame.apply expects from a row-wise function.
  """
  name_parts = row['President'].split()
  row['First'], row['Last'] = name_parts[0], name_parts[-1]
  return row

df = df.apply(splitname, axis= 'columns')
df.head()

In [None]:
del(df['First'])
del(df['Last'])

# Two capture groups: the leading word (first name) and the trailing word (last
# name); the non-capturing middle group consumes everything up to the final space.
# Raw string so \w reaches the regex engine as the word-character class.
pattern = r"(^[\w]*)(?:.* )([\w]*$)"

# extract() returns one column per capture group, so the result is displayed
# rather than assigned back onto the single "President" column.
df["President"].str.extract(pattern).head()